Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 676
1 file changed, 176 insertions(+), 500 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d9bdacce99f..08bbe6096528 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,12 +93,11 @@ int sysctl_tcp_stdurg __read_mostly;
93int sysctl_tcp_rfc1337 __read_mostly; 93int sysctl_tcp_rfc1337 __read_mostly;
94int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 94int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
95int sysctl_tcp_frto __read_mostly = 2; 95int sysctl_tcp_frto __read_mostly = 2;
96int sysctl_tcp_frto_response __read_mostly;
97 96
98int sysctl_tcp_thin_dupack __read_mostly; 97int sysctl_tcp_thin_dupack __read_mostly;
99 98
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 99int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_early_retrans __read_mostly = 2; 100int sysctl_tcp_early_retrans __read_mostly = 3;
102 101
103#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 102#define FLAG_DATA 0x01 /* Incoming frame contained data. */
104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 103#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -108,17 +107,16 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
108#define FLAG_DATA_SACKED 0x20 /* New SACK. */ 107#define FLAG_DATA_SACKED 0x20 /* New SACK. */
109#define FLAG_ECE 0x40 /* ECE in this ACK */ 108#define FLAG_ECE 0x40 /* ECE in this ACK */
110#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 109#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
111#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ 110#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
112#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 111#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
113#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ 112#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
114#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
115#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ 113#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
114#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
116 115
117#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) 116#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
118#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) 117#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
119#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) 118#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
120#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) 119#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
121#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
122 120
123#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) 121#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
124#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) 122#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
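
The hunk above swaps FLAG_ONLY_ORIG_SACKED for FLAG_ORIG_SACK_ACKED, adds FLAG_UPDATE_TS_RECENT, and drops the F-RTO-only FLAG_NONHEAD_RETRANS_ACKED and FLAG_ANY_PROGRESS bits. As a reading aid, here is a minimal user-space sketch (illustrative only, not part of the patch) of how an ACK-flag bitmask like this is composed and tested; the values mirror the defines above.

#include <stdio.h>

/* Values mirror the defines in the hunk above. */
#define FLAG_DATA_SACKED      0x20   /* New SACK */
#define FLAG_ORIG_SACK_ACKED  0x200  /* never-retransmitted data are (s)acked */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* caller wants ts_recent refreshed */

int main(void)
{
	int flag = 0;

	flag |= FLAG_DATA_SACKED | FLAG_ORIG_SACK_ACKED;

	/* The kind of test tcp_process_loss() performs for RFC5682 step 3.b */
	if (flag & FLAG_ORIG_SACK_ACKED)
		printf("never-retransmitted data (s)acked -> RTO looks spurious\n");
	return 0;
}
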
@@ -1159,10 +1157,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1159 tcp_highest_sack_seq(tp))) 1157 tcp_highest_sack_seq(tp)))
1160 state->reord = min(fack_count, 1158 state->reord = min(fack_count,
1161 state->reord); 1159 state->reord);
1162 1160 if (!after(end_seq, tp->high_seq))
1163 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1161 state->flag |= FLAG_ORIG_SACK_ACKED;
1164 if (!after(end_seq, tp->frto_highmark))
1165 state->flag |= FLAG_ONLY_ORIG_SACKED;
1166 } 1162 }
1167 1163
1168 if (sacked & TCPCB_LOST) { 1164 if (sacked & TCPCB_LOST) {
@@ -1555,7 +1551,6 @@ static int
1555tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1551tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1556 u32 prior_snd_una) 1552 u32 prior_snd_una)
1557{ 1553{
1558 const struct inet_connection_sock *icsk = inet_csk(sk);
1559 struct tcp_sock *tp = tcp_sk(sk); 1554 struct tcp_sock *tp = tcp_sk(sk);
1560 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1555 const unsigned char *ptr = (skb_transport_header(ack_skb) +
1561 TCP_SKB_CB(ack_skb)->sacked); 1556 TCP_SKB_CB(ack_skb)->sacked);
@@ -1728,12 +1723,6 @@ walk:
1728 start_seq, end_seq, dup_sack); 1723 start_seq, end_seq, dup_sack);
1729 1724
1730advance_sp: 1725advance_sp:
1731 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
1732 * due to in-order walk
1733 */
1734 if (after(end_seq, tp->frto_highmark))
1735 state.flag &= ~FLAG_ONLY_ORIG_SACKED;
1736
1737 i++; 1726 i++;
1738 } 1727 }
1739 1728
@@ -1750,8 +1739,7 @@ advance_sp:
1750 tcp_verify_left_out(tp); 1739 tcp_verify_left_out(tp);
1751 1740
1752 if ((state.reord < tp->fackets_out) && 1741 if ((state.reord < tp->fackets_out) &&
1753 ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && 1742 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1754 (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
1755 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); 1743 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1756 1744
1757out: 1745out:
@@ -1825,197 +1813,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1825 tp->sacked_out = 0; 1813 tp->sacked_out = 0;
1826} 1814}
1827 1815
1828static int tcp_is_sackfrto(const struct tcp_sock *tp)
1829{
1830 return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
1831}
1832
1833/* F-RTO can only be used if TCP has never retransmitted anything other than
1834 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
1835 */
1836bool tcp_use_frto(struct sock *sk)
1837{
1838 const struct tcp_sock *tp = tcp_sk(sk);
1839 const struct inet_connection_sock *icsk = inet_csk(sk);
1840 struct sk_buff *skb;
1841
1842 if (!sysctl_tcp_frto)
1843 return false;
1844
1845 /* MTU probe and F-RTO won't really play nicely along currently */
1846 if (icsk->icsk_mtup.probe_size)
1847 return false;
1848
1849 if (tcp_is_sackfrto(tp))
1850 return true;
1851
1852 /* Avoid expensive walking of rexmit queue if possible */
1853 if (tp->retrans_out > 1)
1854 return false;
1855
1856 skb = tcp_write_queue_head(sk);
1857 if (tcp_skb_is_last(sk, skb))
1858 return true;
1859 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1860 tcp_for_write_queue_from(skb, sk) {
1861 if (skb == tcp_send_head(sk))
1862 break;
1863 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1864 return false;
1865 /* Short-circuit when first non-SACKed skb has been checked */
1866 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1867 break;
1868 }
1869 return true;
1870}
1871
1872/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
1873 * recovery a bit and use heuristics in tcp_process_frto() to detect if
1874 * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
1875 * keep retrans_out counting accurate (with SACK F-RTO, other than head
1876 * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
1877 * bits are handled if the Loss state is really to be entered (in
1878 * tcp_enter_frto_loss).
1879 *
1880 * Do like tcp_enter_loss() would; when RTO expires the second time it
1881 * does:
1882 * "Reduce ssthresh if it has not yet been made inside this window."
1883 */
1884void tcp_enter_frto(struct sock *sk)
1885{
1886 const struct inet_connection_sock *icsk = inet_csk(sk);
1887 struct tcp_sock *tp = tcp_sk(sk);
1888 struct sk_buff *skb;
1889
1890 if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
1891 tp->snd_una == tp->high_seq ||
1892 ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
1893 !icsk->icsk_retransmits)) {
1894 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1895 /* Our state is too optimistic in ssthresh() call because cwnd
1896 * is not reduced until tcp_enter_frto_loss() when previous F-RTO
1897 * recovery has not yet completed. Pattern would be this: RTO,
1898 * Cumulative ACK, RTO (2xRTO for the same segment does not end
1899 * up here twice).
1900 * RFC4138 should be more specific on what to do, even though
1901 * RTO is quite unlikely to occur after the first Cumulative ACK
1902 * due to back-off and complexity of triggering events ...
1903 */
1904 if (tp->frto_counter) {
1905 u32 stored_cwnd;
1906 stored_cwnd = tp->snd_cwnd;
1907 tp->snd_cwnd = 2;
1908 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1909 tp->snd_cwnd = stored_cwnd;
1910 } else {
1911 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1912 }
1913 /* ... in theory, cong.control module could do "any tricks" in
1914 * ssthresh(), which means that ca_state, lost bits and lost_out
1915 * counter would have to be faked before the call occurs. We
1916 * consider that too expensive, unlikely and hacky, so modules
1917 * using these in ssthresh() must deal these incompatibility
1918 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
1919 */
1920 tcp_ca_event(sk, CA_EVENT_FRTO);
1921 }
1922
1923 tp->undo_marker = tp->snd_una;
1924 tp->undo_retrans = 0;
1925
1926 skb = tcp_write_queue_head(sk);
1927 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1928 tp->undo_marker = 0;
1929 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
1930 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1931 tp->retrans_out -= tcp_skb_pcount(skb);
1932 }
1933 tcp_verify_left_out(tp);
1934
1935 /* Too bad if TCP was application limited */
1936 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
1937
1938 /* Earlier loss recovery underway (see RFC4138; Appendix B).
1939 * The last condition is necessary at least in tp->frto_counter case.
1940 */
1941 if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
1942 ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
1943 after(tp->high_seq, tp->snd_una)) {
1944 tp->frto_highmark = tp->high_seq;
1945 } else {
1946 tp->frto_highmark = tp->snd_nxt;
1947 }
1948 tcp_set_ca_state(sk, TCP_CA_Disorder);
1949 tp->high_seq = tp->snd_nxt;
1950 tp->frto_counter = 1;
1951}
1952
1953/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
1954 * which indicates that we should follow the traditional RTO recovery,
1955 * i.e. mark everything lost and do go-back-N retransmission.
1956 */
1957static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1958{
1959 struct tcp_sock *tp = tcp_sk(sk);
1960 struct sk_buff *skb;
1961
1962 tp->lost_out = 0;
1963 tp->retrans_out = 0;
1964 if (tcp_is_reno(tp))
1965 tcp_reset_reno_sack(tp);
1966
1967 tcp_for_write_queue(skb, sk) {
1968 if (skb == tcp_send_head(sk))
1969 break;
1970
1971 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1972 /*
1973 * Count the retransmission made on RTO correctly (only when
1974 * waiting for the first ACK and did not get it)...
1975 */
1976 if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
1977 /* For some reason this R-bit might get cleared? */
1978 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1979 tp->retrans_out += tcp_skb_pcount(skb);
1980 /* ...enter this if branch just for the first segment */
1981 flag |= FLAG_DATA_ACKED;
1982 } else {
1983 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1984 tp->undo_marker = 0;
1985 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1986 }
1987
1988 /* Marking forward transmissions that were made after RTO lost
1989 * can cause unnecessary retransmissions in some scenarios,
1990 * SACK blocks will mitigate that in some but not in all cases.
1991 * We used to not mark them but it was causing break-ups with
1992 * receivers that do only in-order receival.
1993 *
1994 * TODO: we could detect presence of such receiver and select
1995 * different behavior per flow.
1996 */
1997 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1998 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1999 tp->lost_out += tcp_skb_pcount(skb);
2000 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
2001 }
2002 }
2003 tcp_verify_left_out(tp);
2004
2005 tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
2006 tp->snd_cwnd_cnt = 0;
2007 tp->snd_cwnd_stamp = tcp_time_stamp;
2008 tp->frto_counter = 0;
2009
2010 tp->reordering = min_t(unsigned int, tp->reordering,
2011 sysctl_tcp_reordering);
2012 tcp_set_ca_state(sk, TCP_CA_Loss);
2013 tp->high_seq = tp->snd_nxt;
2014 TCP_ECN_queue_cwr(tp);
2015
2016 tcp_clear_all_retrans_hints(tp);
2017}
2018
2019static void tcp_clear_retrans_partial(struct tcp_sock *tp) 1816static void tcp_clear_retrans_partial(struct tcp_sock *tp)
2020{ 1817{
2021 tp->retrans_out = 0; 1818 tp->retrans_out = 0;
@@ -2042,10 +1839,13 @@ void tcp_enter_loss(struct sock *sk, int how)
2042 const struct inet_connection_sock *icsk = inet_csk(sk); 1839 const struct inet_connection_sock *icsk = inet_csk(sk);
2043 struct tcp_sock *tp = tcp_sk(sk); 1840 struct tcp_sock *tp = tcp_sk(sk);
2044 struct sk_buff *skb; 1841 struct sk_buff *skb;
1842 bool new_recovery = false;
2045 1843
2046 /* Reduce ssthresh if it has not yet been made inside this window. */ 1844 /* Reduce ssthresh if it has not yet been made inside this window. */
2047 if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1845 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1846 !after(tp->high_seq, tp->snd_una) ||
2048 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { 1847 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1848 new_recovery = true;
2049 tp->prior_ssthresh = tcp_current_ssthresh(sk); 1849 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2050 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 1850 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
2051 tcp_ca_event(sk, CA_EVENT_LOSS); 1851 tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2059,11 +1859,8 @@ void tcp_enter_loss(struct sock *sk, int how)
2059 if (tcp_is_reno(tp)) 1859 if (tcp_is_reno(tp))
2060 tcp_reset_reno_sack(tp); 1860 tcp_reset_reno_sack(tp);
2061 1861
2062 if (!how) { 1862 tp->undo_marker = tp->snd_una;
2063 /* Push undo marker, if it was plain RTO and nothing 1863 if (how) {
2064 * was retransmitted. */
2065 tp->undo_marker = tp->snd_una;
2066 } else {
2067 tp->sacked_out = 0; 1864 tp->sacked_out = 0;
2068 tp->fackets_out = 0; 1865 tp->fackets_out = 0;
2069 } 1866 }
@@ -2090,8 +1887,14 @@ void tcp_enter_loss(struct sock *sk, int how)
2090 tcp_set_ca_state(sk, TCP_CA_Loss); 1887 tcp_set_ca_state(sk, TCP_CA_Loss);
2091 tp->high_seq = tp->snd_nxt; 1888 tp->high_seq = tp->snd_nxt;
2092 TCP_ECN_queue_cwr(tp); 1889 TCP_ECN_queue_cwr(tp);
2093 /* Abort F-RTO algorithm if one is in progress */ 1890
2094 tp->frto_counter = 0; 1891 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
1892 * loss recovery is underway except recurring timeout(s) on
1893 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
1894 */
1895 tp->frto = sysctl_tcp_frto &&
1896 (new_recovery || icsk->icsk_retransmits) &&
1897 !inet_csk(sk)->icsk_mtup.probe_size;
2095} 1898}
2096 1899
2097/* If ACK arrived pointing to a remembered SACK, it means that our 1900/* If ACK arrived pointing to a remembered SACK, it means that our
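
The tail of tcp_enter_loss() above is where the new F-RTO implementation is armed. A stand-alone restatement of that condition follows (parameter names are illustrative, not the kernel's; only the logic mirrors the tp->frto assignment and its RFC5682 sec 3.1 comment).

#include <stdbool.h>

/* Arm F-RTO on an RTO only when the timeout starts a new recovery episode
 * or is a recurring timeout on the same SND.UNA, and no path-MTU probe is
 * outstanding.
 */
bool arm_frto(int sysctl_frto, bool new_recovery, unsigned int retransmits,
	      unsigned int mtu_probe_size)
{
	return sysctl_frto && (new_recovery || retransmits) && !mtu_probe_size;
}
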
@@ -2150,15 +1953,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2150 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples 1953 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2151 * available, or RTO is scheduled to fire first. 1954 * available, or RTO is scheduled to fire first.
2152 */ 1955 */
2153 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) 1956 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
1957 (flag & FLAG_ECE) || !tp->srtt)
2154 return false; 1958 return false;
2155 1959
2156 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); 1960 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2157 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 1961 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2158 return false; 1962 return false;
2159 1963
2160 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); 1964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2161 tp->early_retrans_delayed = 1; 1965 TCP_RTO_MAX);
2162 return true; 1966 return true;
2163} 1967}
2164 1968
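
The early-retransmit hunk above keeps the delay at max(RTT/4, 2 msec) but arms a dedicated ICSK_TIME_EARLY_RETRANS timer instead of overloading the RTO timer and the early_retrans_delayed flag. A rough model of the delay computation, under the assumption (true for this era of the stack) that srtt is stored left-shifted by 3, so >> 5 yields RTT/4; names are made up.

/* Early-retransmit delay in jiffies: max(RTT/4, 2 msec).
 * srtt_x8 is the smoothed RTT as stored (<< 3); two_msec is
 * msecs_to_jiffies(2).
 */
unsigned long early_retrans_delay(unsigned long srtt_x8, unsigned long two_msec)
{
	unsigned long quarter_rtt = srtt_x8 >> 5;

	return quarter_rtt > two_msec ? quarter_rtt : two_msec;
}
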
@@ -2274,10 +2078,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2274 struct tcp_sock *tp = tcp_sk(sk); 2078 struct tcp_sock *tp = tcp_sk(sk);
2275 __u32 packets_out; 2079 __u32 packets_out;
2276 2080
2277 /* Do not perform any recovery during F-RTO algorithm */
2278 if (tp->frto_counter)
2279 return false;
2280
2281 /* Trick#1: The loss is proven. */ 2081 /* Trick#1: The loss is proven. */
2282 if (tp->lost_out) 2082 if (tp->lost_out)
2283 return true; 2083 return true;
@@ -2321,7 +2121,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2321 * interval if appropriate. 2121 * interval if appropriate.
2322 */ 2122 */
2323 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && 2123 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2324 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && 2124 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2325 !tcp_may_send_now(sk)) 2125 !tcp_may_send_now(sk))
2326 return !tcp_pause_early_retransmit(sk, flag); 2126 return !tcp_pause_early_retransmit(sk, flag);
2327 2127
@@ -2638,12 +2438,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2638 return failed; 2438 return failed;
2639} 2439}
2640 2440
2641/* Undo during loss recovery after partial ACK. */ 2441/* Undo during loss recovery after partial ACK or using F-RTO. */
2642static bool tcp_try_undo_loss(struct sock *sk) 2442static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2643{ 2443{
2644 struct tcp_sock *tp = tcp_sk(sk); 2444 struct tcp_sock *tp = tcp_sk(sk);
2645 2445
2646 if (tcp_may_undo(tp)) { 2446 if (frto_undo || tcp_may_undo(tp)) {
2647 struct sk_buff *skb; 2447 struct sk_buff *skb;
2648 tcp_for_write_queue(skb, sk) { 2448 tcp_for_write_queue(skb, sk) {
2649 if (skb == tcp_send_head(sk)) 2449 if (skb == tcp_send_head(sk))
@@ -2657,9 +2457,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
2657 tp->lost_out = 0; 2457 tp->lost_out = 0;
2658 tcp_undo_cwr(sk, true); 2458 tcp_undo_cwr(sk, true);
2659 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2459 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2460 if (frto_undo)
2461 NET_INC_STATS_BH(sock_net(sk),
2462 LINUX_MIB_TCPSPURIOUSRTOS);
2660 inet_csk(sk)->icsk_retransmits = 0; 2463 inet_csk(sk)->icsk_retransmits = 0;
2661 tp->undo_marker = 0; 2464 tp->undo_marker = 0;
2662 if (tcp_is_sack(tp)) 2465 if (frto_undo || tcp_is_sack(tp))
2663 tcp_set_ca_state(sk, TCP_CA_Open); 2466 tcp_set_ca_state(sk, TCP_CA_Open);
2664 return true; 2467 return true;
2665 } 2468 }
@@ -2681,6 +2484,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2681 struct tcp_sock *tp = tcp_sk(sk); 2484 struct tcp_sock *tp = tcp_sk(sk);
2682 2485
2683 tp->high_seq = tp->snd_nxt; 2486 tp->high_seq = tp->snd_nxt;
2487 tp->tlp_high_seq = 0;
2684 tp->snd_cwnd_cnt = 0; 2488 tp->snd_cwnd_cnt = 0;
2685 tp->prior_cwnd = tp->snd_cwnd; 2489 tp->prior_cwnd = tp->snd_cwnd;
2686 tp->prr_delivered = 0; 2490 tp->prr_delivered = 0;
@@ -2758,7 +2562,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2758 2562
2759 tcp_verify_left_out(tp); 2563 tcp_verify_left_out(tp);
2760 2564
2761 if (!tp->frto_counter && !tcp_any_retrans_done(sk)) 2565 if (!tcp_any_retrans_done(sk))
2762 tp->retrans_stamp = 0; 2566 tp->retrans_stamp = 0;
2763 2567
2764 if (flag & FLAG_ECE) 2568 if (flag & FLAG_ECE)
@@ -2875,6 +2679,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2875 tcp_set_ca_state(sk, TCP_CA_Recovery); 2679 tcp_set_ca_state(sk, TCP_CA_Recovery);
2876} 2680}
2877 2681
2682/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
2683 * recovered or spurious. Otherwise retransmits more on partial ACKs.
2684 */
2685static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2686{
2687 struct inet_connection_sock *icsk = inet_csk(sk);
2688 struct tcp_sock *tp = tcp_sk(sk);
2689 bool recovered = !before(tp->snd_una, tp->high_seq);
2690
2691 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2692 if (flag & FLAG_ORIG_SACK_ACKED) {
2693 /* Step 3.b. A timeout is spurious if not all data are
2694 * lost, i.e., never-retransmitted data are (s)acked.
2695 */
2696 tcp_try_undo_loss(sk, true);
2697 return;
2698 }
2699 if (after(tp->snd_nxt, tp->high_seq) &&
2700 (flag & FLAG_DATA_SACKED || is_dupack)) {
2701 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2702 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2703 tp->high_seq = tp->snd_nxt;
2704 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2705 TCP_NAGLE_OFF);
2706 if (after(tp->snd_nxt, tp->high_seq))
2707 return; /* Step 2.b */
2708 tp->frto = 0;
2709 }
2710 }
2711
2712 if (recovered) {
2713 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2714 icsk->icsk_retransmits = 0;
2715 tcp_try_undo_recovery(sk);
2716 return;
2717 }
2718 if (flag & FLAG_DATA_ACKED)
2719 icsk->icsk_retransmits = 0;
2720 if (tcp_is_reno(tp)) {
2721 /* A Reno DUPACK means new data in F-RTO step 2.b above are
2722 * delivered. Lower inflight to clock out (re)transmissions.
2723 */
2724 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2725 tcp_add_reno_sack(sk);
2726 else if (flag & FLAG_SND_UNA_ADVANCED)
2727 tcp_reset_reno_sack(tp);
2728 }
2729 if (tcp_try_undo_loss(sk, false))
2730 return;
2731 tcp_xmit_retransmit_queue(sk);
2732}
2733
2878/* Process an event, which can update packets-in-flight not trivially. 2734/* Process an event, which can update packets-in-flight not trivially.
2879 * Main goal of this function is to calculate new estimate for left_out, 2735 * Main goal of this function is to calculate new estimate for left_out,
2880 * taking into account both packets sitting in receiver's buffer and 2736 * taking into account both packets sitting in receiver's buffer and
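
A condensed sketch of the decision order in the new tcp_process_loss() above; all parameters are simplified stand-ins for the flag bits and tcp_sock fields the real function consults (it also pushes pending frames in step 2.b, which this sketch reduces to a return code).

#include <stdbool.h>

enum loss_ack_action {
	UNDO_SPURIOUS_RTO,	/* step 3.b: timeout judged spurious        */
	SEND_NEW_DATA,		/* step 2.b: transmit new data as the probe */
	EXIT_LOSS,		/* snd_una reached high_seq: undo recovery  */
	RETRANSMIT_MORE,	/* partial ACK: keep retransmitting         */
};

enum loss_ack_action process_loss_ack(bool frto, bool orig_sack_acked,
				      bool sent_after_rto, bool data_sacked,
				      bool is_dupack, bool snd_una_advanced,
				      bool recovered)
{
	if (frto) {
		if (orig_sack_acked)
			return UNDO_SPURIOUS_RTO;
		if (!(sent_after_rto && (data_sacked || is_dupack)) &&
		    snd_una_advanced && !recovered)
			return SEND_NEW_DATA;
		/* otherwise the loss is treated as genuine (step 3.a) */
	}
	if (recovered)
		return EXIT_LOSS;
	return RETRANSMIT_MORE;
}
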
@@ -2921,12 +2777,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2921 tp->retrans_stamp = 0; 2777 tp->retrans_stamp = 0;
2922 } else if (!before(tp->snd_una, tp->high_seq)) { 2778 } else if (!before(tp->snd_una, tp->high_seq)) {
2923 switch (icsk->icsk_ca_state) { 2779 switch (icsk->icsk_ca_state) {
2924 case TCP_CA_Loss:
2925 icsk->icsk_retransmits = 0;
2926 if (tcp_try_undo_recovery(sk))
2927 return;
2928 break;
2929
2930 case TCP_CA_CWR: 2780 case TCP_CA_CWR:
2931 /* CWR is to be held something *above* high_seq 2781 /* CWR is to be held something *above* high_seq
2932 * is ACKed for CWR bit to reach receiver. */ 2782 * is ACKed for CWR bit to reach receiver. */
@@ -2957,18 +2807,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2957 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; 2807 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2958 break; 2808 break;
2959 case TCP_CA_Loss: 2809 case TCP_CA_Loss:
2960 if (flag & FLAG_DATA_ACKED) 2810 tcp_process_loss(sk, flag, is_dupack);
2961 icsk->icsk_retransmits = 0;
2962 if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
2963 tcp_reset_reno_sack(tp);
2964 if (!tcp_try_undo_loss(sk)) {
2965 tcp_moderate_cwnd(tp);
2966 tcp_xmit_retransmit_queue(sk);
2967 return;
2968 }
2969 if (icsk->icsk_ca_state != TCP_CA_Open) 2811 if (icsk->icsk_ca_state != TCP_CA_Open)
2970 return; 2812 return;
2971 /* Loss is undone; fall through to processing in Open state. */ 2813 /* Fall through to processing in Open state. */
2972 default: 2814 default:
2973 if (tcp_is_reno(tp)) { 2815 if (tcp_is_reno(tp)) {
2974 if (flag & FLAG_SND_UNA_ADVANCED) 2816 if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3081,6 +2923,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3081 */ 2923 */
3082void tcp_rearm_rto(struct sock *sk) 2924void tcp_rearm_rto(struct sock *sk)
3083{ 2925{
2926 const struct inet_connection_sock *icsk = inet_csk(sk);
3084 struct tcp_sock *tp = tcp_sk(sk); 2927 struct tcp_sock *tp = tcp_sk(sk);
3085 2928
3086 /* If the retrans timer is currently being used by Fast Open 2929 /* If the retrans timer is currently being used by Fast Open
@@ -3094,12 +2937,13 @@ void tcp_rearm_rto(struct sock *sk)
3094 } else { 2937 } else {
3095 u32 rto = inet_csk(sk)->icsk_rto; 2938 u32 rto = inet_csk(sk)->icsk_rto;
3096 /* Offset the time elapsed after installing regular RTO */ 2939 /* Offset the time elapsed after installing regular RTO */
3097 if (tp->early_retrans_delayed) { 2940 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2941 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3098 struct sk_buff *skb = tcp_write_queue_head(sk); 2942 struct sk_buff *skb = tcp_write_queue_head(sk);
3099 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; 2943 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3100 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2944 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3101 /* delta may not be positive if the socket is locked 2945 /* delta may not be positive if the socket is locked
3102 * when the delayed ER timer fires and is rescheduled. 2946 * when the retrans timer fires and is rescheduled.
3103 */ 2947 */
3104 if (delta > 0) 2948 if (delta > 0)
3105 rto = delta; 2949 rto = delta;
@@ -3107,7 +2951,6 @@ void tcp_rearm_rto(struct sock *sk)
3107 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 2951 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3108 TCP_RTO_MAX); 2952 TCP_RTO_MAX);
3109 } 2953 }
3110 tp->early_retrans_delayed = 0;
3111} 2954}
3112 2955
3113/* This function is called when the delayed ER timer fires. TCP enters 2956/* This function is called when the delayed ER timer fires. TCP enters
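
When the pending timer is an early-retransmit or loss-probe timer, the hunk above re-arms the RTO relative to when the head of the write queue was sent rather than from "now". A small sketch of that offset arithmetic; the signed cast keeps the comparison wrap-safe, in the spirit of the kernel's jiffies helpers.

#include <stdint.h>

/* Charge the RTO from the send time of the head skb; fall back to the
 * full rto if that moment has already passed (e.g. the socket was locked
 * when the timer fired and it is being rescheduled).
 */
uint32_t remaining_rto(uint32_t head_sent, uint32_t rto, uint32_t now)
{
	int32_t delta = (int32_t)(head_sent + rto - now);

	return delta > 0 ? (uint32_t)delta : rto;
}
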
@@ -3195,8 +3038,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3195 flag |= FLAG_RETRANS_DATA_ACKED; 3038 flag |= FLAG_RETRANS_DATA_ACKED;
3196 ca_seq_rtt = -1; 3039 ca_seq_rtt = -1;
3197 seq_rtt = -1; 3040 seq_rtt = -1;
3198 if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
3199 flag |= FLAG_NONHEAD_RETRANS_ACKED;
3200 } else { 3041 } else {
3201 ca_seq_rtt = now - scb->when; 3042 ca_seq_rtt = now - scb->when;
3202 last_ackt = skb->tstamp; 3043 last_ackt = skb->tstamp;
@@ -3205,6 +3046,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3205 } 3046 }
3206 if (!(sacked & TCPCB_SACKED_ACKED)) 3047 if (!(sacked & TCPCB_SACKED_ACKED))
3207 reord = min(pkts_acked, reord); 3048 reord = min(pkts_acked, reord);
3049 if (!after(scb->end_seq, tp->high_seq))
3050 flag |= FLAG_ORIG_SACK_ACKED;
3208 } 3051 }
3209 3052
3210 if (sacked & TCPCB_SACKED_ACKED) 3053 if (sacked & TCPCB_SACKED_ACKED)
@@ -3405,165 +3248,74 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3405 return flag; 3248 return flag;
3406} 3249}
3407 3250
3408/* A very conservative spurious RTO response algorithm: reduce cwnd and 3251/* RFC 5961 7 [ACK Throttling] */
3409 * continue in congestion avoidance. 3252static void tcp_send_challenge_ack(struct sock *sk)
3410 */
3411static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3412{ 3253{
3413 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 3254 /* unprotected vars, we dont care of overwrites */
3414 tp->snd_cwnd_cnt = 0; 3255 static u32 challenge_timestamp;
3415 TCP_ECN_queue_cwr(tp); 3256 static unsigned int challenge_count;
3416 tcp_moderate_cwnd(tp); 3257 u32 now = jiffies / HZ;
3417}
3418 3258
3419/* A conservative spurious RTO response algorithm: reduce cwnd using 3259 if (now != challenge_timestamp) {
3420 * PRR and continue in congestion avoidance. 3260 challenge_timestamp = now;
3421 */ 3261 challenge_count = 0;
3422static void tcp_cwr_spur_to_response(struct sock *sk) 3262 }
3423{ 3263 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
3424 tcp_enter_cwr(sk, 0); 3264 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3265 tcp_send_ack(sk);
3266 }
3425} 3267}
3426 3268
3427static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3269static void tcp_store_ts_recent(struct tcp_sock *tp)
3428{ 3270{
3429 if (flag & FLAG_ECE) 3271 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3430 tcp_cwr_spur_to_response(sk); 3272 tp->rx_opt.ts_recent_stamp = get_seconds();
3431 else
3432 tcp_undo_cwr(sk, true);
3433} 3273}
3434 3274
3435/* F-RTO spurious RTO detection algorithm (RFC4138) 3275static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3436 *
3437 * F-RTO affects during two new ACKs following RTO (well, almost, see inline
3438 * comments). State (ACK number) is kept in frto_counter. When ACK advances
3439 * window (but not to or beyond highest sequence sent before RTO):
3440 * On First ACK, send two new segments out.
3441 * On Second ACK, RTO was likely spurious. Do spurious response (response
3442 * algorithm is not part of the F-RTO detection algorithm
3443 * given in RFC4138 but can be selected separately).
3444 * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
3445 * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
3446 * of Nagle, this is done using frto_counter states 2 and 3, when a new data
3447 * segment of any size sent during F-RTO, state 2 is upgraded to 3.
3448 *
3449 * Rationale: if the RTO was spurious, new ACKs should arrive from the
3450 * original window even after we transmit two new data segments.
3451 *
3452 * SACK version:
3453 * on first step, wait until first cumulative ACK arrives, then move to
3454 * the second step. In second step, the next ACK decides.
3455 *
3456 * F-RTO is implemented (mainly) in four functions:
3457 * - tcp_use_frto() is used to determine if TCP is can use F-RTO
3458 * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
3459 * called when tcp_use_frto() showed green light
3460 * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
3461 * - tcp_enter_frto_loss() is called if there is not enough evidence
3462 * to prove that the RTO is indeed spurious. It transfers the control
3463 * from F-RTO to the conventional RTO recovery
3464 */
3465static bool tcp_process_frto(struct sock *sk, int flag)
3466{ 3276{
3467 struct tcp_sock *tp = tcp_sk(sk); 3277 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3468 3278 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
3469 tcp_verify_left_out(tp); 3279 * extra check below makes sure this can only happen
3470 3280 * for pure ACK frames. -DaveM
3471 /* Duplicate the behavior from Loss state (fastretrans_alert) */ 3281 *
3472 if (flag & FLAG_DATA_ACKED) 3282 * Not only, also it occurs for expired timestamps.
3473 inet_csk(sk)->icsk_retransmits = 0;
3474
3475 if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
3476 ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
3477 tp->undo_marker = 0;
3478
3479 if (!before(tp->snd_una, tp->frto_highmark)) {
3480 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
3481 return true;
3482 }
3483
3484 if (!tcp_is_sackfrto(tp)) {
3485 /* RFC4138 shortcoming in step 2; should also have case c):
3486 * ACK isn't duplicate nor advances window, e.g., opposite dir
3487 * data, winupdate
3488 */ 3283 */
3489 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
3490 return true;
3491
3492 if (!(flag & FLAG_DATA_ACKED)) {
3493 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
3494 flag);
3495 return true;
3496 }
3497 } else {
3498 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
3499 if (!tcp_packets_in_flight(tp)) {
3500 tcp_enter_frto_loss(sk, 2, flag);
3501 return true;
3502 }
3503
3504 /* Prevent sending of new data. */
3505 tp->snd_cwnd = min(tp->snd_cwnd,
3506 tcp_packets_in_flight(tp));
3507 return true;
3508 }
3509
3510 if ((tp->frto_counter >= 2) &&
3511 (!(flag & FLAG_FORWARD_PROGRESS) ||
3512 ((flag & FLAG_DATA_SACKED) &&
3513 !(flag & FLAG_ONLY_ORIG_SACKED)))) {
3514 /* RFC4138 shortcoming (see comment above) */
3515 if (!(flag & FLAG_FORWARD_PROGRESS) &&
3516 (flag & FLAG_NOT_DUP))
3517 return true;
3518
3519 tcp_enter_frto_loss(sk, 3, flag);
3520 return true;
3521 }
3522 }
3523
3524 if (tp->frto_counter == 1) {
3525 /* tcp_may_send_now needs to see updated state */
3526 tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
3527 tp->frto_counter = 2;
3528
3529 if (!tcp_may_send_now(sk))
3530 tcp_enter_frto_loss(sk, 2, flag);
3531 3284
3532 return true; 3285 if (tcp_paws_check(&tp->rx_opt, 0))
3533 } else { 3286 tcp_store_ts_recent(tp);
3534 switch (sysctl_tcp_frto_response) {
3535 case 2:
3536 tcp_undo_spur_to_response(sk, flag);
3537 break;
3538 case 1:
3539 tcp_conservative_spur_to_response(tp);
3540 break;
3541 default:
3542 tcp_cwr_spur_to_response(sk);
3543 break;
3544 }
3545 tp->frto_counter = 0;
3546 tp->undo_marker = 0;
3547 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
3548 } 3287 }
3549 return false;
3550} 3288}
3551 3289
3552/* RFC 5961 7 [ACK Throttling] */ 3290/* This routine deals with acks during a TLP episode.
3553static void tcp_send_challenge_ack(struct sock *sk) 3291 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
3292 */
3293static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3554{ 3294{
3555 /* unprotected vars, we dont care of overwrites */ 3295 struct tcp_sock *tp = tcp_sk(sk);
3556 static u32 challenge_timestamp; 3296 bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
3557 static unsigned int challenge_count; 3297 !(flag & (FLAG_SND_UNA_ADVANCED |
3558 u32 now = jiffies / HZ; 3298 FLAG_NOT_DUP | FLAG_DATA_SACKED));
3559 3299
3560 if (now != challenge_timestamp) { 3300 /* Mark the end of TLP episode on receiving TLP dupack or when
3561 challenge_timestamp = now; 3301 * ack is after tlp_high_seq.
3562 challenge_count = 0; 3302 */
3303 if (is_tlp_dupack) {
3304 tp->tlp_high_seq = 0;
3305 return;
3563 } 3306 }
3564 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { 3307
3565 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); 3308 if (after(ack, tp->tlp_high_seq)) {
3566 tcp_send_ack(sk); 3309 tp->tlp_high_seq = 0;
3310 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3311 if (!(flag & FLAG_DSACKING_ACK)) {
3312 tcp_init_cwnd_reduction(sk, true);
3313 tcp_set_ca_state(sk, TCP_CA_CWR);
3314 tcp_end_cwnd_reduction(sk);
3315 tcp_set_ca_state(sk, TCP_CA_Open);
3316 NET_INC_STATS_BH(sock_net(sk),
3317 LINUX_MIB_TCPLOSSPROBERECOVERY);
3318 }
3567 } 3319 }
3568} 3320}
3569 3321
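
tcp_send_challenge_ack(), relocated earlier in the file by this hunk, rate-limits RFC 5961 challenge ACKs with a one-second bucket kept in unprotected statics. A user-space model of that limiter (now_sec stands in for jiffies / HZ; the limit is whatever sysctl_tcp_challenge_ack_limit holds).

#include <stdbool.h>
#include <stdint.h>

bool challenge_ack_allowed(uint32_t now_sec, unsigned int limit)
{
	static uint32_t window;
	static unsigned int count;

	if (now_sec != window) {	/* new one-second bucket */
		window = now_sec;
		count = 0;
	}
	return ++count <= limit;
}
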
@@ -3581,7 +3333,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3581 int prior_packets; 3333 int prior_packets;
3582 int prior_sacked = tp->sacked_out; 3334 int prior_sacked = tp->sacked_out;
3583 int pkts_acked = 0; 3335 int pkts_acked = 0;
3584 bool frto_cwnd = false;
3585 3336
3586 /* If the ack is older than previous acks 3337 /* If the ack is older than previous acks
3587 * then we can probably ignore it. 3338 * then we can probably ignore it.
@@ -3601,7 +3352,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3601 if (after(ack, tp->snd_nxt)) 3352 if (after(ack, tp->snd_nxt))
3602 goto invalid_ack; 3353 goto invalid_ack;
3603 3354
3604 if (tp->early_retrans_delayed) 3355 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3356 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3605 tcp_rearm_rto(sk); 3357 tcp_rearm_rto(sk);
3606 3358
3607 if (after(ack, prior_snd_una)) 3359 if (after(ack, prior_snd_una))
@@ -3610,6 +3362,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3610 prior_fackets = tp->fackets_out; 3362 prior_fackets = tp->fackets_out;
3611 prior_in_flight = tcp_packets_in_flight(tp); 3363 prior_in_flight = tcp_packets_in_flight(tp);
3612 3364
3365 /* ts_recent update must be made after we are sure that the packet
3366 * is in window.
3367 */
3368 if (flag & FLAG_UPDATE_TS_RECENT)
3369 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3370
3613 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { 3371 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3614 /* Window is constant, pure forward advance. 3372 /* Window is constant, pure forward advance.
3615 * No more checks are required. 3373 * No more checks are required.
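
The new FLAG_UPDATE_TS_RECENT path defers the ts_recent update into tcp_ack(), after the in-window checks. The rule itself is unchanged from the relocated helpers: refresh ts_recent only when a timestamp was seen, the segment does not start beyond rcv_wup, and PAWS accepts it. A hedged stand-alone restatement, with paws_ok standing in for tcp_paws_check(&tp->rx_opt, 0):

#include <stdbool.h>
#include <stdint.h>

bool may_update_ts_recent(bool saw_tstamp, uint32_t seq, uint32_t rcv_wup,
			  bool paws_ok)
{
	bool not_after_wup = (int32_t)(seq - rcv_wup) <= 0; /* !after(seq, rcv_wup) */

	return saw_tstamp && not_after_wup && paws_ok;
}
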
@@ -3654,30 +3412,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3654 3412
3655 pkts_acked = prior_packets - tp->packets_out; 3413 pkts_acked = prior_packets - tp->packets_out;
3656 3414
3657 if (tp->frto_counter)
3658 frto_cwnd = tcp_process_frto(sk, flag);
3659 /* Guarantee sacktag reordering detection against wrap-arounds */
3660 if (before(tp->frto_highmark, tp->snd_una))
3661 tp->frto_highmark = 0;
3662
3663 if (tcp_ack_is_dubious(sk, flag)) { 3415 if (tcp_ack_is_dubious(sk, flag)) {
3664 /* Advance CWND, if state allows this. */ 3416 /* Advance CWND, if state allows this. */
3665 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && 3417 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
3666 tcp_may_raise_cwnd(sk, flag))
3667 tcp_cong_avoid(sk, ack, prior_in_flight); 3418 tcp_cong_avoid(sk, ack, prior_in_flight);
3668 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3419 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3669 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3420 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3670 is_dupack, flag); 3421 is_dupack, flag);
3671 } else { 3422 } else {
3672 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3423 if (flag & FLAG_DATA_ACKED)
3673 tcp_cong_avoid(sk, ack, prior_in_flight); 3424 tcp_cong_avoid(sk, ack, prior_in_flight);
3674 } 3425 }
3675 3426
3427 if (tp->tlp_high_seq)
3428 tcp_process_tlp_ack(sk, ack, flag);
3429
3676 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3430 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3677 struct dst_entry *dst = __sk_dst_get(sk); 3431 struct dst_entry *dst = __sk_dst_get(sk);
3678 if (dst) 3432 if (dst)
3679 dst_confirm(dst); 3433 dst_confirm(dst);
3680 } 3434 }
3435
3436 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3437 tcp_schedule_loss_probe(sk);
3681 return 1; 3438 return 1;
3682 3439
3683no_queue: 3440no_queue:
@@ -3691,6 +3448,9 @@ no_queue:
3691 */ 3448 */
3692 if (tcp_send_head(sk)) 3449 if (tcp_send_head(sk))
3693 tcp_ack_probe(sk); 3450 tcp_ack_probe(sk);
3451
3452 if (tp->tlp_high_seq)
3453 tcp_process_tlp_ack(sk, ack, flag);
3694 return 1; 3454 return 1;
3695 3455
3696invalid_ack: 3456invalid_ack:
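
tcp_ack() now calls tcp_process_tlp_ack() whenever a loss-probe episode is open (tp->tlp_high_seq != 0), including on the no_queue path. A condensed model of the episode-ending rules from that helper; pure_dupack stands for "no new data, window update, or SACK information in this ACK", and the CWR reduction is reduced to a return code.

#include <stdbool.h>
#include <stdint.h>

enum tlp_outcome {
	TLP_EPISODE_OPEN,	/* keep the episode open                  */
	TLP_PROBE_REPAIRED,	/* pure dupack for the probe: just end it */
	TLP_LOSS_RECOVERED,	/* ack beyond the probe: reduce cwnd once */
	TLP_PROBE_DSACKED,	/* DSACK for the probe: end, no reduction */
};

enum tlp_outcome process_tlp_ack(uint32_t ack, uint32_t tlp_high_seq,
				 bool pure_dupack, bool dsacking_ack)
{
	if (pure_dupack && ack == tlp_high_seq)
		return TLP_PROBE_REPAIRED;
	if ((int32_t)(ack - tlp_high_seq) > 0)	/* after(ack, tlp_high_seq) */
		return dsacking_ack ? TLP_PROBE_DSACKED : TLP_LOSS_RECOVERED;
	return TLP_EPISODE_OPEN;
}
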
@@ -3715,8 +3475,8 @@ old_ack:
3715 * But, this can also be called on packets in the established flow when 3475 * But, this can also be called on packets in the established flow when
3716 * the fast version below fails. 3476 * the fast version below fails.
3717 */ 3477 */
3718void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3478void tcp_parse_options(const struct sk_buff *skb,
3719 const u8 **hvpp, int estab, 3479 struct tcp_options_received *opt_rx, int estab,
3720 struct tcp_fastopen_cookie *foc) 3480 struct tcp_fastopen_cookie *foc)
3721{ 3481{
3722 const unsigned char *ptr; 3482 const unsigned char *ptr;
@@ -3800,31 +3560,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3800 */ 3560 */
3801 break; 3561 break;
3802#endif 3562#endif
3803 case TCPOPT_COOKIE:
3804 /* This option is variable length.
3805 */
3806 switch (opsize) {
3807 case TCPOLEN_COOKIE_BASE:
3808 /* not yet implemented */
3809 break;
3810 case TCPOLEN_COOKIE_PAIR:
3811 /* not yet implemented */
3812 break;
3813 case TCPOLEN_COOKIE_MIN+0:
3814 case TCPOLEN_COOKIE_MIN+2:
3815 case TCPOLEN_COOKIE_MIN+4:
3816 case TCPOLEN_COOKIE_MIN+6:
3817 case TCPOLEN_COOKIE_MAX:
3818 /* 16-bit multiple */
3819 opt_rx->cookie_plus = opsize;
3820 *hvpp = ptr;
3821 break;
3822 default:
3823 /* ignore option */
3824 break;
3825 }
3826 break;
3827
3828 case TCPOPT_EXP: 3563 case TCPOPT_EXP:
3829 /* Fast Open option shares code 254 using a 3564 /* Fast Open option shares code 254 using a
3830 * 16 bits magic number. It's valid only in 3565 * 16 bits magic number. It's valid only in
@@ -3870,8 +3605,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3870 * If it is wrong it falls back on tcp_parse_options(). 3605 * If it is wrong it falls back on tcp_parse_options().
3871 */ 3606 */
3872static bool tcp_fast_parse_options(const struct sk_buff *skb, 3607static bool tcp_fast_parse_options(const struct sk_buff *skb,
3873 const struct tcphdr *th, 3608 const struct tcphdr *th, struct tcp_sock *tp)
3874 struct tcp_sock *tp, const u8 **hvpp)
3875{ 3609{
3876 /* In the spirit of fast parsing, compare doff directly to constant 3610 /* In the spirit of fast parsing, compare doff directly to constant
3877 * values. Because equality is used, short doff can be ignored here. 3611 * values. Because equality is used, short doff can be ignored here.
@@ -3885,7 +3619,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3885 return true; 3619 return true;
3886 } 3620 }
3887 3621
3888 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); 3622 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3889 if (tp->rx_opt.saw_tstamp) 3623 if (tp->rx_opt.saw_tstamp)
3890 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 3624 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3891 3625
@@ -3930,27 +3664,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
3930EXPORT_SYMBOL(tcp_parse_md5sig_option); 3664EXPORT_SYMBOL(tcp_parse_md5sig_option);
3931#endif 3665#endif
3932 3666
3933static inline void tcp_store_ts_recent(struct tcp_sock *tp)
3934{
3935 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3936 tp->rx_opt.ts_recent_stamp = get_seconds();
3937}
3938
3939static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3940{
3941 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3942 /* PAWS bug workaround wrt. ACK frames, the PAWS discard
3943 * extra check below makes sure this can only happen
3944 * for pure ACK frames. -DaveM
3945 *
3946 * Not only, also it occurs for expired timestamps.
3947 */
3948
3949 if (tcp_paws_check(&tp->rx_opt, 0))
3950 tcp_store_ts_recent(tp);
3951 }
3952}
3953
3954/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM 3667/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
3955 * 3668 *
3956 * It is not fatal. If this ACK does _not_ change critical state (seqs, window) 3669 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
@@ -5266,12 +4979,10 @@ out:
5266static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 4979static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5267 const struct tcphdr *th, int syn_inerr) 4980 const struct tcphdr *th, int syn_inerr)
5268{ 4981{
5269 const u8 *hash_location;
5270 struct tcp_sock *tp = tcp_sk(sk); 4982 struct tcp_sock *tp = tcp_sk(sk);
5271 4983
5272 /* RFC1323: H1. Apply PAWS check first. */ 4984 /* RFC1323: H1. Apply PAWS check first. */
5273 if (tcp_fast_parse_options(skb, th, tp, &hash_location) && 4985 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
5274 tp->rx_opt.saw_tstamp &&
5275 tcp_paws_discard(sk, skb)) { 4986 tcp_paws_discard(sk, skb)) {
5276 if (!th->rst) { 4987 if (!th->rst) {
5277 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 4988 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5546,14 +5257,9 @@ slow_path:
5546 return 0; 5257 return 0;
5547 5258
5548step5: 5259step5:
5549 if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) 5260 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
5550 goto discard; 5261 goto discard;
5551 5262
5552 /* ts_recent update must be made after we are sure that the packet
5553 * is in window.
5554 */
5555 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5556
5557 tcp_rcv_rtt_measure_ts(sk, skb); 5263 tcp_rcv_rtt_measure_ts(sk, skb);
5558 5264
5559 /* Process urgent data. */ 5265 /* Process urgent data. */
@@ -5567,6 +5273,7 @@ step5:
5567 return 0; 5273 return 0;
5568 5274
5569csum_error: 5275csum_error:
5276 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
5570 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5277 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5571 5278
5572discard: 5279discard:
@@ -5625,12 +5332,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5625 5332
5626 if (mss == tp->rx_opt.user_mss) { 5333 if (mss == tp->rx_opt.user_mss) {
5627 struct tcp_options_received opt; 5334 struct tcp_options_received opt;
5628 const u8 *hash_location;
5629 5335
5630 /* Get original SYNACK MSS value if user MSS sets mss_clamp */ 5336 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5631 tcp_clear_options(&opt); 5337 tcp_clear_options(&opt);
5632 opt.user_mss = opt.mss_clamp = 0; 5338 opt.user_mss = opt.mss_clamp = 0;
5633 tcp_parse_options(synack, &opt, &hash_location, 0, NULL); 5339 tcp_parse_options(synack, &opt, 0, NULL);
5634 mss = opt.mss_clamp; 5340 mss = opt.mss_clamp;
5635 } 5341 }
5636 5342
@@ -5661,14 +5367,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5661static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5367static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5662 const struct tcphdr *th, unsigned int len) 5368 const struct tcphdr *th, unsigned int len)
5663{ 5369{
5664 const u8 *hash_location;
5665 struct inet_connection_sock *icsk = inet_csk(sk); 5370 struct inet_connection_sock *icsk = inet_csk(sk);
5666 struct tcp_sock *tp = tcp_sk(sk); 5371 struct tcp_sock *tp = tcp_sk(sk);
5667 struct tcp_cookie_values *cvp = tp->cookie_values;
5668 struct tcp_fastopen_cookie foc = { .len = -1 }; 5372 struct tcp_fastopen_cookie foc = { .len = -1 };
5669 int saved_clamp = tp->rx_opt.mss_clamp; 5373 int saved_clamp = tp->rx_opt.mss_clamp;
5670 5374
5671 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); 5375 tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5672 if (tp->rx_opt.saw_tstamp) 5376 if (tp->rx_opt.saw_tstamp)
5673 tp->rx_opt.rcv_tsecr -= tp->tsoffset; 5377 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5674 5378
@@ -5765,30 +5469,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5765 * is initialized. */ 5469 * is initialized. */
5766 tp->copied_seq = tp->rcv_nxt; 5470 tp->copied_seq = tp->rcv_nxt;
5767 5471
5768 if (cvp != NULL &&
5769 cvp->cookie_pair_size > 0 &&
5770 tp->rx_opt.cookie_plus > 0) {
5771 int cookie_size = tp->rx_opt.cookie_plus
5772 - TCPOLEN_COOKIE_BASE;
5773 int cookie_pair_size = cookie_size
5774 + cvp->cookie_desired;
5775
5776 /* A cookie extension option was sent and returned.
5777 * Note that each incoming SYNACK replaces the
5778 * Responder cookie. The initial exchange is most
5779 * fragile, as protection against spoofing relies
5780 * entirely upon the sequence and timestamp (above).
5781 * This replacement strategy allows the correct pair to
5782 * pass through, while any others will be filtered via
5783 * Responder verification later.
5784 */
5785 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
5786 memcpy(&cvp->cookie_pair[cvp->cookie_desired],
5787 hash_location, cookie_size);
5788 cvp->cookie_pair_size = cookie_pair_size;
5789 }
5790 }
5791
5792 smp_mb(); 5472 smp_mb();
5793 5473
5794 tcp_finish_connect(sk, skb); 5474 tcp_finish_connect(sk, skb);
@@ -5989,7 +5669,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5989 5669
5990 /* step 5: check the ACK field */ 5670 /* step 5: check the ACK field */
5991 if (true) { 5671 if (true) {
5992 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5672 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5673 FLAG_UPDATE_TS_RECENT) > 0;
5993 5674
5994 switch (sk->sk_state) { 5675 switch (sk->sk_state) {
5995 case TCP_SYN_RECV: 5676 case TCP_SYN_RECV:
@@ -6140,11 +5821,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6140 } 5821 }
6141 } 5822 }
6142 5823
6143 /* ts_recent update must be made after we are sure that the packet
6144 * is in window.
6145 */
6146 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
6147
6148 /* step 6: check the URG bit */ 5824 /* step 6: check the URG bit */
6149 tcp_urg(sk, skb, th); 5825 tcp_urg(sk, skb, th);
6150 5826