Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  |  669
1 file changed, 174 insertions(+), 495 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3bd55bad230a..08bbe6096528 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,12 +93,11 @@ int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_frto_response __read_mostly;
 
 int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_early_retrans __read_mostly = 2;
+int sysctl_tcp_early_retrans __read_mostly = 3;
 
 #define FLAG_DATA 0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -108,17 +107,16 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
 #define FLAG_DATA_SACKED 0x20 /* New SACK. */
 #define FLAG_ECE 0x40 /* ECE in this ACK */
 #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
-#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
-#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
 #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
 
 #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
 #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
 #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
-#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
 
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -1159,10 +1157,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 						   tcp_highest_sack_seq(tp)))
 					state->reord = min(fack_count,
 							   state->reord);
-
-				/* SACK enhanced F-RTO (RFC4138; Appendix B) */
-				if (!after(end_seq, tp->frto_highmark))
-					state->flag |= FLAG_ONLY_ORIG_SACKED;
+				if (!after(end_seq, tp->high_seq))
+					state->flag |= FLAG_ORIG_SACK_ACKED;
 			}
 
 			if (sacked & TCPCB_LOST) {
@@ -1555,7 +1551,6 @@ static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 			u32 prior_snd_una)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
 				    TCP_SKB_CB(ack_skb)->sacked);
@@ -1728,12 +1723,6 @@ walk:
 					start_seq, end_seq, dup_sack);
 
 advance_sp:
-		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
-		 * due to in-order walk
-		 */
-		if (after(end_seq, tp->frto_highmark))
-			state.flag &= ~FLAG_ONLY_ORIG_SACKED;
-
 		i++;
 	}
 
@@ -1750,8 +1739,7 @@ advance_sp:
 	tcp_verify_left_out(tp);
 
 	if ((state.reord < tp->fackets_out) &&
-	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
-	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
 
 out:
@@ -1825,197 +1813,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-static int tcp_is_sackfrto(const struct tcp_sock *tp)
-{
-	return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
-}
-
-/* F-RTO can only be used if TCP has never retransmitted anything other than
- * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
- */
-bool tcp_use_frto(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct sk_buff *skb;
-
-	if (!sysctl_tcp_frto)
-		return false;
-
-	/* MTU probe and F-RTO won't really play nicely along currently */
-	if (icsk->icsk_mtup.probe_size)
-		return false;
-
-	if (tcp_is_sackfrto(tp))
-		return true;
-
-	/* Avoid expensive walking of rexmit queue if possible */
-	if (tp->retrans_out > 1)
-		return false;
-
-	skb = tcp_write_queue_head(sk);
-	if (tcp_skb_is_last(sk, skb))
-		return true;
-	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
-			return false;
-		/* Short-circuit when first non-SACKed skb has been checked */
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
-			break;
-	}
-	return true;
-}
-
-/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
- * recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
- * keep retrans_out counting accurate (with SACK F-RTO, other than head
- * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
- * bits are handled if the Loss state is really to be entered (in
- * tcp_enter_frto_loss).
- *
- * Do like tcp_enter_loss() would; when RTO expires the second time it
- * does:
- *  "Reduce ssthresh if it has not yet been made inside this window."
- */
-void tcp_enter_frto(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-
-	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
-	    tp->snd_una == tp->high_seq ||
-	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
-	     !icsk->icsk_retransmits)) {
-		tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		/* Our state is too optimistic in ssthresh() call because cwnd
-		 * is not reduced until tcp_enter_frto_loss() when previous F-RTO
-		 * recovery has not yet completed. Pattern would be this: RTO,
-		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
-		 * up here twice).
-		 * RFC4138 should be more specific on what to do, even though
-		 * RTO is quite unlikely to occur after the first Cumulative ACK
-		 * due to back-off and complexity of triggering events ...
-		 */
-		if (tp->frto_counter) {
-			u32 stored_cwnd;
-			stored_cwnd = tp->snd_cwnd;
-			tp->snd_cwnd = 2;
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-			tp->snd_cwnd = stored_cwnd;
-		} else {
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		}
-		/* ... in theory, cong.control module could do "any tricks" in
-		 * ssthresh(), which means that ca_state, lost bits and lost_out
-		 * counter would have to be faked before the call occurs. We
-		 * consider that too expensive, unlikely and hacky, so modules
-		 * using these in ssthresh() must deal these incompatibility
-		 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
-		 */
-		tcp_ca_event(sk, CA_EVENT_FRTO);
-	}
-
-	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = 0;
-
-	skb = tcp_write_queue_head(sk);
-	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
-		tp->undo_marker = 0;
-	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
-		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-		tp->retrans_out -= tcp_skb_pcount(skb);
-	}
-	tcp_verify_left_out(tp);
-
-	/* Too bad if TCP was application limited */
-	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-
-	/* Earlier loss recovery underway (see RFC4138; Appendix B).
-	 * The last condition is necessary at least in tp->frto_counter case.
-	 */
-	if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
-	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
-	    after(tp->high_seq, tp->snd_una)) {
-		tp->frto_highmark = tp->high_seq;
-	} else {
-		tp->frto_highmark = tp->snd_nxt;
-	}
-	tcp_set_ca_state(sk, TCP_CA_Disorder);
-	tp->high_seq = tp->snd_nxt;
-	tp->frto_counter = 1;
-}
-
-/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
- * which indicates that we should follow the traditional RTO recovery,
- * i.e. mark everything lost and do go-back-N retransmission.
- */
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-
-	tp->lost_out = 0;
-	tp->retrans_out = 0;
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-
-		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
-		/*
-		 * Count the retransmission made on RTO correctly (only when
-		 * waiting for the first ACK and did not get it)...
-		 */
-		if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
-			/* For some reason this R-bit might get cleared? */
-			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
-				tp->retrans_out += tcp_skb_pcount(skb);
-			/* ...enter this if branch just for the first segment */
-			flag |= FLAG_DATA_ACKED;
-		} else {
-			if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
-				tp->undo_marker = 0;
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-		}
-
-		/* Marking forward transmissions that were made after RTO lost
-		 * can cause unnecessary retransmissions in some scenarios,
-		 * SACK blocks will mitigate that in some but not in all cases.
-		 * We used to not mark them but it was causing break-ups with
-		 * receivers that do only in-order receival.
-		 *
-		 * TODO: we could detect presence of such receiver and select
-		 * different behavior per flow.
-		 */
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
-		}
-	}
-	tcp_verify_left_out(tp);
-
-	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
-	tp->snd_cwnd_cnt = 0;
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tp->frto_counter = 0;
-
-	tp->reordering = min_t(unsigned int, tp->reordering,
-			       sysctl_tcp_reordering);
-	tcp_set_ca_state(sk, TCP_CA_Loss);
-	tp->high_seq = tp->snd_nxt;
-	TCP_ECN_queue_cwr(tp);
-
-	tcp_clear_all_retrans_hints(tp);
-}
-
 static void tcp_clear_retrans_partial(struct tcp_sock *tp)
 {
 	tp->retrans_out = 0;
@@ -2042,10 +1839,13 @@ void tcp_enter_loss(struct sock *sk, int how)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
+	bool new_recovery = false;
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
-	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+	    !after(tp->high_seq, tp->snd_una) ||
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		new_recovery = true;
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2087,8 +1887,14 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
-	/* Abort F-RTO algorithm if one is in progress */
-	tp->frto_counter = 0;
+
+	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+	 * loss recovery is underway except recurring timeout(s) on
+	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+	 */
+	tp->frto = sysctl_tcp_frto &&
+		   (new_recovery || icsk->icsk_retransmits) &&
+		   !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2147,15 +1953,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
 	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
 	 * available, or RTO is scheduled to fire first.
 	 */
-	if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+	    (flag & FLAG_ECE) || !tp->srtt)
 		return false;
 
 	delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
 	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
 		return false;
 
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
-	tp->early_retrans_delayed = 1;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+				  TCP_RTO_MAX);
 	return true;
 }
 
@@ -2271,10 +2078,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
 
-	/* Do not perform any recovery during F-RTO algorithm */
-	if (tp->frto_counter)
-		return false;
-
 	/* Trick#1: The loss is proven. */
 	if (tp->lost_out)
 		return true;
@@ -2318,7 +2121,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 	 * interval if appropriate.
 	 */
 	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
 	    !tcp_may_send_now(sk))
 		return !tcp_pause_early_retransmit(sk, flag);
 
@@ -2635,12 +2438,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
 	return failed;
 }
 
-/* Undo during loss recovery after partial ACK. */
-static bool tcp_try_undo_loss(struct sock *sk)
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_may_undo(tp)) {
+	if (frto_undo || tcp_may_undo(tp)) {
 		struct sk_buff *skb;
 		tcp_for_write_queue(skb, sk) {
 			if (skb == tcp_send_head(sk))
@@ -2654,9 +2457,12 @@ static bool tcp_try_undo_loss(struct sock *sk)
 		tp->lost_out = 0;
 		tcp_undo_cwr(sk, true);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+		if (frto_undo)
+			NET_INC_STATS_BH(sock_net(sk),
+					 LINUX_MIB_TCPSPURIOUSRTOS);
 		inet_csk(sk)->icsk_retransmits = 0;
 		tp->undo_marker = 0;
-		if (tcp_is_sack(tp))
+		if (frto_undo || tcp_is_sack(tp))
 			tcp_set_ca_state(sk, TCP_CA_Open);
 		return true;
 	}
@@ -2678,6 +2484,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->high_seq = tp->snd_nxt;
+	tp->tlp_high_seq = 0;
 	tp->snd_cwnd_cnt = 0;
 	tp->prior_cwnd = tp->snd_cwnd;
 	tp->prr_delivered = 0;
@@ -2755,7 +2562,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
 
 	tcp_verify_left_out(tp);
 
-	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
+	if (!tcp_any_retrans_done(sk))
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
@@ -2872,6 +2679,58 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool recovered = !before(tp->snd_una, tp->high_seq);
+
+	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+		if (flag & FLAG_ORIG_SACK_ACKED) {
+			/* Step 3.b. A timeout is spurious if not all data are
+			 * lost, i.e., never-retransmitted data are (s)acked.
+			 */
+			tcp_try_undo_loss(sk, true);
+			return;
+		}
+		if (after(tp->snd_nxt, tp->high_seq) &&
+		    (flag & FLAG_DATA_SACKED || is_dupack)) {
+			tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+			tp->high_seq = tp->snd_nxt;
+			__tcp_push_pending_frames(sk, tcp_current_mss(sk),
+						  TCP_NAGLE_OFF);
+			if (after(tp->snd_nxt, tp->high_seq))
+				return; /* Step 2.b */
+			tp->frto = 0;
+		}
+	}
+
+	if (recovered) {
+		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+		icsk->icsk_retransmits = 0;
+		tcp_try_undo_recovery(sk);
+		return;
+	}
+	if (flag & FLAG_DATA_ACKED)
+		icsk->icsk_retransmits = 0;
+	if (tcp_is_reno(tp)) {
+		/* A Reno DUPACK means new data in F-RTO step 2.b above are
+		 * delivered. Lower inflight to clock out (re)tranmissions.
+		 */
+		if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+			tcp_add_reno_sack(sk);
+		else if (flag & FLAG_SND_UNA_ADVANCED)
+			tcp_reset_reno_sack(tp);
+	}
+	if (tcp_try_undo_loss(sk, false))
+		return;
+	tcp_xmit_retransmit_queue(sk);
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2918,12 +2777,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
 		switch (icsk->icsk_ca_state) {
-		case TCP_CA_Loss:
-			icsk->icsk_retransmits = 0;
-			if (tcp_try_undo_recovery(sk))
-				return;
-			break;
-
 		case TCP_CA_CWR:
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
@@ -2954,18 +2807,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 		break;
 	case TCP_CA_Loss:
-		if (flag & FLAG_DATA_ACKED)
-			icsk->icsk_retransmits = 0;
-		if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
-			tcp_reset_reno_sack(tp);
-		if (!tcp_try_undo_loss(sk)) {
-			tcp_moderate_cwnd(tp);
-			tcp_xmit_retransmit_queue(sk);
-			return;
-		}
+		tcp_process_loss(sk, flag, is_dupack);
 		if (icsk->icsk_ca_state != TCP_CA_Open)
 			return;
-		/* Loss is undone; fall through to processing in Open state. */
+		/* Fall through to processing in Open state. */
 	default:
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3078,6 +2923,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
  */
 void tcp_rearm_rto(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* If the retrans timer is currently being used by Fast Open
@@ -3091,12 +2937,13 @@ void tcp_rearm_rto(struct sock *sk)
 	} else {
 		u32 rto = inet_csk(sk)->icsk_rto;
 		/* Offset the time elapsed after installing regular RTO */
-		if (tp->early_retrans_delayed) {
+		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
 			s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
 			/* delta may not be positive if the socket is locked
-			 * when the delayed ER timer fires and is rescheduled.
+			 * when the retrans timer fires and is rescheduled.
 			 */
 			if (delta > 0)
 				rto = delta;
@@ -3104,7 +2951,6 @@ void tcp_rearm_rto(struct sock *sk)
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);
 	}
-	tp->early_retrans_delayed = 0;
 }
 
 /* This function is called when the delayed ER timer fires. TCP enters
@@ -3192,8 +3038,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				flag |= FLAG_RETRANS_DATA_ACKED;
 				ca_seq_rtt = -1;
 				seq_rtt = -1;
-				if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
-					flag |= FLAG_NONHEAD_RETRANS_ACKED;
 			} else {
 				ca_seq_rtt = now - scb->when;
 				last_ackt = skb->tstamp;
@@ -3202,6 +3046,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			}
 			if (!(sacked & TCPCB_SACKED_ACKED))
 				reord = min(pkts_acked, reord);
+			if (!after(scb->end_seq, tp->high_seq))
+				flag |= FLAG_ORIG_SACK_ACKED;
 		}
 
 		if (sacked & TCPCB_SACKED_ACKED)
@@ -3402,165 +3248,74 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 	return flag;
 }
 
-/* A very conservative spurious RTO response algorithm: reduce cwnd and
- * continue in congestion avoidance.
- */
-static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk)
 {
-	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-	tp->snd_cwnd_cnt = 0;
-	TCP_ECN_queue_cwr(tp);
-	tcp_moderate_cwnd(tp);
-}
+	/* unprotected vars, we dont care of overwrites */
+	static u32 challenge_timestamp;
+	static unsigned int challenge_count;
+	u32 now = jiffies / HZ;
 
-/* A conservative spurious RTO response algorithm: reduce cwnd using
- * PRR and continue in congestion avoidance.
- */
-static void tcp_cwr_spur_to_response(struct sock *sk)
-{
-	tcp_enter_cwr(sk, 0);
+	if (now != challenge_timestamp) {
+		challenge_timestamp = now;
+		challenge_count = 0;
+	}
+	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		tcp_send_ack(sk);
+	}
 }
 
-static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+static void tcp_store_ts_recent(struct tcp_sock *tp)
 {
-	if (flag & FLAG_ECE)
-		tcp_cwr_spur_to_response(sk);
-	else
-		tcp_undo_cwr(sk, true);
+	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+	tp->rx_opt.ts_recent_stamp = get_seconds();
 }
 
-/* F-RTO spurious RTO detection algorithm (RFC4138)
- *
- * F-RTO affects during two new ACKs following RTO (well, almost, see inline
- * comments). State (ACK number) is kept in frto_counter. When ACK advances
- * window (but not to or beyond highest sequence sent before RTO):
- *   On First ACK,  send two new segments out.
- *   On Second ACK, RTO was likely spurious. Do spurious response (response
- *                  algorithm is not part of the F-RTO detection algorithm
- *                  given in RFC4138 but can be selected separately).
- * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
- * of Nagle, this is done using frto_counter states 2 and 3, when a new data
- * segment of any size sent during F-RTO, state 2 is upgraded to 3.
- *
- * Rationale: if the RTO was spurious, new ACKs should arrive from the
- * original window even after we transmit two new data segments.
- *
- * SACK version:
- *   on first step, wait until first cumulative ACK arrives, then move to
- *   the second step. In second step, the next ACK decides.
- *
- * F-RTO is implemented (mainly) in four functions:
- *   - tcp_use_frto() is used to determine if TCP is can use F-RTO
- *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
- *     called when tcp_use_frto() showed green light
- *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
- *   - tcp_enter_frto_loss() is called if there is not enough evidence
- *     to prove that the RTO is indeed spurious. It transfers the control
- *     from F-RTO to the conventional RTO recovery
- */
-static bool tcp_process_frto(struct sock *sk, int flag)
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_verify_left_out(tp);
-
-	/* Duplicate the behavior from Loss state (fastretrans_alert) */
-	if (flag & FLAG_DATA_ACKED)
-		inet_csk(sk)->icsk_retransmits = 0;
-
-	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
-	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
-		tp->undo_marker = 0;
-
-	if (!before(tp->snd_una, tp->frto_highmark)) {
-		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
-		return true;
-	}
-
-	if (!tcp_is_sackfrto(tp)) {
-		/* RFC4138 shortcoming in step 2; should also have case c):
-		 * ACK isn't duplicate nor advances window, e.g., opposite dir
-		 * data, winupdate
+	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
+		 * extra check below makes sure this can only happen
+		 * for pure ACK frames. -DaveM
+		 *
+		 * Not only, also it occurs for expired timestamps.
 		 */
-		if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
-			return true;
 
-		if (!(flag & FLAG_DATA_ACKED)) {
-			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
-					    flag);
-			return true;
-		}
-	} else {
-		if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
-			if (!tcp_packets_in_flight(tp)) {
-				tcp_enter_frto_loss(sk, 2, flag);
-				return true;
-			}
-
-			/* Prevent sending of new data. */
-			tp->snd_cwnd = min(tp->snd_cwnd,
-					   tcp_packets_in_flight(tp));
-			return true;
-		}
-
-		if ((tp->frto_counter >= 2) &&
-		    (!(flag & FLAG_FORWARD_PROGRESS) ||
-		     ((flag & FLAG_DATA_SACKED) &&
-		      !(flag & FLAG_ONLY_ORIG_SACKED)))) {
-			/* RFC4138 shortcoming (see comment above) */
-			if (!(flag & FLAG_FORWARD_PROGRESS) &&
-			    (flag & FLAG_NOT_DUP))
-				return true;
-
-			tcp_enter_frto_loss(sk, 3, flag);
-			return true;
-		}
-	}
-
-	if (tp->frto_counter == 1) {
-		/* tcp_may_send_now needs to see updated state */
-		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
-		tp->frto_counter = 2;
-
-		if (!tcp_may_send_now(sk))
-			tcp_enter_frto_loss(sk, 2, flag);
-
-		return true;
-	} else {
-		switch (sysctl_tcp_frto_response) {
-		case 2:
-			tcp_undo_spur_to_response(sk, flag);
-			break;
-		case 1:
-			tcp_conservative_spur_to_response(tp);
-			break;
-		default:
-			tcp_cwr_spur_to_response(sk);
-			break;
-		}
-		tp->frto_counter = 0;
-		tp->undo_marker = 0;
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
+		if (tcp_paws_check(&tp->rx_opt, 0))
+			tcp_store_ts_recent(tp);
 	}
-	return false;
 }
 
-/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+/* This routine deals with acks during a TLP episode.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
-	/* unprotected vars, we dont care of overwrites */
-	static u32 challenge_timestamp;
-	static unsigned int challenge_count;
-	u32 now = jiffies / HZ;
-
-	if (now != challenge_timestamp) {
-		challenge_timestamp = now;
-		challenge_count = 0;
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
+			     !(flag & (FLAG_SND_UNA_ADVANCED |
+				       FLAG_NOT_DUP | FLAG_DATA_SACKED));
+
+	/* Mark the end of TLP episode on receiving TLP dupack or when
+	 * ack is after tlp_high_seq.
+	 */
+	if (is_tlp_dupack) {
+		tp->tlp_high_seq = 0;
+		return;
 	}
-	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
-		tcp_send_ack(sk);
+
+	if (after(ack, tp->tlp_high_seq)) {
+		tp->tlp_high_seq = 0;
+		/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
+		if (!(flag & FLAG_DSACKING_ACK)) {
+			tcp_init_cwnd_reduction(sk, true);
+			tcp_set_ca_state(sk, TCP_CA_CWR);
+			tcp_end_cwnd_reduction(sk);
+			tcp_set_ca_state(sk, TCP_CA_Open);
+			NET_INC_STATS_BH(sock_net(sk),
+					 LINUX_MIB_TCPLOSSPROBERECOVERY);
+		}
 	}
 }
 
@@ -3578,7 +3333,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets;
 	int prior_sacked = tp->sacked_out;
 	int pkts_acked = 0;
-	bool frto_cwnd = false;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3598,7 +3352,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (tp->early_retrans_delayed)
+	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
 	if (after(ack, prior_snd_una))
@@ -3607,6 +3362,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	prior_fackets = tp->fackets_out;
 	prior_in_flight = tcp_packets_in_flight(tp);
 
+	/* ts_recent update must be made after we are sure that the packet
+	 * is in window.
+	 */
+	if (flag & FLAG_UPDATE_TS_RECENT)
+		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
 	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
 		/* Window is constant, pure forward advance.
 		 * No more checks are required.
@@ -3651,30 +3412,29 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	pkts_acked = prior_packets - tp->packets_out;
 
-	if (tp->frto_counter)
-		frto_cwnd = tcp_process_frto(sk, flag);
-	/* Guarantee sacktag reordering detection against wrap-arounds */
-	if (before(tp->frto_highmark, tp->snd_una))
-		tp->frto_highmark = 0;
-
 	if (tcp_ack_is_dubious(sk, flag)) {
 		/* Advance CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
-		    tcp_may_raise_cwnd(sk, flag))
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	} else {
-		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
+		if (flag & FLAG_DATA_ACKED)
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 	}
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
 		struct dst_entry *dst = __sk_dst_get(sk);
 		if (dst)
 			dst_confirm(dst);
 	}
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+		tcp_schedule_loss_probe(sk);
 	return 1;
 
 no_queue:
@@ -3688,6 +3448,9 @@ no_queue:
 	 */
 	if (tcp_send_head(sk))
 		tcp_ack_probe(sk);
+
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
 	return 1;
 
 invalid_ack:
@@ -3712,8 +3475,8 @@ old_ack:
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
  */
-void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       const u8 **hvpp, int estab,
+void tcp_parse_options(const struct sk_buff *skb,
+		       struct tcp_options_received *opt_rx, int estab,
 		       struct tcp_fastopen_cookie *foc)
 {
 	const unsigned char *ptr;
@@ -3797,31 +3560,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
 			 */
 			break;
 #endif
-		case TCPOPT_COOKIE:
-			/* This option is variable length.
-			 */
-			switch (opsize) {
-			case TCPOLEN_COOKIE_BASE:
-				/* not yet implemented */
-				break;
-			case TCPOLEN_COOKIE_PAIR:
-				/* not yet implemented */
-				break;
-			case TCPOLEN_COOKIE_MIN+0:
-			case TCPOLEN_COOKIE_MIN+2:
-			case TCPOLEN_COOKIE_MIN+4:
-			case TCPOLEN_COOKIE_MIN+6:
-			case TCPOLEN_COOKIE_MAX:
-				/* 16-bit multiple */
-				opt_rx->cookie_plus = opsize;
-				*hvpp = ptr;
-				break;
-			default:
-				/* ignore option */
-				break;
-			}
-			break;
-
 		case TCPOPT_EXP:
 			/* Fast Open option shares code 254 using a
 			 * 16 bits magic number. It's valid only in
@@ -3867,8 +3605,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
  * If it is wrong it falls back on tcp_parse_options().
  */
 static bool tcp_fast_parse_options(const struct sk_buff *skb,
-				   const struct tcphdr *th,
-				   struct tcp_sock *tp, const u8 **hvpp)
+				   const struct tcphdr *th, struct tcp_sock *tp)
 {
 	/* In the spirit of fast parsing, compare doff directly to constant
 	 * values. Because equality is used, short doff can be ignored here.
@@ -3882,7 +3619,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 		return true;
 	}
 
-	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
+	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
 	if (tp->rx_opt.saw_tstamp)
 		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -3927,27 +3664,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
 EXPORT_SYMBOL(tcp_parse_md5sig_option);
 #endif
 
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
-	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
-}
-
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
-{
-	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
-		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
-		 * extra check below makes sure this can only happen
-		 * for pure ACK frames. -DaveM
-		 *
-		 * Not only, also it occurs for expired timestamps.
-		 */
-
-		if (tcp_paws_check(&tp->rx_opt, 0))
-			tcp_store_ts_recent(tp);
-	}
-}
-
 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
  *
  * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
@@ -5263,12 +4979,10 @@ out:
 static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 				  const struct tcphdr *th, int syn_inerr)
 {
-	const u8 *hash_location;
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* RFC1323: H1. Apply PAWS check first. */
-	if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
-	    tp->rx_opt.saw_tstamp &&
+	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
 	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5543,14 +5257,9 @@ slow_path:
 		return 0;
 
 step5:
-	if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
 		goto discard;
 
-	/* ts_recent update must be made after we are sure that the packet
-	 * is in window.
-	 */
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
 	tcp_rcv_rtt_measure_ts(sk, skb);
 
 	/* Process urgent data. */
@@ -5564,6 +5273,7 @@ step5:
 	return 0;
 
 csum_error:
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 
 discard:
@@ -5622,12 +5332,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 
 	if (mss == tp->rx_opt.user_mss) {
 		struct tcp_options_received opt;
-		const u8 *hash_location;
 
 		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
 		tcp_clear_options(&opt);
 		opt.user_mss = opt.mss_clamp = 0;
-		tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+		tcp_parse_options(synack, &opt, 0, NULL);
 		mss = opt.mss_clamp;
 	}
 
@@ -5658,14 +5367,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 const struct tcphdr *th, unsigned int len)
 {
-	const u8 *hash_location;
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_cookie_values *cvp = tp->cookie_values;
 	struct tcp_fastopen_cookie foc = { .len = -1 };
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
+	tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
 	if (tp->rx_opt.saw_tstamp)
 		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -5762,30 +5469,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
 
-		if (cvp != NULL &&
-		    cvp->cookie_pair_size > 0 &&
-		    tp->rx_opt.cookie_plus > 0) {
-			int cookie_size = tp->rx_opt.cookie_plus
-					- TCPOLEN_COOKIE_BASE;
-			int cookie_pair_size = cookie_size
-					     + cvp->cookie_desired;
-
-			/* A cookie extension option was sent and returned.
-			 * Note that each incoming SYNACK replaces the
-			 * Responder cookie. The initial exchange is most
-			 * fragile, as protection against spoofing relies
-			 * entirely upon the sequence and timestamp (above).
-			 * This replacement strategy allows the correct pair to
-			 * pass through, while any others will be filtered via
-			 * Responder verification later.
-			 */
-			if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
-				memcpy(&cvp->cookie_pair[cvp->cookie_desired],
-				       hash_location, cookie_size);
-				cvp->cookie_pair_size = cookie_pair_size;
-			}
-		}
-
 		smp_mb();
 
 		tcp_finish_connect(sk, skb);
@@ -5986,7 +5669,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 	/* step 5: check the ACK field */
 	if (true) {
-		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
+		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+						  FLAG_UPDATE_TS_RECENT) > 0;
 
 		switch (sk->sk_state) {
 		case TCP_SYN_RECV:
@@ -6137,11 +5821,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
-	/* ts_recent update must be made after we are sure that the packet
-	 * is in window.
-	 */
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
 	/* step 6: check the URG bit */
 	tcp_urg(sk, skb, th);
 