Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--   net/ipv4/tcp_output.c | 324
1 file changed, 192 insertions(+), 132 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a42e873d44a..540b7d92cc70 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,40 +41,25 @@
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
+#include <linux/static_key.h>

-/* People can turn this off for buggy TCP's found in printers etc. */
-int sysctl_tcp_retrans_collapse __read_mostly = 1;
-
-/* People can turn this on to work with those rare, broken TCPs that
- * interpret the window field as a signed quantity.
- */
-int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
-
-/* Default TSQ limit of four TSO segments */
-int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
-
-/* This limits the percentage of the congestion window which we
- * will allow a single TSO frame to consume. Building TSO frames
- * which are too large can cause TCP streams to be bursty.
- */
-int sysctl_tcp_tso_win_divisor __read_mostly = 3;
-
-/* By default, RFC2861 behavior. */
-int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+#include <trace/events/tcp.h>

 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                            int push_one, gfp_t gfp);

 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         unsigned int prior_packets = tp->packets_out;

-        tcp_advance_send_head(sk, skb);
         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

+        __skb_unlink(skb, &sk->sk_write_queue);
+        tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
         tp->packets_out += tcp_skb_pcount(skb);
         if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
@@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss)
  * be a multiple of mss if possible. We assume here that mss >= 1.
  * This MUST be enforced by all callers.
  */
-void tcp_select_initial_window(int __space, __u32 mss,
+void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
                                __u32 *rcv_wnd, __u32 *window_clamp,
                                int wscale_ok, __u8 *rcv_wscale,
                                __u32 init_rcv_wnd)
@@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
          * which we interpret as a sign the remote TCP is not
          * misinterpreting the window field as a signed quantity.
          */
-        if (sysctl_tcp_workaround_signed_windows)
+        if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
                 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
         else
                 (*rcv_wnd) = space;
@@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
         (*rcv_wscale) = 0;
         if (wscale_ok) {
                 /* Set window scaling on max possible window */
-                space = max_t(u32, space, sysctl_tcp_rmem[2]);
+                space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
                 space = max_t(u32, space, sysctl_rmem_max);
                 space = min_t(u32, space, *window_clamp);
                 while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
@@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk)
         /* Make sure we do not exceed the maximum possible
          * scaled window.
          */
-        if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
+        if (!tp->rx_opt.rcv_wscale &&
+            sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
                 new_win = min(new_win, MAX_TCP_WINDOW);
         else
                 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
         skb->ip_summed = CHECKSUM_PARTIAL;
-        skb->csum = 0;

         TCP_SKB_CB(skb)->tcp_flags = flags;
         TCP_SKB_CB(skb)->sacked = 0;
@@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5              (1 << 2)
 #define OPTION_WSCALE           (1 << 3)
 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
+#define OPTION_SMC              (1 << 9)
+
+static void smc_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_SMC)
+        if (static_branch_unlikely(&tcp_have_smc)) {
+                if (unlikely(OPTION_SMC & *options)) {
+                        *ptr++ = htonl((TCPOPT_NOP << 24) |
+                                       (TCPOPT_NOP << 16) |
+                                       (TCPOPT_EXP << 8) |
+                                       (TCPOLEN_EXP_SMC_BASE));
+                        *ptr++ = htonl(TCPOPT_SMC_MAGIC);
+                }
+        }
+#endif
+}

 struct tcp_out_options {
         u16 options;            /* bit field of OPTION_* */
@@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                 }
                 ptr += (len + 3) >> 2;
         }
+
+        smc_options_write(ptr, &options);
+}
+
+static void smc_set_option(const struct tcp_sock *tp,
+                           struct tcp_out_options *opts,
+                           unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+        if (static_branch_unlikely(&tcp_have_smc)) {
+                if (tp->syn_smc) {
+                        if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                                opts->options |= OPTION_SMC;
+                                *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+                        }
+                }
+        }
+#endif
+}
+
+static void smc_set_option_cond(const struct tcp_sock *tp,
+                                const struct inet_request_sock *ireq,
+                                struct tcp_out_options *opts,
+                                unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+        if (static_branch_unlikely(&tcp_have_smc)) {
+                if (tp->syn_smc && ireq->smc_ok) {
+                        if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                                opts->options |= OPTION_SMC;
+                                *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+                        }
+                }
+        }
+#endif
 }

 /* Compute TCP options for SYN packets. This is not the final
@@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                 }
         }

+        smc_set_option(tp, opts, &remaining);
+
         return MAX_TCP_OPTION_SPACE - remaining;
 }

 /* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct request_sock *req,
+static unsigned int tcp_synack_options(const struct sock *sk,
+                                       struct request_sock *req,
                                        unsigned int mss, struct sk_buff *skb,
                                        struct tcp_out_options *opts,
                                        const struct tcp_md5sig_key *md5,
@@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
                 }
         }

+        smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
         return MAX_TCP_OPTION_SPACE - remaining;
 }

@@ -973,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
                       HRTIMER_MODE_ABS_PINNED);
 }

+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+        skb->skb_mstamp = tp->tcp_mstamp;
+        list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg(). This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1005,10 +1052,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                                                 - tp->snd_una;
                 oskb = skb;
-                if (unlikely(skb_cloned(skb)))
-                        skb = pskb_copy(skb, gfp_mask);
-                else
-                        skb = skb_clone(skb, gfp_mask);
+
+                tcp_skb_tsorted_save(oskb) {
+                        if (unlikely(skb_cloned(oskb)))
+                                skb = pskb_copy(oskb, gfp_mask);
+                        else
+                                skb = skb_clone(oskb, gfp_mask);
+                } tcp_skb_tsorted_restore(oskb);
+
                 if (unlikely(!skb))
                         return -ENOBUFS;
         }
@@ -1129,7 +1180,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 err = net_xmit_eval(err);
         }
         if (!err && oskb) {
-                oskb->skb_mstamp = tp->tcp_mstamp;
+                tcp_update_skb_after_send(tp, oskb);
                 tcp_rate_skb_sent(sk, oskb);
         }
         return err;
@@ -1167,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
         }
 }

-/* When a modification to fackets out becomes necessary, we need to check
- * skb is counted to fackets_out or not.
- */
-static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
-                                   int decr)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-
-        if (!tp->sacked_out || tcp_is_reno(tp))
-                return;
-
-        if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
-                tp->fackets_out -= decr;
-}
-
 /* Pcount in the middle of the write queue got changed, we need to do various
  * tweaks to fix counters
  */
@@ -1202,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
         if (tcp_is_reno(tp) && decr > 0)
                 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

-        tcp_adjust_fackets_out(sk, skb, decr);
-
         if (tp->lost_skb_hint &&
             before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
-            (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+            (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                 tp->lost_cnt_hint -= decr;

         tcp_verify_left_out(tp);
@@ -1241,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
         TCP_SKB_CB(skb)->eor = 0;
 }

+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+                                         struct sk_buff *buff,
+                                         struct sock *sk,
+                                         enum tcp_queue tcp_queue)
+{
+        if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+                __skb_queue_after(&sk->sk_write_queue, skb, buff);
+        else
+                tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                 struct sk_buff *skb, u32 len,
                  unsigned int mss_now, gfp_t gfp)
 {
         struct tcp_sock *tp = tcp_sk(sk);
@@ -1329,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,

         /* Link BUFF into the send queue. */
         __skb_header_release(buff);
-        tcp_insert_write_queue_after(skb, buff, sk);
+        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+        if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
+                list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

         return 0;
 }
@@ -1607,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
         if (tp->packets_out > tp->snd_cwnd_used)
                 tp->snd_cwnd_used = tp->packets_out;

-        if (sysctl_tcp_slow_start_after_idle &&
+        if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
             (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
             !ca_ops->cong_control)
                 tcp_cwnd_application_limited(sk);
@@ -1616,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
          * is caused by insufficient sender buffer:
          * 1) just sent some data (see tcp_write_xmit)
          * 2) not cwnd limited (this else condition)
-         * 3) no more data to send (null tcp_send_head )
+         * 3) no more data to send (tcp_write_queue_empty())
          * 4) application is hitting buffer limit (SOCK_NOSPACE)
          */
-        if (!tcp_send_head(sk) && sk->sk_socket &&
+        if (tcp_write_queue_empty(sk) && sk->sk_socket &&
             test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
             (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1671,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 {
         u32 bytes, segs;

-        bytes = min(sk->sk_pacing_rate >> 10,
+        bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
                     sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);

         /* Goal is to send at least one packet per ms,
@@ -1694,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
         u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;

         return tso_segs ? :
-                tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
+                tcp_tso_autosize(sk, mss_now,
+                                 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
 }

 /* Returns the portion of skb which can be sent right away */
@@ -1815,7 +1865,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                        struct sk_buff *skb, unsigned int len,
                         unsigned int mss_now, gfp_t gfp)
 {
         struct sk_buff *buff;
@@ -1824,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,

         /* All of a TSO frame must be composed of paged data. */
         if (skb->len != skb->data_len)
-                return tcp_fragment(sk, skb, len, mss_now, gfp);
+                return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);

         buff = sk_stream_alloc_skb(sk, 0, gfp, true);
         if (unlikely(!buff))
@@ -1860,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,

         /* Link BUFF into the send queue. */
         __skb_header_release(buff);
-        tcp_insert_write_queue_after(skb, buff, sk);
+        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);

         return 0;
 }
@@ -1910,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
                 goto send_now;

-        win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor);
+        win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
         if (win_divisor) {
                 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

@@ -1930,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                         goto send_now;
         }

-        head = tcp_write_queue_head(sk);
-
+        /* TODO : use tsorted_sent_queue ? */
+        head = tcp_rtx_queue_head(sk);
+        if (!head)
+                goto send_now;
         age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
         /* If next ACK is likely to come too late (half srtt), do not defer */
         if (age < (tp->srtt_us >> 4))
@@ -2145,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 {
         unsigned int limit;

-        limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
-        limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+        limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
+        limit = min_t(u32, limit,
+                      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
         limit <<= factor;

         if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-                /* Always send the 1st or 2nd skb in write queue.
+                /* Always send skb if rtx queue is empty.
                  * No need to wait for TX completion to call us back,
                  * after softirq/tasklet schedule.
                  * This helps when TX completions are delayed too much.
                  */
-                if (skb == sk->sk_write_queue.next ||
-                    skb->prev == sk->sk_write_queue.next)
+                if (tcp_rtx_queue_empty(sk))
                         return false;

                 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2207,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
          * it's the "most interesting" or current chrono we are
          * tracking and starts busy chrono if we have pending data.
          */
-        if (tcp_write_queue_empty(sk))
+        if (tcp_rtx_and_write_queues_empty(sk))
                 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
         else if (type == tp->chrono_type)
                 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2263,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,

                 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
                         /* "skb_mstamp" is used as a start point for the retransmit timer */
-                        skb->skb_mstamp = tp->tcp_mstamp;
+                        tcp_update_skb_after_send(tp, skb);
                         goto repair; /* Skip network transmission */
                 }

@@ -2302,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                           nonagle);

                 if (skb->len > limit &&
-                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                    unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                          skb, limit, mss_now, gfp)))
                         break;

                 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2342,7 +2396,7 @@ repair:
                 tcp_cwnd_validate(sk, is_cwnd_limited);
                 return false;
         }
-        return !tp->packets_out && tcp_send_head(sk);
+        return !tp->packets_out && !tcp_write_queue_empty(sk);
 }

 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2350,6 +2404,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         u32 timeout, rto_delta_us;
+        int early_retrans;

         /* Don't do any loss probe on a Fast Open connection before 3WHS
          * finishes.
@@ -2357,16 +2412,17 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         if (tp->fastopen_rsk)
                 return false;

+        early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
         /* Schedule a loss probe in 2*RTT for SACK capable connections
          * in Open state, that are either limited by cwnd or application.
          */
-        if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+        if ((early_retrans != 3 && early_retrans != 4) ||
             !tp->packets_out || !tcp_is_sack(tp) ||
             icsk->icsk_ca_state != TCP_CA_Open)
                 return false;

         if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-             tcp_send_head(sk))
+            !tcp_write_queue_empty(sk))
                 return false;

         /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2419,18 +2475,14 @@ void tcp_send_loss_probe(struct sock *sk)
         int mss = tcp_current_mss(sk);

         skb = tcp_send_head(sk);
-        if (skb) {
-                if (tcp_snd_wnd_test(tp, skb, mss)) {
-                        pcount = tp->packets_out;
-                        tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-                        if (tp->packets_out > pcount)
-                                goto probe_sent;
-                        goto rearm_timer;
-                }
-                skb = tcp_write_queue_prev(sk, skb);
-        } else {
-                skb = tcp_write_queue_tail(sk);
+        if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+                pcount = tp->packets_out;
+                tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+                if (tp->packets_out > pcount)
+                        goto probe_sent;
+                goto rearm_timer;
         }
+        skb = skb_rb_last(&sk->tcp_rtx_queue);

         /* At most one outstanding TLP retransmission. */
         if (tp->tlp_high_seq)
@@ -2448,10 +2500,11 @@ void tcp_send_loss_probe(struct sock *sk)
                 goto rearm_timer;

         if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+                if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                          (pcount - 1) * mss, mss,
                                           GFP_ATOMIC)))
                         goto rearm_timer;
-                skb = tcp_write_queue_next(sk, skb);
+                skb = skb_rb_next(skb);
         }

         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2651,7 +2704,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+        struct sk_buff *next_skb = skb_rb_next(skb);
         int skb_size, next_skb_size;

         skb_size = skb->len;
@@ -2668,8 +2721,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
         }
         tcp_highest_sack_replace(sk, next_skb, skb);

-        tcp_unlink_write_queue(next_skb, sk);
-
         if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                 skb->ip_summed = CHECKSUM_PARTIAL;

@@ -2697,7 +2748,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)

         tcp_skb_collapse_tstamp(skb, next_skb);

-        sk_wmem_free_skb(sk, next_skb);
+        tcp_rtx_queue_unlink_and_free(next_skb, sk);
         return true;
 }

@@ -2708,8 +2759,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
                 return false;
         if (skb_cloned(skb))
                 return false;
-        if (skb == tcp_send_head(sk))
-                return false;
         /* Some heuristics for collapsing over SACK'd could be invented */
         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                 return false;
@@ -2727,12 +2776,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
         struct sk_buff *skb = to, *tmp;
         bool first = true;

-        if (!sysctl_tcp_retrans_collapse)
+        if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
                 return;
         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                 return;

-        tcp_for_write_queue_from_safe(skb, tmp, sk) {
+        skb_rbtree_walk_from_safe(skb, tmp) {
                 if (!tcp_can_collapse(sk, skb))
                         break;

@@ -2807,7 +2856,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)

         len = cur_mss * segs;
         if (skb->len > len) {
-                if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+                if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+                                 cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
                 if (skb_unclone(skb, GFP_ATOMIC))
@@ -2841,11 +2891,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
                      skb_headroom(skb) >= 0xFFFF)) {
                 struct sk_buff *nskb;

-                nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-                err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-                             -ENOBUFS;
+                tcp_skb_tsorted_save(skb) {
+                        nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+                        err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+                                     -ENOBUFS;
+                } tcp_skb_tsorted_restore(skb);
+
                 if (!err) {
-                        skb->skb_mstamp = tp->tcp_mstamp;
+                        tcp_update_skb_after_send(tp, skb);
                         tcp_rate_skb_sent(sk, skb);
                 }
         } else {
@@ -2854,6 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)

         if (likely(!err)) {
                 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+                trace_tcp_retransmit_skb(sk, skb);
         } else if (err != -EBUSY) {
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
         }
@@ -2890,36 +2944,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
  * retransmitted data is acknowledged. It tries to continue
  * resending the rest of the retransmit queue, until either
  * we've sent it all or the congestion window limit is reached.
- * If doing SACK, the first ACK which comes back for a timeout
- * based retransmit packet might feed us FACK information again.
- * If so, we use it to avoid unnecessarily retransmissions.
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
         const struct inet_connection_sock *icsk = inet_csk(sk);
+        struct sk_buff *skb, *rtx_head, *hole = NULL;
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb;
-        struct sk_buff *hole = NULL;
         u32 max_segs;
         int mib_idx;

         if (!tp->packets_out)
                 return;

-        if (tp->retransmit_skb_hint) {
-                skb = tp->retransmit_skb_hint;
-        } else {
-                skb = tcp_write_queue_head(sk);
-        }
-
+        rtx_head = tcp_rtx_queue_head(sk);
+        skb = tp->retransmit_skb_hint ?: rtx_head;
         max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-        tcp_for_write_queue_from(skb, sk) {
+        skb_rbtree_walk_from(skb) {
                 __u8 sacked;
                 int segs;

-                if (skb == tcp_send_head(sk))
-                        break;
-
                 if (tcp_pacing_check(sk))
                         break;

@@ -2964,7 +3007,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (tcp_in_cwnd_reduction(sk))
                         tp->prr_out += tcp_skb_pcount(skb);

-                if (skb == tcp_write_queue_head(sk) &&
+                if (skb == rtx_head &&
                     icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                   inet_csk(sk)->icsk_rto,
@@ -3006,12 +3049,15 @@ void tcp_send_fin(struct sock *sk)
          * Note: in the latter case, FIN packet will be sent after a timeout,
          * as TCP stack thinks it has already been transmitted.
          */
-        if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+        if (!tskb && tcp_under_memory_pressure(sk))
+                tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+        if (tskb) {
 coalesce:
                 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
                 TCP_SKB_CB(tskb)->end_seq++;
                 tp->write_seq++;
-                if (!tcp_send_head(sk)) {
+                if (tcp_write_queue_empty(sk)) {
                         /* This means tskb was already sent.
                          * Pretend we included the FIN on previous transmit.
                          * We need to set tp->snd_nxt to the value it would have
@@ -3028,6 +3074,7 @@ coalesce:
                         goto coalesce;
                 return;
         }
+        INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
         skb_reserve(skb, MAX_TCP_HEADER);
         sk_forced_mem_schedule(sk, skb->truesize);
         /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3064,6 +3111,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
         /* Send it off. */
         if (tcp_transmit_skb(sk, skb, 0, priority))
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+        /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
+         * skb here is different to the troublesome skb, so use NULL
+         */
+        trace_tcp_send_reset(sk, NULL);
 }

 /* Send a crossed SYN-ACK during socket establishment.
@@ -3076,20 +3128,24 @@ int tcp_send_synack(struct sock *sk)
 {
         struct sk_buff *skb;

-        skb = tcp_write_queue_head(sk);
+        skb = tcp_rtx_queue_head(sk);
         if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-                pr_debug("%s: wrong queue state\n", __func__);
+                pr_err("%s: wrong queue state\n", __func__);
                 return -EFAULT;
         }
         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                 if (skb_cloned(skb)) {
-                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+                        struct sk_buff *nskb;
+
+                        tcp_skb_tsorted_save(skb) {
+                                nskb = skb_copy(skb, GFP_ATOMIC);
+                        } tcp_skb_tsorted_restore(skb);
                         if (!nskb)
                                 return -ENOMEM;
-                        tcp_unlink_write_queue(skb, sk);
+                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+                        tcp_rtx_queue_unlink_and_free(skb, sk);
                         __skb_header_release(nskb);
-                        __tcp_add_write_queue_head(sk, nskb);
-                        sk_wmem_free_skb(sk, skb);
+                        tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                         sk->sk_wmem_queued += nskb->truesize;
                         sk_mem_charge(sk, nskb->truesize);
                         skb = nskb;
@@ -3166,8 +3222,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
         md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
         skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
-        tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
-                          sizeof(*th);
+        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+                                             foc) + sizeof(*th);

         skb_push(skb, tcp_header_size);
         skb_reset_transport_header(skb);
@@ -3268,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk)
         if (rcv_wnd == 0)
                 rcv_wnd = dst_metric(dst, RTAX_INITRWND);

-        tcp_select_initial_window(tcp_full_space(sk),
+        tcp_select_initial_window(sk, tcp_full_space(sk),
                                   tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                   &tp->rcv_wnd,
                                   &tp->window_clamp,
@@ -3307,7 +3363,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)

         tcb->end_seq += skb->len;
         __skb_header_release(skb);
-        __tcp_add_write_queue_tail(sk, skb);
         sk->sk_wmem_queued += skb->truesize;
         sk_mem_charge(sk, skb->truesize);
         tp->write_seq = tcb->end_seq;
@@ -3355,6 +3410,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
                 int copied = copy_from_iter(skb_put(syn_data, space), space,
                                             &fo->data->msg_iter);
                 if (unlikely(!copied)) {
+                        tcp_skb_tsorted_anchor_cleanup(syn_data);
                         kfree_skb(syn_data);
                         goto fallback;
                 }
@@ -3385,12 +3441,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
         if (!err) {
                 tp->syn_data = (fo->copied > 0);
+                tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                 goto done;
         }

-        /* data was not sent, this is our new send_head */
-        sk->sk_send_head = syn_data;
+        /* data was not sent, put it in write_queue */
+        __skb_queue_tail(&sk->sk_write_queue, syn_data);
         tp->packets_out -= tcp_skb_pcount(syn_data);

 fallback:
@@ -3433,6 +3490,7 @@ int tcp_connect(struct sock *sk)
         tp->retrans_stamp = tcp_time_stamp(tp);
         tcp_connect_queue_skb(sk, buff);
         tcp_ecn_send_syn(sk, buff);
+        tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

         /* Send off SYN; include data in Fast Open. */
         err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3627,7 +3685,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
                     skb->len > mss) {
                         seg_size = min(seg_size, mss);
                         TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                        if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+                        if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                         skb, seg_size, mss, GFP_ATOMIC))
                                 return -1;
                 } else if (!tcp_skb_pcount(skb))
                         tcp_set_skb_tso_segs(skb, mss);
@@ -3657,7 +3716,7 @@ void tcp_send_probe0(struct sock *sk)

         err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

-        if (tp->packets_out || !tcp_send_head(sk)) {
+        if (tp->packets_out || tcp_write_queue_empty(sk)) {
                 /* Cancel probe timer, if it is not required. */
                 icsk->icsk_probes_out = 0;
                 icsk->icsk_backoff = 0;
@@ -3698,6 +3757,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
                 if (unlikely(tcp_passive_fastopen(sk)))
                         tcp_sk(sk)->total_retrans++;
+                trace_tcp_retransmit_synack(sk, req);
         }
         return res;
 }
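
Not part of the patch above: a minimal userspace sketch of the queue split this diff introduces, in which segments that have not yet been transmitted stay on a plain FIFO write queue while transmitted-but-unacknowledged segments live in a structure ordered by sequence number (an rb-tree keyed by seq in the kernel; modeled here with a sorted singly linked list purely for brevity). Every name below is illustrative and none of it is kernel API.

#include <stdio.h>
#include <stdlib.h>

struct seg {
        unsigned int seq;      /* first sequence number in the segment */
        unsigned int end_seq;  /* one past the last sequence number */
        struct seg *next;
};

static struct seg *write_q;    /* FIFO: queued by the app, not yet sent */
static struct seg *rtx_q;      /* seq-ordered: sent, not yet acked */

/* Append new application data to the tail of the write queue. */
static void queue_data(unsigned int seq, unsigned int len)
{
        struct seg *s = malloc(sizeof(*s)), **p = &write_q;

        s->seq = seq;
        s->end_seq = seq + len;
        s->next = NULL;
        while (*p)
                p = &(*p)->next;
        *p = s;
}

/* "Transmit" the head of the write queue: unlink it and insert it in
 * seq order into the retransmit structure, mirroring the change in
 * tcp_event_new_data_sent() from tcp_advance_send_head() to
 * __skb_unlink() + tcp_rbtree_insert().
 */
static void xmit_one(void)
{
        struct seg *s = write_q, **p = &rtx_q;

        if (!s)
                return;
        write_q = s->next;
        while (*p && (*p)->seq < s->seq)
                p = &(*p)->next;
        s->next = *p;
        *p = s;
}

/* Cumulative ACK: free every fully acknowledged segment. */
static void ack_upto(unsigned int ack)
{
        while (rtx_q && rtx_q->end_seq <= ack) {
                struct seg *s = rtx_q;

                rtx_q = s->next;
                free(s);
        }
}

int main(void)
{
        queue_data(1, 100);
        queue_data(101, 100);
        xmit_one();
        xmit_one();
        ack_upto(101);  /* first segment leaves the retransmit structure */
        printf("oldest unacked seq: %u\n", rtx_q ? rtx_q->seq : 0);
        ack_upto(201);
        return 0;
}

The kernel keeps the sent-but-unacked set in an rb-tree rather than a list so that walks and lookups by sequence number stay logarithmic even with very large windows; the sorted list above only mimics the ordering, not that complexity.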