path: root/net/ipv4/tcp_output.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2017-11-15 14:56:19 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-11-15 14:56:19 -0500
commit    5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch)
tree      3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /net/ipv4/tcp_output.c
parent    892204e06cb9e89fbc4b299a678f9ca358e97cac (diff)
parent    50895b9de1d3e0258e015e8e55128d835d9a9f19 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Highlights:

   1) Maintain the TCP retransmit queue using an rbtree, with 1GB windows at 100Gb this really has become necessary. From Eric Dumazet.
   2) Multi-program support for cgroup+bpf, from Alexei Starovoitov.
   3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew Lunn.
   4) Add meter action support to openvswitch, from Andy Zhou.
   5) Add a data meta pointer for BPF accessible packets, from Daniel Borkmann.
   6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet.
   7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli.
   8) More work to move the RTNL mutex down, from Florian Westphal.
   9) Add 'bpftool' utility, to help with bpf program introspection. From Jakub Kicinski.
  10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper Dangaard Brouer.
  11) Support 'blocks' of transformations in the packet scheduler which can span multiple network devices, from Jiri Pirko.
  12) TC flower offload support in cxgb4, from Kumar Sanghvi.
  13) Priority based stream scheduler for SCTP, from Marcelo Ricardo Leitner.
  14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg.
  15) Add RED qdisc offloadability, and use it in mlxsw driver. From Nogah Frankel.
  16) eBPF based device controller for cgroup v2, from Roman Gushchin.
  17) Add some fundamental tracepoints for TCP, from Song Liu.
  18) Remove garbage collection from ipv6 route layer, this is a significant accomplishment. From Wei Wang.
  19) Add multicast route offload support to mlxsw, from Yotam Gigi"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits)
  tcp: highest_sack fix
  geneve: fix fill_info when link down
  bpf: fix lockdep splat
  net: cdc_ncm: GetNtbFormat endian fix
  openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start
  netem: remove unnecessary 64 bit modulus
  netem: use 64 bit divide by rate
  tcp: Namespace-ify sysctl_tcp_default_congestion_control
  net: Protect iterations over net::fib_notifier_ops in fib_seq_sum()
  ipv6: set all.accept_dad to 0 by default
  uapi: fix linux/tls.h userspace compilation error
  usbnet: ipheth: prevent TX queue timeouts when device not ready
  vhost_net: conditionally enable tx polling
  uapi: fix linux/rxrpc.h userspace compilation errors
  net: stmmac: fix LPI transitioning for dwmac4
  atm: horizon: Fix irq release error
  net-sysfs: trigger netlink notification on ifalias change via sysfs
  openvswitch: Using kfree_rcu() to simplify the code
  openvswitch: Make local function ovs_nsh_key_attr_size() static
  openvswitch: Fix return value check in ovs_meter_cmd_features()
  ...
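Highlight 1 is the change that dominates the tcp_output.c diff on this page: the retransmit queue moves out of the linear socket write queue into an rbtree keyed by sequence number, so lookups and removals stay logarithmic even with gigabyte-sized windows in flight. What follows is a minimal userspace sketch of that idea only, not the kernel code: it uses POSIX tsearch()/tfind()/tdelete() from <search.h> (a balanced binary tree in glibc) in place of the kernel's rb_root/rb_node API, and the names rtx_skb, rtx_insert, rtx_lookup and rtx_erase are invented for illustration.

/*
 * Userspace sketch (not kernel code): a retransmit queue keyed by TCP
 * sequence number, held in a balanced tree so insertion, lookup of the
 * segment starting at a given sequence, and removal on ACK are O(log n)
 * instead of the O(n) walks a linear list needs.
 */
#include <search.h>
#include <stdint.h>
#include <stdio.h>

struct rtx_skb {
	uint32_t start_seq;	/* first sequence number in the segment */
	uint32_t end_seq;	/* one past the last sequence number */
};

/* Wraparound-safe comparison of 32-bit sequence numbers. */
static int seq_cmp(const void *a, const void *b)
{
	const struct rtx_skb *x = a, *y = b;
	int32_t d = (int32_t)(x->start_seq - y->start_seq);

	return (d > 0) - (d < 0);
}

static void *rtx_root;		/* root of the balanced tree */

static void rtx_insert(struct rtx_skb *skb)
{
	tsearch(skb, &rtx_root, seq_cmp);
}

static struct rtx_skb *rtx_lookup(uint32_t seq)
{
	struct rtx_skb key = { .start_seq = seq };
	void **node = tfind(&key, &rtx_root, seq_cmp);

	return node ? *(struct rtx_skb **)node : NULL;
}

static void rtx_erase(struct rtx_skb *skb)
{
	tdelete(skb, &rtx_root, seq_cmp);
}

int main(void)
{
	struct rtx_skb a = { 1000, 2448 }, b = { 2448, 3896 };

	rtx_insert(&a);
	rtx_insert(&b);

	struct rtx_skb *hit = rtx_lookup(2448);
	if (hit)
		printf("found segment [%u, %u)\n", hit->start_seq, hit->end_seq);

	rtx_erase(&a);		/* e.g. the segment was fully ACKed */
	return 0;
}

Because the tree is ordered by sequence number rather than send time, the diff below also threads each sent skb onto a tsorted_sent_queue list (see tcp_update_skb_after_send() and the tcp_skb_tsorted_save/restore pairs) so code that still needs send-time order has it.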
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 324
1 file changed, 192 insertions(+), 132 deletions(-)
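A second pattern that recurs throughout the hunks below is highlight 6, the namespace-ification of the TCP sysctls: file-scope globals such as sysctl_tcp_retrans_collapse become per-network-namespace fields reached through sock_net(sk)->ipv4.*. A rough userspace sketch of the shape of that change follows; the stand-in types net_ctx, sock_ctx and netns_ipv4_cfg are invented for illustration and are not the kernel's struct net, struct sock or struct netns_ipv4.

#include <stdio.h>

struct netns_ipv4_cfg {		/* stand-in for the per-namespace knobs */
	int sysctl_tcp_retrans_collapse;
	int sysctl_tcp_slow_start_after_idle;
};

struct net_ctx {		/* stand-in for struct net */
	struct netns_ipv4_cfg ipv4;
};

struct sock_ctx {		/* stand-in for struct sock */
	struct net_ctx *net;
};

/* Analogue of sock_net(sk): the namespace a socket belongs to. */
static struct net_ctx *sock_net_of(const struct sock_ctx *sk)
{
	return sk->net;
}

static int retrans_collapse_enabled(const struct sock_ctx *sk)
{
	/* was: return sysctl_tcp_retrans_collapse;   (one global for all) */
	return sock_net_of(sk)->ipv4.sysctl_tcp_retrans_collapse;
}

int main(void)
{
	struct net_ctx init_net = { .ipv4 = { 1, 1 } };
	struct net_ctx container_net = { .ipv4 = { 0, 1 } };
	struct sock_ctx a = { &init_net }, b = { &container_net };

	printf("collapse: host=%d container=%d\n",
	       retrans_collapse_enabled(&a), retrans_collapse_enabled(&b));
	return 0;
}

The point of the extra indirection is that two sockets in different namespaces can see different tunable values while running exactly the same code path.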
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a42e873d44a..540b7d92cc70 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,40 +41,25 @@
41#include <linux/compiler.h> 41#include <linux/compiler.h>
42#include <linux/gfp.h> 42#include <linux/gfp.h>
43#include <linux/module.h> 43#include <linux/module.h>
44#include <linux/static_key.h>
44 45
45/* People can turn this off for buggy TCP's found in printers etc. */ 46#include <trace/events/tcp.h>
46int sysctl_tcp_retrans_collapse __read_mostly = 1;
47
48/* People can turn this on to work with those rare, broken TCPs that
49 * interpret the window field as a signed quantity.
50 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52
53/* Default TSQ limit of four TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
55
56/* This limits the percentage of the congestion window which we
57 * will allow a single TSO frame to consume. Building TSO frames
58 * which are too large can cause TCP streams to be bursty.
59 */
60int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61
62/* By default, RFC2861 behavior. */
63int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
64 47
65static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 48static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
66 int push_one, gfp_t gfp); 49 int push_one, gfp_t gfp);
67 50
68/* Account for new data that has been sent to the network. */ 51/* Account for new data that has been sent to the network. */
69static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 52static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
70{ 53{
71 struct inet_connection_sock *icsk = inet_csk(sk); 54 struct inet_connection_sock *icsk = inet_csk(sk);
72 struct tcp_sock *tp = tcp_sk(sk); 55 struct tcp_sock *tp = tcp_sk(sk);
73 unsigned int prior_packets = tp->packets_out; 56 unsigned int prior_packets = tp->packets_out;
74 57
75 tcp_advance_send_head(sk, skb);
76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 58 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
77 59
60 __skb_unlink(skb, &sk->sk_write_queue);
61 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
62
78 tp->packets_out += tcp_skb_pcount(skb); 63 tp->packets_out += tcp_skb_pcount(skb);
79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 64 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
80 tcp_rearm_rto(sk); 65 tcp_rearm_rto(sk);
@@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss)
203 * be a multiple of mss if possible. We assume here that mss >= 1. 188 * be a multiple of mss if possible. We assume here that mss >= 1.
204 * This MUST be enforced by all callers. 189 * This MUST be enforced by all callers.
205 */ 190 */
206void tcp_select_initial_window(int __space, __u32 mss, 191void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
207 __u32 *rcv_wnd, __u32 *window_clamp, 192 __u32 *rcv_wnd, __u32 *window_clamp,
208 int wscale_ok, __u8 *rcv_wscale, 193 int wscale_ok, __u8 *rcv_wscale,
209 __u32 init_rcv_wnd) 194 __u32 init_rcv_wnd)
@@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
227 * which we interpret as a sign the remote TCP is not 212 * which we interpret as a sign the remote TCP is not
228 * misinterpreting the window field as a signed quantity. 213 * misinterpreting the window field as a signed quantity.
229 */ 214 */
230 if (sysctl_tcp_workaround_signed_windows) 215 if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
231 (*rcv_wnd) = min(space, MAX_TCP_WINDOW); 216 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
232 else 217 else
233 (*rcv_wnd) = space; 218 (*rcv_wnd) = space;
@@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
235 (*rcv_wscale) = 0; 220 (*rcv_wscale) = 0;
236 if (wscale_ok) { 221 if (wscale_ok) {
237 /* Set window scaling on max possible window */ 222 /* Set window scaling on max possible window */
238 space = max_t(u32, space, sysctl_tcp_rmem[2]); 223 space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
239 space = max_t(u32, space, sysctl_rmem_max); 224 space = max_t(u32, space, sysctl_rmem_max);
240 space = min_t(u32, space, *window_clamp); 225 space = min_t(u32, space, *window_clamp);
241 while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { 226 while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
@@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk)
287 /* Make sure we do not exceed the maximum possible 272 /* Make sure we do not exceed the maximum possible
288 * scaled window. 273 * scaled window.
289 */ 274 */
290 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) 275 if (!tp->rx_opt.rcv_wscale &&
276 sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
291 new_win = min(new_win, MAX_TCP_WINDOW); 277 new_win = min(new_win, MAX_TCP_WINDOW);
292 else 278 else
293 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); 279 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
395static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) 381static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
396{ 382{
397 skb->ip_summed = CHECKSUM_PARTIAL; 383 skb->ip_summed = CHECKSUM_PARTIAL;
398 skb->csum = 0;
399 384
400 TCP_SKB_CB(skb)->tcp_flags = flags; 385 TCP_SKB_CB(skb)->tcp_flags = flags;
401 TCP_SKB_CB(skb)->sacked = 0; 386 TCP_SKB_CB(skb)->sacked = 0;
@@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
418#define OPTION_MD5 (1 << 2) 403#define OPTION_MD5 (1 << 2)
419#define OPTION_WSCALE (1 << 3) 404#define OPTION_WSCALE (1 << 3)
420#define OPTION_FAST_OPEN_COOKIE (1 << 8) 405#define OPTION_FAST_OPEN_COOKIE (1 << 8)
406#define OPTION_SMC (1 << 9)
407
408static void smc_options_write(__be32 *ptr, u16 *options)
409{
410#if IS_ENABLED(CONFIG_SMC)
411 if (static_branch_unlikely(&tcp_have_smc)) {
412 if (unlikely(OPTION_SMC & *options)) {
413 *ptr++ = htonl((TCPOPT_NOP << 24) |
414 (TCPOPT_NOP << 16) |
415 (TCPOPT_EXP << 8) |
416 (TCPOLEN_EXP_SMC_BASE));
417 *ptr++ = htonl(TCPOPT_SMC_MAGIC);
418 }
419 }
420#endif
421}
421 422
422struct tcp_out_options { 423struct tcp_out_options {
423 u16 options; /* bit field of OPTION_* */ 424 u16 options; /* bit field of OPTION_* */
@@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
536 } 537 }
537 ptr += (len + 3) >> 2; 538 ptr += (len + 3) >> 2;
538 } 539 }
540
541 smc_options_write(ptr, &options);
542}
543
544static void smc_set_option(const struct tcp_sock *tp,
545 struct tcp_out_options *opts,
546 unsigned int *remaining)
547{
548#if IS_ENABLED(CONFIG_SMC)
549 if (static_branch_unlikely(&tcp_have_smc)) {
550 if (tp->syn_smc) {
551 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
552 opts->options |= OPTION_SMC;
553 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
554 }
555 }
556 }
557#endif
558}
559
560static void smc_set_option_cond(const struct tcp_sock *tp,
561 const struct inet_request_sock *ireq,
562 struct tcp_out_options *opts,
563 unsigned int *remaining)
564{
565#if IS_ENABLED(CONFIG_SMC)
566 if (static_branch_unlikely(&tcp_have_smc)) {
567 if (tp->syn_smc && ireq->smc_ok) {
568 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
569 opts->options |= OPTION_SMC;
570 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
571 }
572 }
573 }
574#endif
539} 575}
540 576
541/* Compute TCP options for SYN packets. This is not the final 577/* Compute TCP options for SYN packets. This is not the final
@@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
603 } 639 }
604 } 640 }
605 641
642 smc_set_option(tp, opts, &remaining);
643
606 return MAX_TCP_OPTION_SPACE - remaining; 644 return MAX_TCP_OPTION_SPACE - remaining;
607} 645}
608 646
609/* Set up TCP options for SYN-ACKs. */ 647/* Set up TCP options for SYN-ACKs. */
610static unsigned int tcp_synack_options(struct request_sock *req, 648static unsigned int tcp_synack_options(const struct sock *sk,
649 struct request_sock *req,
611 unsigned int mss, struct sk_buff *skb, 650 unsigned int mss, struct sk_buff *skb,
612 struct tcp_out_options *opts, 651 struct tcp_out_options *opts,
613 const struct tcp_md5sig_key *md5, 652 const struct tcp_md5sig_key *md5,
@@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
663 } 702 }
664 } 703 }
665 704
705 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
706
666 return MAX_TCP_OPTION_SPACE - remaining; 707 return MAX_TCP_OPTION_SPACE - remaining;
667} 708}
668 709
@@ -973,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
973 HRTIMER_MODE_ABS_PINNED); 1014 HRTIMER_MODE_ABS_PINNED);
974} 1015}
975 1016
1017static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
1018{
1019 skb->skb_mstamp = tp->tcp_mstamp;
1020 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1021}
1022
976/* This routine actually transmits TCP packets queued in by 1023/* This routine actually transmits TCP packets queued in by
977 * tcp_do_sendmsg(). This is used by both the initial 1024 * tcp_do_sendmsg(). This is used by both the initial
978 * transmission and possible later retransmissions. 1025 * transmission and possible later retransmissions.
@@ -1005,10 +1052,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1005 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq 1052 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
1006 - tp->snd_una; 1053 - tp->snd_una;
1007 oskb = skb; 1054 oskb = skb;
1008 if (unlikely(skb_cloned(skb))) 1055
1009 skb = pskb_copy(skb, gfp_mask); 1056 tcp_skb_tsorted_save(oskb) {
1010 else 1057 if (unlikely(skb_cloned(oskb)))
1011 skb = skb_clone(skb, gfp_mask); 1058 skb = pskb_copy(oskb, gfp_mask);
1059 else
1060 skb = skb_clone(oskb, gfp_mask);
1061 } tcp_skb_tsorted_restore(oskb);
1062
1012 if (unlikely(!skb)) 1063 if (unlikely(!skb))
1013 return -ENOBUFS; 1064 return -ENOBUFS;
1014 } 1065 }
@@ -1129,7 +1180,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1129 err = net_xmit_eval(err); 1180 err = net_xmit_eval(err);
1130 } 1181 }
1131 if (!err && oskb) { 1182 if (!err && oskb) {
1132 oskb->skb_mstamp = tp->tcp_mstamp; 1183 tcp_update_skb_after_send(tp, oskb);
1133 tcp_rate_skb_sent(sk, oskb); 1184 tcp_rate_skb_sent(sk, oskb);
1134 } 1185 }
1135 return err; 1186 return err;
@@ -1167,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1167 } 1218 }
1168} 1219}
1169 1220
1170/* When a modification to fackets out becomes necessary, we need to check
1171 * skb is counted to fackets_out or not.
1172 */
1173static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
1174 int decr)
1175{
1176 struct tcp_sock *tp = tcp_sk(sk);
1177
1178 if (!tp->sacked_out || tcp_is_reno(tp))
1179 return;
1180
1181 if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
1182 tp->fackets_out -= decr;
1183}
1184
1185/* Pcount in the middle of the write queue got changed, we need to do various 1221/* Pcount in the middle of the write queue got changed, we need to do various
1186 * tweaks to fix counters 1222 * tweaks to fix counters
1187 */ 1223 */
@@ -1202,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
1202 if (tcp_is_reno(tp) && decr > 0) 1238 if (tcp_is_reno(tp) && decr > 0)
1203 tp->sacked_out -= min_t(u32, tp->sacked_out, decr); 1239 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1204 1240
1205 tcp_adjust_fackets_out(sk, skb, decr);
1206
1207 if (tp->lost_skb_hint && 1241 if (tp->lost_skb_hint &&
1208 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && 1242 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1209 (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) 1243 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1210 tp->lost_cnt_hint -= decr; 1244 tp->lost_cnt_hint -= decr;
1211 1245
1212 tcp_verify_left_out(tp); 1246 tcp_verify_left_out(tp);
@@ -1241,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1241 TCP_SKB_CB(skb)->eor = 0; 1275 TCP_SKB_CB(skb)->eor = 0;
1242} 1276}
1243 1277
1278/* Insert buff after skb on the write or rtx queue of sk. */
1279static void tcp_insert_write_queue_after(struct sk_buff *skb,
1280 struct sk_buff *buff,
1281 struct sock *sk,
1282 enum tcp_queue tcp_queue)
1283{
1284 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1285 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1286 else
1287 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1288}
1289
1244/* Function to create two new TCP segments. Shrinks the given segment 1290/* Function to create two new TCP segments. Shrinks the given segment
1245 * to the specified size and appends a new segment with the rest of the 1291 * to the specified size and appends a new segment with the rest of the
1246 * packet to the list. This won't be called frequently, I hope. 1292 * packet to the list. This won't be called frequently, I hope.
1247 * Remember, these are still headerless SKBs at this point. 1293 * Remember, these are still headerless SKBs at this point.
1248 */ 1294 */
1249int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, 1295int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1296 struct sk_buff *skb, u32 len,
1250 unsigned int mss_now, gfp_t gfp) 1297 unsigned int mss_now, gfp_t gfp)
1251{ 1298{
1252 struct tcp_sock *tp = tcp_sk(sk); 1299 struct tcp_sock *tp = tcp_sk(sk);
@@ -1329,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1329 1376
1330 /* Link BUFF into the send queue. */ 1377 /* Link BUFF into the send queue. */
1331 __skb_header_release(buff); 1378 __skb_header_release(buff);
1332 tcp_insert_write_queue_after(skb, buff, sk); 1379 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1380 if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1381 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1333 1382
1334 return 0; 1383 return 0;
1335} 1384}
@@ -1607,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1607 if (tp->packets_out > tp->snd_cwnd_used) 1656 if (tp->packets_out > tp->snd_cwnd_used)
1608 tp->snd_cwnd_used = tp->packets_out; 1657 tp->snd_cwnd_used = tp->packets_out;
1609 1658
1610 if (sysctl_tcp_slow_start_after_idle && 1659 if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1611 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && 1660 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1612 !ca_ops->cong_control) 1661 !ca_ops->cong_control)
1613 tcp_cwnd_application_limited(sk); 1662 tcp_cwnd_application_limited(sk);
@@ -1616,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1616 * is caused by insufficient sender buffer: 1665 * is caused by insufficient sender buffer:
1617 * 1) just sent some data (see tcp_write_xmit) 1666 * 1) just sent some data (see tcp_write_xmit)
1618 * 2) not cwnd limited (this else condition) 1667 * 2) not cwnd limited (this else condition)
1619 * 3) no more data to send (null tcp_send_head ) 1668 * 3) no more data to send (tcp_write_queue_empty())
1620 * 4) application is hitting buffer limit (SOCK_NOSPACE) 1669 * 4) application is hitting buffer limit (SOCK_NOSPACE)
1621 */ 1670 */
1622 if (!tcp_send_head(sk) && sk->sk_socket && 1671 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1623 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && 1672 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1624 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1673 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1625 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); 1674 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1671,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1671{ 1720{
1672 u32 bytes, segs; 1721 u32 bytes, segs;
1673 1722
1674 bytes = min(sk->sk_pacing_rate >> 10, 1723 bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
1675 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); 1724 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1676 1725
1677 /* Goal is to send at least one packet per ms, 1726 /* Goal is to send at least one packet per ms,
@@ -1694,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1694 u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; 1743 u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
1695 1744
1696 return tso_segs ? : 1745 return tso_segs ? :
1697 tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); 1746 tcp_tso_autosize(sk, mss_now,
1747 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
1698} 1748}
1699 1749
1700/* Returns the portion of skb which can be sent right away */ 1750/* Returns the portion of skb which can be sent right away */
@@ -1815,7 +1865,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1815 * know that all the data is in scatter-gather pages, and that the 1865 * know that all the data is in scatter-gather pages, and that the
1816 * packet has never been sent out before (and thus is not cloned). 1866 * packet has never been sent out before (and thus is not cloned).
1817 */ 1867 */
1818static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, 1868static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1869 struct sk_buff *skb, unsigned int len,
1819 unsigned int mss_now, gfp_t gfp) 1870 unsigned int mss_now, gfp_t gfp)
1820{ 1871{
1821 struct sk_buff *buff; 1872 struct sk_buff *buff;
@@ -1824,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1824 1875
1825 /* All of a TSO frame must be composed of paged data. */ 1876 /* All of a TSO frame must be composed of paged data. */
1826 if (skb->len != skb->data_len) 1877 if (skb->len != skb->data_len)
1827 return tcp_fragment(sk, skb, len, mss_now, gfp); 1878 return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
1828 1879
1829 buff = sk_stream_alloc_skb(sk, 0, gfp, true); 1880 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
1830 if (unlikely(!buff)) 1881 if (unlikely(!buff))
@@ -1860,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1860 1911
1861 /* Link BUFF into the send queue. */ 1912 /* Link BUFF into the send queue. */
1862 __skb_header_release(buff); 1913 __skb_header_release(buff);
1863 tcp_insert_write_queue_after(skb, buff, sk); 1914 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1864 1915
1865 return 0; 1916 return 0;
1866} 1917}
@@ -1910,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1910 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1961 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1911 goto send_now; 1962 goto send_now;
1912 1963
1913 win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor); 1964 win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
1914 if (win_divisor) { 1965 if (win_divisor) {
1915 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1966 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1916 1967
@@ -1930,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1930 goto send_now; 1981 goto send_now;
1931 } 1982 }
1932 1983
1933 head = tcp_write_queue_head(sk); 1984 /* TODO : use tsorted_sent_queue ? */
1934 1985 head = tcp_rtx_queue_head(sk);
1986 if (!head)
1987 goto send_now;
1935 age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); 1988 age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
1936 /* If next ACK is likely to come too late (half srtt), do not defer */ 1989 /* If next ACK is likely to come too late (half srtt), do not defer */
1937 if (age < (tp->srtt_us >> 4)) 1990 if (age < (tp->srtt_us >> 4))
@@ -2145,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2145{ 2198{
2146 unsigned int limit; 2199 unsigned int limit;
2147 2200
2148 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); 2201 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
2149 limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); 2202 limit = min_t(u32, limit,
2203 sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2150 limit <<= factor; 2204 limit <<= factor;
2151 2205
2152 if (refcount_read(&sk->sk_wmem_alloc) > limit) { 2206 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2153 /* Always send the 1st or 2nd skb in write queue. 2207 /* Always send skb if rtx queue is empty.
2154 * No need to wait for TX completion to call us back, 2208 * No need to wait for TX completion to call us back,
2155 * after softirq/tasklet schedule. 2209 * after softirq/tasklet schedule.
2156 * This helps when TX completions are delayed too much. 2210 * This helps when TX completions are delayed too much.
2157 */ 2211 */
2158 if (skb == sk->sk_write_queue.next || 2212 if (tcp_rtx_queue_empty(sk))
2159 skb->prev == sk->sk_write_queue.next)
2160 return false; 2213 return false;
2161 2214
2162 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); 2215 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2207,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2207 * it's the "most interesting" or current chrono we are 2260 * it's the "most interesting" or current chrono we are
2208 * tracking and starts busy chrono if we have pending data. 2261 * tracking and starts busy chrono if we have pending data.
2209 */ 2262 */
2210 if (tcp_write_queue_empty(sk)) 2263 if (tcp_rtx_and_write_queues_empty(sk))
2211 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); 2264 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2212 else if (type == tp->chrono_type) 2265 else if (type == tp->chrono_type)
2213 tcp_chrono_set(tp, TCP_CHRONO_BUSY); 2266 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2263,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2263 2316
2264 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { 2317 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2265 /* "skb_mstamp" is used as a start point for the retransmit timer */ 2318 /* "skb_mstamp" is used as a start point for the retransmit timer */
2266 skb->skb_mstamp = tp->tcp_mstamp; 2319 tcp_update_skb_after_send(tp, skb);
2267 goto repair; /* Skip network transmission */ 2320 goto repair; /* Skip network transmission */
2268 } 2321 }
2269 2322
@@ -2302,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2302 nonagle); 2355 nonagle);
2303 2356
2304 if (skb->len > limit && 2357 if (skb->len > limit &&
2305 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2358 unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2359 skb, limit, mss_now, gfp)))
2306 break; 2360 break;
2307 2361
2308 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) 2362 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2342,7 +2396,7 @@ repair:
2342 tcp_cwnd_validate(sk, is_cwnd_limited); 2396 tcp_cwnd_validate(sk, is_cwnd_limited);
2343 return false; 2397 return false;
2344 } 2398 }
2345 return !tp->packets_out && tcp_send_head(sk); 2399 return !tp->packets_out && !tcp_write_queue_empty(sk);
2346} 2400}
2347 2401
2348bool tcp_schedule_loss_probe(struct sock *sk) 2402bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2350,6 +2404,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2350 struct inet_connection_sock *icsk = inet_csk(sk); 2404 struct inet_connection_sock *icsk = inet_csk(sk);
2351 struct tcp_sock *tp = tcp_sk(sk); 2405 struct tcp_sock *tp = tcp_sk(sk);
2352 u32 timeout, rto_delta_us; 2406 u32 timeout, rto_delta_us;
2407 int early_retrans;
2353 2408
2354 /* Don't do any loss probe on a Fast Open connection before 3WHS 2409 /* Don't do any loss probe on a Fast Open connection before 3WHS
2355 * finishes. 2410 * finishes.
@@ -2357,16 +2412,17 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2357 if (tp->fastopen_rsk) 2412 if (tp->fastopen_rsk)
2358 return false; 2413 return false;
2359 2414
2415 early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2360 /* Schedule a loss probe in 2*RTT for SACK capable connections 2416 /* Schedule a loss probe in 2*RTT for SACK capable connections
2361 * in Open state, that are either limited by cwnd or application. 2417 * in Open state, that are either limited by cwnd or application.
2362 */ 2418 */
2363 if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || 2419 if ((early_retrans != 3 && early_retrans != 4) ||
2364 !tp->packets_out || !tcp_is_sack(tp) || 2420 !tp->packets_out || !tcp_is_sack(tp) ||
2365 icsk->icsk_ca_state != TCP_CA_Open) 2421 icsk->icsk_ca_state != TCP_CA_Open)
2366 return false; 2422 return false;
2367 2423
2368 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && 2424 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
2369 tcp_send_head(sk)) 2425 !tcp_write_queue_empty(sk))
2370 return false; 2426 return false;
2371 2427
2372 /* Probe timeout is 2*rtt. Add minimum RTO to account 2428 /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2419,18 +2475,14 @@ void tcp_send_loss_probe(struct sock *sk)
2419 int mss = tcp_current_mss(sk); 2475 int mss = tcp_current_mss(sk);
2420 2476
2421 skb = tcp_send_head(sk); 2477 skb = tcp_send_head(sk);
2422 if (skb) { 2478 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2423 if (tcp_snd_wnd_test(tp, skb, mss)) { 2479 pcount = tp->packets_out;
2424 pcount = tp->packets_out; 2480 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2425 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); 2481 if (tp->packets_out > pcount)
2426 if (tp->packets_out > pcount) 2482 goto probe_sent;
2427 goto probe_sent; 2483 goto rearm_timer;
2428 goto rearm_timer;
2429 }
2430 skb = tcp_write_queue_prev(sk, skb);
2431 } else {
2432 skb = tcp_write_queue_tail(sk);
2433 } 2484 }
2485 skb = skb_rb_last(&sk->tcp_rtx_queue);
2434 2486
2435 /* At most one outstanding TLP retransmission. */ 2487 /* At most one outstanding TLP retransmission. */
2436 if (tp->tlp_high_seq) 2488 if (tp->tlp_high_seq)
@@ -2448,10 +2500,11 @@ void tcp_send_loss_probe(struct sock *sk)
2448 goto rearm_timer; 2500 goto rearm_timer;
2449 2501
2450 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { 2502 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2451 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, 2503 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2504 (pcount - 1) * mss, mss,
2452 GFP_ATOMIC))) 2505 GFP_ATOMIC)))
2453 goto rearm_timer; 2506 goto rearm_timer;
2454 skb = tcp_write_queue_next(sk, skb); 2507 skb = skb_rb_next(skb);
2455 } 2508 }
2456 2509
2457 if (WARN_ON(!skb || !tcp_skb_pcount(skb))) 2510 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2651,7 +2704,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2651static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) 2704static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2652{ 2705{
2653 struct tcp_sock *tp = tcp_sk(sk); 2706 struct tcp_sock *tp = tcp_sk(sk);
2654 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 2707 struct sk_buff *next_skb = skb_rb_next(skb);
2655 int skb_size, next_skb_size; 2708 int skb_size, next_skb_size;
2656 2709
2657 skb_size = skb->len; 2710 skb_size = skb->len;
@@ -2668,8 +2721,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2668 } 2721 }
2669 tcp_highest_sack_replace(sk, next_skb, skb); 2722 tcp_highest_sack_replace(sk, next_skb, skb);
2670 2723
2671 tcp_unlink_write_queue(next_skb, sk);
2672
2673 if (next_skb->ip_summed == CHECKSUM_PARTIAL) 2724 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2674 skb->ip_summed = CHECKSUM_PARTIAL; 2725 skb->ip_summed = CHECKSUM_PARTIAL;
2675 2726
@@ -2697,7 +2748,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2697 2748
2698 tcp_skb_collapse_tstamp(skb, next_skb); 2749 tcp_skb_collapse_tstamp(skb, next_skb);
2699 2750
2700 sk_wmem_free_skb(sk, next_skb); 2751 tcp_rtx_queue_unlink_and_free(next_skb, sk);
2701 return true; 2752 return true;
2702} 2753}
2703 2754
@@ -2708,8 +2759,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2708 return false; 2759 return false;
2709 if (skb_cloned(skb)) 2760 if (skb_cloned(skb))
2710 return false; 2761 return false;
2711 if (skb == tcp_send_head(sk))
2712 return false;
2713 /* Some heuristics for collapsing over SACK'd could be invented */ 2762 /* Some heuristics for collapsing over SACK'd could be invented */
2714 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2763 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2715 return false; 2764 return false;
@@ -2727,12 +2776,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2727 struct sk_buff *skb = to, *tmp; 2776 struct sk_buff *skb = to, *tmp;
2728 bool first = true; 2777 bool first = true;
2729 2778
2730 if (!sysctl_tcp_retrans_collapse) 2779 if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
2731 return; 2780 return;
2732 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2781 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2733 return; 2782 return;
2734 2783
2735 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2784 skb_rbtree_walk_from_safe(skb, tmp) {
2736 if (!tcp_can_collapse(sk, skb)) 2785 if (!tcp_can_collapse(sk, skb))
2737 break; 2786 break;
2738 2787
@@ -2807,7 +2856,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2807 2856
2808 len = cur_mss * segs; 2857 len = cur_mss * segs;
2809 if (skb->len > len) { 2858 if (skb->len > len) {
2810 if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) 2859 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
2860 cur_mss, GFP_ATOMIC))
2811 return -ENOMEM; /* We'll try again later. */ 2861 return -ENOMEM; /* We'll try again later. */
2812 } else { 2862 } else {
2813 if (skb_unclone(skb, GFP_ATOMIC)) 2863 if (skb_unclone(skb, GFP_ATOMIC))
@@ -2841,11 +2891,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2841 skb_headroom(skb) >= 0xFFFF)) { 2891 skb_headroom(skb) >= 0xFFFF)) {
2842 struct sk_buff *nskb; 2892 struct sk_buff *nskb;
2843 2893
2844 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); 2894 tcp_skb_tsorted_save(skb) {
2845 err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : 2895 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2846 -ENOBUFS; 2896 err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2897 -ENOBUFS;
2898 } tcp_skb_tsorted_restore(skb);
2899
2847 if (!err) { 2900 if (!err) {
2848 skb->skb_mstamp = tp->tcp_mstamp; 2901 tcp_update_skb_after_send(tp, skb);
2849 tcp_rate_skb_sent(sk, skb); 2902 tcp_rate_skb_sent(sk, skb);
2850 } 2903 }
2851 } else { 2904 } else {
@@ -2854,6 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2854 2907
2855 if (likely(!err)) { 2908 if (likely(!err)) {
2856 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; 2909 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
2910 trace_tcp_retransmit_skb(sk, skb);
2857 } else if (err != -EBUSY) { 2911 } else if (err != -EBUSY) {
2858 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); 2912 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2859 } 2913 }
@@ -2890,36 +2944,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2890 * retransmitted data is acknowledged. It tries to continue 2944 * retransmitted data is acknowledged. It tries to continue
2891 * resending the rest of the retransmit queue, until either 2945 * resending the rest of the retransmit queue, until either
2892 * we've sent it all or the congestion window limit is reached. 2946 * we've sent it all or the congestion window limit is reached.
2893 * If doing SACK, the first ACK which comes back for a timeout
2894 * based retransmit packet might feed us FACK information again.
2895 * If so, we use it to avoid unnecessarily retransmissions.
2896 */ 2947 */
2897void tcp_xmit_retransmit_queue(struct sock *sk) 2948void tcp_xmit_retransmit_queue(struct sock *sk)
2898{ 2949{
2899 const struct inet_connection_sock *icsk = inet_csk(sk); 2950 const struct inet_connection_sock *icsk = inet_csk(sk);
2951 struct sk_buff *skb, *rtx_head, *hole = NULL;
2900 struct tcp_sock *tp = tcp_sk(sk); 2952 struct tcp_sock *tp = tcp_sk(sk);
2901 struct sk_buff *skb;
2902 struct sk_buff *hole = NULL;
2903 u32 max_segs; 2953 u32 max_segs;
2904 int mib_idx; 2954 int mib_idx;
2905 2955
2906 if (!tp->packets_out) 2956 if (!tp->packets_out)
2907 return; 2957 return;
2908 2958
2909 if (tp->retransmit_skb_hint) { 2959 rtx_head = tcp_rtx_queue_head(sk);
2910 skb = tp->retransmit_skb_hint; 2960 skb = tp->retransmit_skb_hint ?: rtx_head;
2911 } else {
2912 skb = tcp_write_queue_head(sk);
2913 }
2914
2915 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); 2961 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
2916 tcp_for_write_queue_from(skb, sk) { 2962 skb_rbtree_walk_from(skb) {
2917 __u8 sacked; 2963 __u8 sacked;
2918 int segs; 2964 int segs;
2919 2965
2920 if (skb == tcp_send_head(sk))
2921 break;
2922
2923 if (tcp_pacing_check(sk)) 2966 if (tcp_pacing_check(sk))
2924 break; 2967 break;
2925 2968
@@ -2964,7 +3007,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2964 if (tcp_in_cwnd_reduction(sk)) 3007 if (tcp_in_cwnd_reduction(sk))
2965 tp->prr_out += tcp_skb_pcount(skb); 3008 tp->prr_out += tcp_skb_pcount(skb);
2966 3009
2967 if (skb == tcp_write_queue_head(sk) && 3010 if (skb == rtx_head &&
2968 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) 3011 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
2969 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 3012 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2970 inet_csk(sk)->icsk_rto, 3013 inet_csk(sk)->icsk_rto,
@@ -3006,12 +3049,15 @@ void tcp_send_fin(struct sock *sk)
3006 * Note: in the latter case, FIN packet will be sent after a timeout, 3049 * Note: in the latter case, FIN packet will be sent after a timeout,
3007 * as TCP stack thinks it has already been transmitted. 3050 * as TCP stack thinks it has already been transmitted.
3008 */ 3051 */
3009 if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { 3052 if (!tskb && tcp_under_memory_pressure(sk))
3053 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3054
3055 if (tskb) {
3010coalesce: 3056coalesce:
3011 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; 3057 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3012 TCP_SKB_CB(tskb)->end_seq++; 3058 TCP_SKB_CB(tskb)->end_seq++;
3013 tp->write_seq++; 3059 tp->write_seq++;
3014 if (!tcp_send_head(sk)) { 3060 if (tcp_write_queue_empty(sk)) {
3015 /* This means tskb was already sent. 3061 /* This means tskb was already sent.
3016 * Pretend we included the FIN on previous transmit. 3062 * Pretend we included the FIN on previous transmit.
3017 * We need to set tp->snd_nxt to the value it would have 3063 * We need to set tp->snd_nxt to the value it would have
@@ -3028,6 +3074,7 @@ coalesce:
3028 goto coalesce; 3074 goto coalesce;
3029 return; 3075 return;
3030 } 3076 }
3077 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3031 skb_reserve(skb, MAX_TCP_HEADER); 3078 skb_reserve(skb, MAX_TCP_HEADER);
3032 sk_forced_mem_schedule(sk, skb->truesize); 3079 sk_forced_mem_schedule(sk, skb->truesize);
3033 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 3080 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3064,6 +3111,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3064 /* Send it off. */ 3111 /* Send it off. */
3065 if (tcp_transmit_skb(sk, skb, 0, priority)) 3112 if (tcp_transmit_skb(sk, skb, 0, priority))
3066 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); 3113 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3114
3115 /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
3116 * skb here is different to the troublesome skb, so use NULL
3117 */
3118 trace_tcp_send_reset(sk, NULL);
3067} 3119}
3068 3120
3069/* Send a crossed SYN-ACK during socket establishment. 3121/* Send a crossed SYN-ACK during socket establishment.
@@ -3076,20 +3128,24 @@ int tcp_send_synack(struct sock *sk)
3076{ 3128{
3077 struct sk_buff *skb; 3129 struct sk_buff *skb;
3078 3130
3079 skb = tcp_write_queue_head(sk); 3131 skb = tcp_rtx_queue_head(sk);
3080 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 3132 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3081 pr_debug("%s: wrong queue state\n", __func__); 3133 pr_err("%s: wrong queue state\n", __func__);
3082 return -EFAULT; 3134 return -EFAULT;
3083 } 3135 }
3084 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 3136 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3085 if (skb_cloned(skb)) { 3137 if (skb_cloned(skb)) {
3086 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 3138 struct sk_buff *nskb;
3139
3140 tcp_skb_tsorted_save(skb) {
3141 nskb = skb_copy(skb, GFP_ATOMIC);
3142 } tcp_skb_tsorted_restore(skb);
3087 if (!nskb) 3143 if (!nskb)
3088 return -ENOMEM; 3144 return -ENOMEM;
3089 tcp_unlink_write_queue(skb, sk); 3145 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3146 tcp_rtx_queue_unlink_and_free(skb, sk);
3090 __skb_header_release(nskb); 3147 __skb_header_release(nskb);
3091 __tcp_add_write_queue_head(sk, nskb); 3148 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3092 sk_wmem_free_skb(sk, skb);
3093 sk->sk_wmem_queued += nskb->truesize; 3149 sk->sk_wmem_queued += nskb->truesize;
3094 sk_mem_charge(sk, nskb->truesize); 3150 sk_mem_charge(sk, nskb->truesize);
3095 skb = nskb; 3151 skb = nskb;
@@ -3166,8 +3222,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3166 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); 3222 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3167#endif 3223#endif
3168 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); 3224 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3169 tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + 3225 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3170 sizeof(*th); 3226 foc) + sizeof(*th);
3171 3227
3172 skb_push(skb, tcp_header_size); 3228 skb_push(skb, tcp_header_size);
3173 skb_reset_transport_header(skb); 3229 skb_reset_transport_header(skb);
@@ -3268,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk)
3268 if (rcv_wnd == 0) 3324 if (rcv_wnd == 0)
3269 rcv_wnd = dst_metric(dst, RTAX_INITRWND); 3325 rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3270 3326
3271 tcp_select_initial_window(tcp_full_space(sk), 3327 tcp_select_initial_window(sk, tcp_full_space(sk),
3272 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 3328 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3273 &tp->rcv_wnd, 3329 &tp->rcv_wnd,
3274 &tp->window_clamp, 3330 &tp->window_clamp,
@@ -3307,7 +3363,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3307 3363
3308 tcb->end_seq += skb->len; 3364 tcb->end_seq += skb->len;
3309 __skb_header_release(skb); 3365 __skb_header_release(skb);
3310 __tcp_add_write_queue_tail(sk, skb);
3311 sk->sk_wmem_queued += skb->truesize; 3366 sk->sk_wmem_queued += skb->truesize;
3312 sk_mem_charge(sk, skb->truesize); 3367 sk_mem_charge(sk, skb->truesize);
3313 tp->write_seq = tcb->end_seq; 3368 tp->write_seq = tcb->end_seq;
@@ -3355,6 +3410,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3355 int copied = copy_from_iter(skb_put(syn_data, space), space, 3410 int copied = copy_from_iter(skb_put(syn_data, space), space,
3356 &fo->data->msg_iter); 3411 &fo->data->msg_iter);
3357 if (unlikely(!copied)) { 3412 if (unlikely(!copied)) {
3413 tcp_skb_tsorted_anchor_cleanup(syn_data);
3358 kfree_skb(syn_data); 3414 kfree_skb(syn_data);
3359 goto fallback; 3415 goto fallback;
3360 } 3416 }
@@ -3385,12 +3441,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3385 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; 3441 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3386 if (!err) { 3442 if (!err) {
3387 tp->syn_data = (fo->copied > 0); 3443 tp->syn_data = (fo->copied > 0);
3444 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3388 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); 3445 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3389 goto done; 3446 goto done;
3390 } 3447 }
3391 3448
3392 /* data was not sent, this is our new send_head */ 3449 /* data was not sent, put it in write_queue */
3393 sk->sk_send_head = syn_data; 3450 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3394 tp->packets_out -= tcp_skb_pcount(syn_data); 3451 tp->packets_out -= tcp_skb_pcount(syn_data);
3395 3452
3396fallback: 3453fallback:
@@ -3433,6 +3490,7 @@ int tcp_connect(struct sock *sk)
3433 tp->retrans_stamp = tcp_time_stamp(tp); 3490 tp->retrans_stamp = tcp_time_stamp(tp);
3434 tcp_connect_queue_skb(sk, buff); 3491 tcp_connect_queue_skb(sk, buff);
3435 tcp_ecn_send_syn(sk, buff); 3492 tcp_ecn_send_syn(sk, buff);
3493 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3436 3494
3437 /* Send off SYN; include data in Fast Open. */ 3495 /* Send off SYN; include data in Fast Open. */
3438 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 3496 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3627,7 +3685,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
3627 skb->len > mss) { 3685 skb->len > mss) {
3628 seg_size = min(seg_size, mss); 3686 seg_size = min(seg_size, mss);
3629 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3687 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3630 if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) 3688 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
3689 skb, seg_size, mss, GFP_ATOMIC))
3631 return -1; 3690 return -1;
3632 } else if (!tcp_skb_pcount(skb)) 3691 } else if (!tcp_skb_pcount(skb))
3633 tcp_set_skb_tso_segs(skb, mss); 3692 tcp_set_skb_tso_segs(skb, mss);
@@ -3657,7 +3716,7 @@ void tcp_send_probe0(struct sock *sk)
3657 3716
3658 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); 3717 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
3659 3718
3660 if (tp->packets_out || !tcp_send_head(sk)) { 3719 if (tp->packets_out || tcp_write_queue_empty(sk)) {
3661 /* Cancel probe timer, if it is not required. */ 3720 /* Cancel probe timer, if it is not required. */
3662 icsk->icsk_probes_out = 0; 3721 icsk->icsk_probes_out = 0;
3663 icsk->icsk_backoff = 0; 3722 icsk->icsk_backoff = 0;
@@ -3698,6 +3757,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
3698 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); 3757 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3699 if (unlikely(tcp_passive_fastopen(sk))) 3758 if (unlikely(tcp_passive_fastopen(sk)))
3700 tcp_sk(sk)->total_retrans++; 3759 tcp_sk(sk)->total_retrans++;
3760 trace_tcp_retransmit_synack(sk, req);
3701 } 3761 }
3702 return res; 3762 return res;
3703} 3763}
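Two smaller changes in the hunks above, in tcp_tso_autosize() and tcp_small_queue_check(), replace the hard-coded pacing shift of 10 with sk->sk_pacing_shift. The arithmetic being parameterized is just a right shift of the pacing rate (bytes per second) to get a per-burst byte budget; a shift of 10 is roughly one millisecond's worth of data, matching the "send at least one packet per ms" goal stated in the surrounding comment. Below is a standalone sketch of that calculation; the function name, the example rate and the GSO clamp value are illustrative, not the kernel's exact limits.

#include <stdint.h>
#include <stdio.h>

/* Per-burst byte budget: pacing rate shifted down, clamped to the GSO limit. */
static uint64_t tso_autosize_bytes(uint64_t pacing_rate_Bps,
				   unsigned int pacing_shift,
				   uint64_t gso_max_bytes)
{
	uint64_t bytes = pacing_rate_Bps >> pacing_shift;

	if (bytes > gso_max_bytes)	/* never build a burst bigger than GSO allows */
		bytes = gso_max_bytes;
	return bytes;
}

int main(void)
{
	uint64_t rate = 12500000ULL;	/* ~100 Mb/s expressed in bytes per second */

	/* shift 10: ~1 ms of data; a larger shift means smaller, smoother bursts */
	printf("shift 10 -> %llu bytes per burst\n",
	       (unsigned long long)tso_autosize_bytes(rate, 10, 65536));
	printf("shift 12 -> %llu bytes per burst\n",
	       (unsigned long long)tso_autosize_bytes(rate, 12, 65536));
	return 0;
}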