Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 324
1 file changed, 192 insertions(+), 132 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a42e873d44a..540b7d92cc70 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,40 +41,25 @@ | |||
41 | #include <linux/compiler.h> | 41 | #include <linux/compiler.h> |
42 | #include <linux/gfp.h> | 42 | #include <linux/gfp.h> |
43 | #include <linux/module.h> | 43 | #include <linux/module.h> |
44 | #include <linux/static_key.h> | ||
44 | 45 | ||
45 | /* People can turn this off for buggy TCP's found in printers etc. */ | 46 | #include <trace/events/tcp.h> |
46 | int sysctl_tcp_retrans_collapse __read_mostly = 1; | ||
47 | |||
48 | /* People can turn this on to work with those rare, broken TCPs that | ||
49 | * interpret the window field as a signed quantity. | ||
50 | */ | ||
51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | ||
52 | |||
53 | /* Default TSQ limit of four TSO segments */ | ||
54 | int sysctl_tcp_limit_output_bytes __read_mostly = 262144; | ||
55 | |||
56 | /* This limits the percentage of the congestion window which we | ||
57 | * will allow a single TSO frame to consume. Building TSO frames | ||
58 | * which are too large can cause TCP streams to be bursty. | ||
59 | */ | ||
60 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; | ||
61 | |||
62 | /* By default, RFC2861 behavior. */ | ||
63 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | ||
64 | 47 | ||
65 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 48 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
66 | int push_one, gfp_t gfp); | 49 | int push_one, gfp_t gfp); |
67 | 50 | ||
68 | /* Account for new data that has been sent to the network. */ | 51 | /* Account for new data that has been sent to the network. */ |
69 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | 52 | static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) |
70 | { | 53 | { |
71 | struct inet_connection_sock *icsk = inet_csk(sk); | 54 | struct inet_connection_sock *icsk = inet_csk(sk); |
72 | struct tcp_sock *tp = tcp_sk(sk); | 55 | struct tcp_sock *tp = tcp_sk(sk); |
73 | unsigned int prior_packets = tp->packets_out; | 56 | unsigned int prior_packets = tp->packets_out; |
74 | 57 | ||
75 | tcp_advance_send_head(sk, skb); | ||
76 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; | 58 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
77 | 59 | ||
60 | __skb_unlink(skb, &sk->sk_write_queue); | ||
61 | tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); | ||
62 | |||
78 | tp->packets_out += tcp_skb_pcount(skb); | 63 | tp->packets_out += tcp_skb_pcount(skb); |
79 | if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) | 64 | if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) |
80 | tcp_rearm_rto(sk); | 65 | tcp_rearm_rto(sk); |
@@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss) | |||
203 | * be a multiple of mss if possible. We assume here that mss >= 1. | 188 | * be a multiple of mss if possible. We assume here that mss >= 1. |
204 | * This MUST be enforced by all callers. | 189 | * This MUST be enforced by all callers. |
205 | */ | 190 | */ |
206 | void tcp_select_initial_window(int __space, __u32 mss, | 191 | void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, |
207 | __u32 *rcv_wnd, __u32 *window_clamp, | 192 | __u32 *rcv_wnd, __u32 *window_clamp, |
208 | int wscale_ok, __u8 *rcv_wscale, | 193 | int wscale_ok, __u8 *rcv_wscale, |
209 | __u32 init_rcv_wnd) | 194 | __u32 init_rcv_wnd) |
@@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
227 | * which we interpret as a sign the remote TCP is not | 212 | * which we interpret as a sign the remote TCP is not |
228 | * misinterpreting the window field as a signed quantity. | 213 | * misinterpreting the window field as a signed quantity. |
229 | */ | 214 | */ |
230 | if (sysctl_tcp_workaround_signed_windows) | 215 | if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) |
231 | (*rcv_wnd) = min(space, MAX_TCP_WINDOW); | 216 | (*rcv_wnd) = min(space, MAX_TCP_WINDOW); |
232 | else | 217 | else |
233 | (*rcv_wnd) = space; | 218 | (*rcv_wnd) = space; |
@@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
235 | (*rcv_wscale) = 0; | 220 | (*rcv_wscale) = 0; |
236 | if (wscale_ok) { | 221 | if (wscale_ok) { |
237 | /* Set window scaling on max possible window */ | 222 | /* Set window scaling on max possible window */ |
238 | space = max_t(u32, space, sysctl_tcp_rmem[2]); | 223 | space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
239 | space = max_t(u32, space, sysctl_rmem_max); | 224 | space = max_t(u32, space, sysctl_rmem_max); |
240 | space = min_t(u32, space, *window_clamp); | 225 | space = min_t(u32, space, *window_clamp); |
241 | while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { | 226 | while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { |
@@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk) | |||
287 | /* Make sure we do not exceed the maximum possible | 272 | /* Make sure we do not exceed the maximum possible |
288 | * scaled window. | 273 | * scaled window. |
289 | */ | 274 | */ |
290 | if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) | 275 | if (!tp->rx_opt.rcv_wscale && |
276 | sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) | ||
291 | new_win = min(new_win, MAX_TCP_WINDOW); | 277 | new_win = min(new_win, MAX_TCP_WINDOW); |
292 | else | 278 | else |
293 | new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); | 279 | new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); |
@@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, | |||
395 | static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | 381 | static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) |
396 | { | 382 | { |
397 | skb->ip_summed = CHECKSUM_PARTIAL; | 383 | skb->ip_summed = CHECKSUM_PARTIAL; |
398 | skb->csum = 0; | ||
399 | 384 | ||
400 | TCP_SKB_CB(skb)->tcp_flags = flags; | 385 | TCP_SKB_CB(skb)->tcp_flags = flags; |
401 | TCP_SKB_CB(skb)->sacked = 0; | 386 | TCP_SKB_CB(skb)->sacked = 0; |
@@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |||
418 | #define OPTION_MD5 (1 << 2) | 403 | #define OPTION_MD5 (1 << 2) |
419 | #define OPTION_WSCALE (1 << 3) | 404 | #define OPTION_WSCALE (1 << 3) |
420 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) | 405 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) |
406 | #define OPTION_SMC (1 << 9) | ||
407 | |||
408 | static void smc_options_write(__be32 *ptr, u16 *options) | ||
409 | { | ||
410 | #if IS_ENABLED(CONFIG_SMC) | ||
411 | if (static_branch_unlikely(&tcp_have_smc)) { | ||
412 | if (unlikely(OPTION_SMC & *options)) { | ||
413 | *ptr++ = htonl((TCPOPT_NOP << 24) | | ||
414 | (TCPOPT_NOP << 16) | | ||
415 | (TCPOPT_EXP << 8) | | ||
416 | (TCPOLEN_EXP_SMC_BASE)); | ||
417 | *ptr++ = htonl(TCPOPT_SMC_MAGIC); | ||
418 | } | ||
419 | } | ||
420 | #endif | ||
421 | } | ||
421 | 422 | ||
422 | struct tcp_out_options { | 423 | struct tcp_out_options { |
423 | u16 options; /* bit field of OPTION_* */ | 424 | u16 options; /* bit field of OPTION_* */ |
@@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
536 | } | 537 | } |
537 | ptr += (len + 3) >> 2; | 538 | ptr += (len + 3) >> 2; |
538 | } | 539 | } |
540 | |||
541 | smc_options_write(ptr, &options); | ||
542 | } | ||
543 | |||
544 | static void smc_set_option(const struct tcp_sock *tp, | ||
545 | struct tcp_out_options *opts, | ||
546 | unsigned int *remaining) | ||
547 | { | ||
548 | #if IS_ENABLED(CONFIG_SMC) | ||
549 | if (static_branch_unlikely(&tcp_have_smc)) { | ||
550 | if (tp->syn_smc) { | ||
551 | if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { | ||
552 | opts->options |= OPTION_SMC; | ||
553 | *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; | ||
554 | } | ||
555 | } | ||
556 | } | ||
557 | #endif | ||
558 | } | ||
559 | |||
560 | static void smc_set_option_cond(const struct tcp_sock *tp, | ||
561 | const struct inet_request_sock *ireq, | ||
562 | struct tcp_out_options *opts, | ||
563 | unsigned int *remaining) | ||
564 | { | ||
565 | #if IS_ENABLED(CONFIG_SMC) | ||
566 | if (static_branch_unlikely(&tcp_have_smc)) { | ||
567 | if (tp->syn_smc && ireq->smc_ok) { | ||
568 | if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { | ||
569 | opts->options |= OPTION_SMC; | ||
570 | *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; | ||
571 | } | ||
572 | } | ||
573 | } | ||
574 | #endif | ||
539 | } | 575 | } |
540 | 576 | ||
541 | /* Compute TCP options for SYN packets. This is not the final | 577 | /* Compute TCP options for SYN packets. This is not the final |
@@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
603 | } | 639 | } |
604 | } | 640 | } |
605 | 641 | ||
642 | smc_set_option(tp, opts, &remaining); | ||
643 | |||
606 | return MAX_TCP_OPTION_SPACE - remaining; | 644 | return MAX_TCP_OPTION_SPACE - remaining; |
607 | } | 645 | } |
608 | 646 | ||
609 | /* Set up TCP options for SYN-ACKs. */ | 647 | /* Set up TCP options for SYN-ACKs. */ |
610 | static unsigned int tcp_synack_options(struct request_sock *req, | 648 | static unsigned int tcp_synack_options(const struct sock *sk, |
649 | struct request_sock *req, | ||
611 | unsigned int mss, struct sk_buff *skb, | 650 | unsigned int mss, struct sk_buff *skb, |
612 | struct tcp_out_options *opts, | 651 | struct tcp_out_options *opts, |
613 | const struct tcp_md5sig_key *md5, | 652 | const struct tcp_md5sig_key *md5, |
@@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, | |||
663 | } | 702 | } |
664 | } | 703 | } |
665 | 704 | ||
705 | smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); | ||
706 | |||
666 | return MAX_TCP_OPTION_SPACE - remaining; | 707 | return MAX_TCP_OPTION_SPACE - remaining; |
667 | } | 708 | } |
668 | 709 | ||
@@ -973,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) | |||
973 | HRTIMER_MODE_ABS_PINNED); | 1014 | HRTIMER_MODE_ABS_PINNED); |
974 | } | 1015 | } |
975 | 1016 | ||
1017 | static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) | ||
1018 | { | ||
1019 | skb->skb_mstamp = tp->tcp_mstamp; | ||
1020 | list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); | ||
1021 | } | ||
1022 | |||
976 | /* This routine actually transmits TCP packets queued in by | 1023 | /* This routine actually transmits TCP packets queued in by |
977 | * tcp_do_sendmsg(). This is used by both the initial | 1024 | * tcp_do_sendmsg(). This is used by both the initial |
978 | * transmission and possible later retransmissions. | 1025 | * transmission and possible later retransmissions. |
@@ -1005,10 +1052,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
1005 | TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq | 1052 | TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq |
1006 | - tp->snd_una; | 1053 | - tp->snd_una; |
1007 | oskb = skb; | 1054 | oskb = skb; |
1008 | if (unlikely(skb_cloned(skb))) | 1055 | |
1009 | skb = pskb_copy(skb, gfp_mask); | 1056 | tcp_skb_tsorted_save(oskb) { |
1010 | else | 1057 | if (unlikely(skb_cloned(oskb))) |
1011 | skb = skb_clone(skb, gfp_mask); | 1058 | skb = pskb_copy(oskb, gfp_mask); |
1059 | else | ||
1060 | skb = skb_clone(oskb, gfp_mask); | ||
1061 | } tcp_skb_tsorted_restore(oskb); | ||
1062 | |||
1012 | if (unlikely(!skb)) | 1063 | if (unlikely(!skb)) |
1013 | return -ENOBUFS; | 1064 | return -ENOBUFS; |
1014 | } | 1065 | } |
@@ -1129,7 +1180,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
1129 | err = net_xmit_eval(err); | 1180 | err = net_xmit_eval(err); |
1130 | } | 1181 | } |
1131 | if (!err && oskb) { | 1182 | if (!err && oskb) { |
1132 | oskb->skb_mstamp = tp->tcp_mstamp; | 1183 | tcp_update_skb_after_send(tp, oskb); |
1133 | tcp_rate_skb_sent(sk, oskb); | 1184 | tcp_rate_skb_sent(sk, oskb); |
1134 | } | 1185 | } |
1135 | return err; | 1186 | return err; |
@@ -1167,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) | |||
1167 | } | 1218 | } |
1168 | } | 1219 | } |
1169 | 1220 | ||
1170 | /* When a modification to fackets out becomes necessary, we need to check | ||
1171 | * skb is counted to fackets_out or not. | ||
1172 | */ | ||
1173 | static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, | ||
1174 | int decr) | ||
1175 | { | ||
1176 | struct tcp_sock *tp = tcp_sk(sk); | ||
1177 | |||
1178 | if (!tp->sacked_out || tcp_is_reno(tp)) | ||
1179 | return; | ||
1180 | |||
1181 | if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) | ||
1182 | tp->fackets_out -= decr; | ||
1183 | } | ||
1184 | |||
1185 | /* Pcount in the middle of the write queue got changed, we need to do various | 1221 | /* Pcount in the middle of the write queue got changed, we need to do various |
1186 | * tweaks to fix counters | 1222 | * tweaks to fix counters |
1187 | */ | 1223 | */ |
@@ -1202,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de | |||
1202 | if (tcp_is_reno(tp) && decr > 0) | 1238 | if (tcp_is_reno(tp) && decr > 0) |
1203 | tp->sacked_out -= min_t(u32, tp->sacked_out, decr); | 1239 | tp->sacked_out -= min_t(u32, tp->sacked_out, decr); |
1204 | 1240 | ||
1205 | tcp_adjust_fackets_out(sk, skb, decr); | ||
1206 | |||
1207 | if (tp->lost_skb_hint && | 1241 | if (tp->lost_skb_hint && |
1208 | before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && | 1242 | before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && |
1209 | (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) | 1243 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
1210 | tp->lost_cnt_hint -= decr; | 1244 | tp->lost_cnt_hint -= decr; |
1211 | 1245 | ||
1212 | tcp_verify_left_out(tp); | 1246 | tcp_verify_left_out(tp); |
@@ -1241,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) | |||
1241 | TCP_SKB_CB(skb)->eor = 0; | 1275 | TCP_SKB_CB(skb)->eor = 0; |
1242 | } | 1276 | } |
1243 | 1277 | ||
1278 | /* Insert buff after skb on the write or rtx queue of sk. */ | ||
1279 | static void tcp_insert_write_queue_after(struct sk_buff *skb, | ||
1280 | struct sk_buff *buff, | ||
1281 | struct sock *sk, | ||
1282 | enum tcp_queue tcp_queue) | ||
1283 | { | ||
1284 | if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) | ||
1285 | __skb_queue_after(&sk->sk_write_queue, skb, buff); | ||
1286 | else | ||
1287 | tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); | ||
1288 | } | ||
1289 | |||
1244 | /* Function to create two new TCP segments. Shrinks the given segment | 1290 | /* Function to create two new TCP segments. Shrinks the given segment |
1245 | * to the specified size and appends a new segment with the rest of the | 1291 | * to the specified size and appends a new segment with the rest of the |
1246 | * packet to the list. This won't be called frequently, I hope. | 1292 | * packet to the list. This won't be called frequently, I hope. |
1247 | * Remember, these are still headerless SKBs at this point. | 1293 | * Remember, these are still headerless SKBs at this point. |
1248 | */ | 1294 | */ |
1249 | int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | 1295 | int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, |
1296 | struct sk_buff *skb, u32 len, | ||
1250 | unsigned int mss_now, gfp_t gfp) | 1297 | unsigned int mss_now, gfp_t gfp) |
1251 | { | 1298 | { |
1252 | struct tcp_sock *tp = tcp_sk(sk); | 1299 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -1329,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
1329 | 1376 | ||
1330 | /* Link BUFF into the send queue. */ | 1377 | /* Link BUFF into the send queue. */ |
1331 | __skb_header_release(buff); | 1378 | __skb_header_release(buff); |
1332 | tcp_insert_write_queue_after(skb, buff, sk); | 1379 | tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); |
1380 | if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) | ||
1381 | list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); | ||
1333 | 1382 | ||
1334 | return 0; | 1383 | return 0; |
1335 | } | 1384 | } |
@@ -1607,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) | |||
1607 | if (tp->packets_out > tp->snd_cwnd_used) | 1656 | if (tp->packets_out > tp->snd_cwnd_used) |
1608 | tp->snd_cwnd_used = tp->packets_out; | 1657 | tp->snd_cwnd_used = tp->packets_out; |
1609 | 1658 | ||
1610 | if (sysctl_tcp_slow_start_after_idle && | 1659 | if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && |
1611 | (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && | 1660 | (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && |
1612 | !ca_ops->cong_control) | 1661 | !ca_ops->cong_control) |
1613 | tcp_cwnd_application_limited(sk); | 1662 | tcp_cwnd_application_limited(sk); |
@@ -1616,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) | |||
1616 | * is caused by insufficient sender buffer: | 1665 | * is caused by insufficient sender buffer: |
1617 | * 1) just sent some data (see tcp_write_xmit) | 1666 | * 1) just sent some data (see tcp_write_xmit) |
1618 | * 2) not cwnd limited (this else condition) | 1667 | * 2) not cwnd limited (this else condition) |
1619 | * 3) no more data to send (null tcp_send_head ) | 1668 | * 3) no more data to send (tcp_write_queue_empty()) |
1620 | * 4) application is hitting buffer limit (SOCK_NOSPACE) | 1669 | * 4) application is hitting buffer limit (SOCK_NOSPACE) |
1621 | */ | 1670 | */ |
1622 | if (!tcp_send_head(sk) && sk->sk_socket && | 1671 | if (tcp_write_queue_empty(sk) && sk->sk_socket && |
1623 | test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && | 1672 | test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && |
1624 | (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 1673 | (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
1625 | tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); | 1674 | tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); |
@@ -1671,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, | |||
1671 | { | 1720 | { |
1672 | u32 bytes, segs; | 1721 | u32 bytes, segs; |
1673 | 1722 | ||
1674 | bytes = min(sk->sk_pacing_rate >> 10, | 1723 | bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, |
1675 | sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); | 1724 | sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); |
1676 | 1725 | ||
1677 | /* Goal is to send at least one packet per ms, | 1726 | /* Goal is to send at least one packet per ms, |
@@ -1694,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) | |||
1694 | u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; | 1743 | u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; |
1695 | 1744 | ||
1696 | return tso_segs ? : | 1745 | return tso_segs ? : |
1697 | tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); | 1746 | tcp_tso_autosize(sk, mss_now, |
1747 | sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); | ||
1698 | } | 1748 | } |
1699 | 1749 | ||
1700 | /* Returns the portion of skb which can be sent right away */ | 1750 | /* Returns the portion of skb which can be sent right away */ |
@@ -1815,7 +1865,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, | |||
1815 | * know that all the data is in scatter-gather pages, and that the | 1865 | * know that all the data is in scatter-gather pages, and that the |
1816 | * packet has never been sent out before (and thus is not cloned). | 1866 | * packet has never been sent out before (and thus is not cloned). |
1817 | */ | 1867 | */ |
1818 | static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | 1868 | static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, |
1869 | struct sk_buff *skb, unsigned int len, | ||
1819 | unsigned int mss_now, gfp_t gfp) | 1870 | unsigned int mss_now, gfp_t gfp) |
1820 | { | 1871 | { |
1821 | struct sk_buff *buff; | 1872 | struct sk_buff *buff; |
@@ -1824,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
1824 | 1875 | ||
1825 | /* All of a TSO frame must be composed of paged data. */ | 1876 | /* All of a TSO frame must be composed of paged data. */ |
1826 | if (skb->len != skb->data_len) | 1877 | if (skb->len != skb->data_len) |
1827 | return tcp_fragment(sk, skb, len, mss_now, gfp); | 1878 | return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); |
1828 | 1879 | ||
1829 | buff = sk_stream_alloc_skb(sk, 0, gfp, true); | 1880 | buff = sk_stream_alloc_skb(sk, 0, gfp, true); |
1830 | if (unlikely(!buff)) | 1881 | if (unlikely(!buff)) |
@@ -1860,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
1860 | 1911 | ||
1861 | /* Link BUFF into the send queue. */ | 1912 | /* Link BUFF into the send queue. */ |
1862 | __skb_header_release(buff); | 1913 | __skb_header_release(buff); |
1863 | tcp_insert_write_queue_after(skb, buff, sk); | 1914 | tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); |
1864 | 1915 | ||
1865 | return 0; | 1916 | return 0; |
1866 | } | 1917 | } |
@@ -1910,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, | |||
1910 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) | 1961 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) |
1911 | goto send_now; | 1962 | goto send_now; |
1912 | 1963 | ||
1913 | win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor); | 1964 | win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); |
1914 | if (win_divisor) { | 1965 | if (win_divisor) { |
1915 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); | 1966 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); |
1916 | 1967 | ||
@@ -1930,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, | |||
1930 | goto send_now; | 1981 | goto send_now; |
1931 | } | 1982 | } |
1932 | 1983 | ||
1933 | head = tcp_write_queue_head(sk); | 1984 | /* TODO : use tsorted_sent_queue ? */ |
1934 | 1985 | head = tcp_rtx_queue_head(sk); | |
1986 | if (!head) | ||
1987 | goto send_now; | ||
1935 | age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); | 1988 | age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); |
1936 | /* If next ACK is likely to come too late (half srtt), do not defer */ | 1989 | /* If next ACK is likely to come too late (half srtt), do not defer */ |
1937 | if (age < (tp->srtt_us >> 4)) | 1990 | if (age < (tp->srtt_us >> 4)) |
@@ -2145,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, | |||
2145 | { | 2198 | { |
2146 | unsigned int limit; | 2199 | unsigned int limit; |
2147 | 2200 | ||
2148 | limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); | 2201 | limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); |
2149 | limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); | 2202 | limit = min_t(u32, limit, |
2203 | sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); | ||
2150 | limit <<= factor; | 2204 | limit <<= factor; |
2151 | 2205 | ||
2152 | if (refcount_read(&sk->sk_wmem_alloc) > limit) { | 2206 | if (refcount_read(&sk->sk_wmem_alloc) > limit) { |
2153 | /* Always send the 1st or 2nd skb in write queue. | 2207 | /* Always send skb if rtx queue is empty. |
2154 | * No need to wait for TX completion to call us back, | 2208 | * No need to wait for TX completion to call us back, |
2155 | * after softirq/tasklet schedule. | 2209 | * after softirq/tasklet schedule. |
2156 | * This helps when TX completions are delayed too much. | 2210 | * This helps when TX completions are delayed too much. |
2157 | */ | 2211 | */ |
2158 | if (skb == sk->sk_write_queue.next || | 2212 | if (tcp_rtx_queue_empty(sk)) |
2159 | skb->prev == sk->sk_write_queue.next) | ||
2160 | return false; | 2213 | return false; |
2161 | 2214 | ||
2162 | set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); | 2215 | set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); |
@@ -2207,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) | |||
2207 | * it's the "most interesting" or current chrono we are | 2260 | * it's the "most interesting" or current chrono we are |
2208 | * tracking and starts busy chrono if we have pending data. | 2261 | * tracking and starts busy chrono if we have pending data. |
2209 | */ | 2262 | */ |
2210 | if (tcp_write_queue_empty(sk)) | 2263 | if (tcp_rtx_and_write_queues_empty(sk)) |
2211 | tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); | 2264 | tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); |
2212 | else if (type == tp->chrono_type) | 2265 | else if (type == tp->chrono_type) |
2213 | tcp_chrono_set(tp, TCP_CHRONO_BUSY); | 2266 | tcp_chrono_set(tp, TCP_CHRONO_BUSY); |
@@ -2263,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2263 | 2316 | ||
2264 | if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { | 2317 | if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { |
2265 | /* "skb_mstamp" is used as a start point for the retransmit timer */ | 2318 | /* "skb_mstamp" is used as a start point for the retransmit timer */ |
2266 | skb->skb_mstamp = tp->tcp_mstamp; | 2319 | tcp_update_skb_after_send(tp, skb); |
2267 | goto repair; /* Skip network transmission */ | 2320 | goto repair; /* Skip network transmission */ |
2268 | } | 2321 | } |
2269 | 2322 | ||
@@ -2302,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2302 | nonagle); | 2355 | nonagle); |
2303 | 2356 | ||
2304 | if (skb->len > limit && | 2357 | if (skb->len > limit && |
2305 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | 2358 | unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, |
2359 | skb, limit, mss_now, gfp))) | ||
2306 | break; | 2360 | break; |
2307 | 2361 | ||
2308 | if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) | 2362 | if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) |
@@ -2342,7 +2396,7 @@ repair: | |||
2342 | tcp_cwnd_validate(sk, is_cwnd_limited); | 2396 | tcp_cwnd_validate(sk, is_cwnd_limited); |
2343 | return false; | 2397 | return false; |
2344 | } | 2398 | } |
2345 | return !tp->packets_out && tcp_send_head(sk); | 2399 | return !tp->packets_out && !tcp_write_queue_empty(sk); |
2346 | } | 2400 | } |
2347 | 2401 | ||
2348 | bool tcp_schedule_loss_probe(struct sock *sk) | 2402 | bool tcp_schedule_loss_probe(struct sock *sk) |
@@ -2350,6 +2404,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
2350 | struct inet_connection_sock *icsk = inet_csk(sk); | 2404 | struct inet_connection_sock *icsk = inet_csk(sk); |
2351 | struct tcp_sock *tp = tcp_sk(sk); | 2405 | struct tcp_sock *tp = tcp_sk(sk); |
2352 | u32 timeout, rto_delta_us; | 2406 | u32 timeout, rto_delta_us; |
2407 | int early_retrans; | ||
2353 | 2408 | ||
2354 | /* Don't do any loss probe on a Fast Open connection before 3WHS | 2409 | /* Don't do any loss probe on a Fast Open connection before 3WHS |
2355 | * finishes. | 2410 | * finishes. |
@@ -2357,16 +2412,17 @@ bool tcp_schedule_loss_probe(struct sock *sk) | |||
2357 | if (tp->fastopen_rsk) | 2412 | if (tp->fastopen_rsk) |
2358 | return false; | 2413 | return false; |
2359 | 2414 | ||
2415 | early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; | ||
2360 | /* Schedule a loss probe in 2*RTT for SACK capable connections | 2416 | /* Schedule a loss probe in 2*RTT for SACK capable connections |
2361 | * in Open state, that are either limited by cwnd or application. | 2417 | * in Open state, that are either limited by cwnd or application. |
2362 | */ | 2418 | */ |
2363 | if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || | 2419 | if ((early_retrans != 3 && early_retrans != 4) || |
2364 | !tp->packets_out || !tcp_is_sack(tp) || | 2420 | !tp->packets_out || !tcp_is_sack(tp) || |
2365 | icsk->icsk_ca_state != TCP_CA_Open) | 2421 | icsk->icsk_ca_state != TCP_CA_Open) |
2366 | return false; | 2422 | return false; |
2367 | 2423 | ||
2368 | if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && | 2424 | if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && |
2369 | tcp_send_head(sk)) | 2425 | !tcp_write_queue_empty(sk)) |
2370 | return false; | 2426 | return false; |
2371 | 2427 | ||
2372 | /* Probe timeout is 2*rtt. Add minimum RTO to account | 2428 | /* Probe timeout is 2*rtt. Add minimum RTO to account |
@@ -2419,18 +2475,14 @@ void tcp_send_loss_probe(struct sock *sk) | |||
2419 | int mss = tcp_current_mss(sk); | 2475 | int mss = tcp_current_mss(sk); |
2420 | 2476 | ||
2421 | skb = tcp_send_head(sk); | 2477 | skb = tcp_send_head(sk); |
2422 | if (skb) { | 2478 | if (skb && tcp_snd_wnd_test(tp, skb, mss)) { |
2423 | if (tcp_snd_wnd_test(tp, skb, mss)) { | 2479 | pcount = tp->packets_out; |
2424 | pcount = tp->packets_out; | 2480 | tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); |
2425 | tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); | 2481 | if (tp->packets_out > pcount) |
2426 | if (tp->packets_out > pcount) | 2482 | goto probe_sent; |
2427 | goto probe_sent; | 2483 | goto rearm_timer; |
2428 | goto rearm_timer; | ||
2429 | } | ||
2430 | skb = tcp_write_queue_prev(sk, skb); | ||
2431 | } else { | ||
2432 | skb = tcp_write_queue_tail(sk); | ||
2433 | } | 2484 | } |
2485 | skb = skb_rb_last(&sk->tcp_rtx_queue); | ||
2434 | 2486 | ||
2435 | /* At most one outstanding TLP retransmission. */ | 2487 | /* At most one outstanding TLP retransmission. */ |
2436 | if (tp->tlp_high_seq) | 2488 | if (tp->tlp_high_seq) |
@@ -2448,10 +2500,11 @@ void tcp_send_loss_probe(struct sock *sk) | |||
2448 | goto rearm_timer; | 2500 | goto rearm_timer; |
2449 | 2501 | ||
2450 | if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { | 2502 | if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { |
2451 | if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, | 2503 | if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
2504 | (pcount - 1) * mss, mss, | ||
2452 | GFP_ATOMIC))) | 2505 | GFP_ATOMIC))) |
2453 | goto rearm_timer; | 2506 | goto rearm_timer; |
2454 | skb = tcp_write_queue_next(sk, skb); | 2507 | skb = skb_rb_next(skb); |
2455 | } | 2508 | } |
2456 | 2509 | ||
2457 | if (WARN_ON(!skb || !tcp_skb_pcount(skb))) | 2510 | if (WARN_ON(!skb || !tcp_skb_pcount(skb))) |
@@ -2651,7 +2704,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, | |||
2651 | static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | 2704 | static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) |
2652 | { | 2705 | { |
2653 | struct tcp_sock *tp = tcp_sk(sk); | 2706 | struct tcp_sock *tp = tcp_sk(sk); |
2654 | struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); | 2707 | struct sk_buff *next_skb = skb_rb_next(skb); |
2655 | int skb_size, next_skb_size; | 2708 | int skb_size, next_skb_size; |
2656 | 2709 | ||
2657 | skb_size = skb->len; | 2710 | skb_size = skb->len; |
@@ -2668,8 +2721,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
2668 | } | 2721 | } |
2669 | tcp_highest_sack_replace(sk, next_skb, skb); | 2722 | tcp_highest_sack_replace(sk, next_skb, skb); |
2670 | 2723 | ||
2671 | tcp_unlink_write_queue(next_skb, sk); | ||
2672 | |||
2673 | if (next_skb->ip_summed == CHECKSUM_PARTIAL) | 2724 | if (next_skb->ip_summed == CHECKSUM_PARTIAL) |
2674 | skb->ip_summed = CHECKSUM_PARTIAL; | 2725 | skb->ip_summed = CHECKSUM_PARTIAL; |
2675 | 2726 | ||
@@ -2697,7 +2748,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
2697 | 2748 | ||
2698 | tcp_skb_collapse_tstamp(skb, next_skb); | 2749 | tcp_skb_collapse_tstamp(skb, next_skb); |
2699 | 2750 | ||
2700 | sk_wmem_free_skb(sk, next_skb); | 2751 | tcp_rtx_queue_unlink_and_free(next_skb, sk); |
2701 | return true; | 2752 | return true; |
2702 | } | 2753 | } |
2703 | 2754 | ||
@@ -2708,8 +2759,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) | |||
2708 | return false; | 2759 | return false; |
2709 | if (skb_cloned(skb)) | 2760 | if (skb_cloned(skb)) |
2710 | return false; | 2761 | return false; |
2711 | if (skb == tcp_send_head(sk)) | ||
2712 | return false; | ||
2713 | /* Some heuristics for collapsing over SACK'd could be invented */ | 2762 | /* Some heuristics for collapsing over SACK'd could be invented */ |
2714 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) | 2763 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
2715 | return false; | 2764 | return false; |
@@ -2727,12 +2776,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
2727 | struct sk_buff *skb = to, *tmp; | 2776 | struct sk_buff *skb = to, *tmp; |
2728 | bool first = true; | 2777 | bool first = true; |
2729 | 2778 | ||
2730 | if (!sysctl_tcp_retrans_collapse) | 2779 | if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) |
2731 | return; | 2780 | return; |
2732 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) | 2781 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) |
2733 | return; | 2782 | return; |
2734 | 2783 | ||
2735 | tcp_for_write_queue_from_safe(skb, tmp, sk) { | 2784 | skb_rbtree_walk_from_safe(skb, tmp) { |
2736 | if (!tcp_can_collapse(sk, skb)) | 2785 | if (!tcp_can_collapse(sk, skb)) |
2737 | break; | 2786 | break; |
2738 | 2787 | ||
@@ -2807,7 +2856,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
2807 | 2856 | ||
2808 | len = cur_mss * segs; | 2857 | len = cur_mss * segs; |
2809 | if (skb->len > len) { | 2858 | if (skb->len > len) { |
2810 | if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) | 2859 | if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, |
2860 | cur_mss, GFP_ATOMIC)) | ||
2811 | return -ENOMEM; /* We'll try again later. */ | 2861 | return -ENOMEM; /* We'll try again later. */ |
2812 | } else { | 2862 | } else { |
2813 | if (skb_unclone(skb, GFP_ATOMIC)) | 2863 | if (skb_unclone(skb, GFP_ATOMIC)) |
@@ -2841,11 +2891,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
2841 | skb_headroom(skb) >= 0xFFFF)) { | 2891 | skb_headroom(skb) >= 0xFFFF)) { |
2842 | struct sk_buff *nskb; | 2892 | struct sk_buff *nskb; |
2843 | 2893 | ||
2844 | nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); | 2894 | tcp_skb_tsorted_save(skb) { |
2845 | err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : | 2895 | nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); |
2846 | -ENOBUFS; | 2896 | err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : |
2897 | -ENOBUFS; | ||
2898 | } tcp_skb_tsorted_restore(skb); | ||
2899 | |||
2847 | if (!err) { | 2900 | if (!err) { |
2848 | skb->skb_mstamp = tp->tcp_mstamp; | 2901 | tcp_update_skb_after_send(tp, skb); |
2849 | tcp_rate_skb_sent(sk, skb); | 2902 | tcp_rate_skb_sent(sk, skb); |
2850 | } | 2903 | } |
2851 | } else { | 2904 | } else { |
@@ -2854,6 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
2854 | 2907 | ||
2855 | if (likely(!err)) { | 2908 | if (likely(!err)) { |
2856 | TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; | 2909 | TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; |
2910 | trace_tcp_retransmit_skb(sk, skb); | ||
2857 | } else if (err != -EBUSY) { | 2911 | } else if (err != -EBUSY) { |
2858 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); | 2912 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); |
2859 | } | 2913 | } |
@@ -2890,36 +2944,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) | |||
2890 | * retransmitted data is acknowledged. It tries to continue | 2944 | * retransmitted data is acknowledged. It tries to continue |
2891 | * resending the rest of the retransmit queue, until either | 2945 | * resending the rest of the retransmit queue, until either |
2892 | * we've sent it all or the congestion window limit is reached. | 2946 | * we've sent it all or the congestion window limit is reached. |
2893 | * If doing SACK, the first ACK which comes back for a timeout | ||
2894 | * based retransmit packet might feed us FACK information again. | ||
2895 | * If so, we use it to avoid unnecessarily retransmissions. | ||
2896 | */ | 2947 | */ |
2897 | void tcp_xmit_retransmit_queue(struct sock *sk) | 2948 | void tcp_xmit_retransmit_queue(struct sock *sk) |
2898 | { | 2949 | { |
2899 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2950 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2951 | struct sk_buff *skb, *rtx_head, *hole = NULL; | ||
2900 | struct tcp_sock *tp = tcp_sk(sk); | 2952 | struct tcp_sock *tp = tcp_sk(sk); |
2901 | struct sk_buff *skb; | ||
2902 | struct sk_buff *hole = NULL; | ||
2903 | u32 max_segs; | 2953 | u32 max_segs; |
2904 | int mib_idx; | 2954 | int mib_idx; |
2905 | 2955 | ||
2906 | if (!tp->packets_out) | 2956 | if (!tp->packets_out) |
2907 | return; | 2957 | return; |
2908 | 2958 | ||
2909 | if (tp->retransmit_skb_hint) { | 2959 | rtx_head = tcp_rtx_queue_head(sk); |
2910 | skb = tp->retransmit_skb_hint; | 2960 | skb = tp->retransmit_skb_hint ?: rtx_head; |
2911 | } else { | ||
2912 | skb = tcp_write_queue_head(sk); | ||
2913 | } | ||
2914 | |||
2915 | max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); | 2961 | max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); |
2916 | tcp_for_write_queue_from(skb, sk) { | 2962 | skb_rbtree_walk_from(skb) { |
2917 | __u8 sacked; | 2963 | __u8 sacked; |
2918 | int segs; | 2964 | int segs; |
2919 | 2965 | ||
2920 | if (skb == tcp_send_head(sk)) | ||
2921 | break; | ||
2922 | |||
2923 | if (tcp_pacing_check(sk)) | 2966 | if (tcp_pacing_check(sk)) |
2924 | break; | 2967 | break; |
2925 | 2968 | ||
@@ -2964,7 +3007,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
2964 | if (tcp_in_cwnd_reduction(sk)) | 3007 | if (tcp_in_cwnd_reduction(sk)) |
2965 | tp->prr_out += tcp_skb_pcount(skb); | 3008 | tp->prr_out += tcp_skb_pcount(skb); |
2966 | 3009 | ||
2967 | if (skb == tcp_write_queue_head(sk) && | 3010 | if (skb == rtx_head && |
2968 | icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) | 3011 | icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) |
2969 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 3012 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
2970 | inet_csk(sk)->icsk_rto, | 3013 | inet_csk(sk)->icsk_rto, |
@@ -3006,12 +3049,15 @@ void tcp_send_fin(struct sock *sk) | |||
3006 | * Note: in the latter case, FIN packet will be sent after a timeout, | 3049 | * Note: in the latter case, FIN packet will be sent after a timeout, |
3007 | * as TCP stack thinks it has already been transmitted. | 3050 | * as TCP stack thinks it has already been transmitted. |
3008 | */ | 3051 | */ |
3009 | if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { | 3052 | if (!tskb && tcp_under_memory_pressure(sk)) |
3053 | tskb = skb_rb_last(&sk->tcp_rtx_queue); | ||
3054 | |||
3055 | if (tskb) { | ||
3010 | coalesce: | 3056 | coalesce: |
3011 | TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; | 3057 | TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; |
3012 | TCP_SKB_CB(tskb)->end_seq++; | 3058 | TCP_SKB_CB(tskb)->end_seq++; |
3013 | tp->write_seq++; | 3059 | tp->write_seq++; |
3014 | if (!tcp_send_head(sk)) { | 3060 | if (tcp_write_queue_empty(sk)) { |
3015 | /* This means tskb was already sent. | 3061 | /* This means tskb was already sent. |
3016 | * Pretend we included the FIN on previous transmit. | 3062 | * Pretend we included the FIN on previous transmit. |
3017 | * We need to set tp->snd_nxt to the value it would have | 3063 | * We need to set tp->snd_nxt to the value it would have |
@@ -3028,6 +3074,7 @@ coalesce: | |||
3028 | goto coalesce; | 3074 | goto coalesce; |
3029 | return; | 3075 | return; |
3030 | } | 3076 | } |
3077 | INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); | ||
3031 | skb_reserve(skb, MAX_TCP_HEADER); | 3078 | skb_reserve(skb, MAX_TCP_HEADER); |
3032 | sk_forced_mem_schedule(sk, skb->truesize); | 3079 | sk_forced_mem_schedule(sk, skb->truesize); |
3033 | /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ | 3080 | /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ |
@@ -3064,6 +3111,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) | |||
3064 | /* Send it off. */ | 3111 | /* Send it off. */ |
3065 | if (tcp_transmit_skb(sk, skb, 0, priority)) | 3112 | if (tcp_transmit_skb(sk, skb, 0, priority)) |
3066 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); | 3113 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); |
3114 | |||
3115 | /* skb of trace_tcp_send_reset() keeps the skb that caused RST, | ||
3116 | * skb here is different to the troublesome skb, so use NULL | ||
3117 | */ | ||
3118 | trace_tcp_send_reset(sk, NULL); | ||
3067 | } | 3119 | } |
3068 | 3120 | ||
3069 | /* Send a crossed SYN-ACK during socket establishment. | 3121 | /* Send a crossed SYN-ACK during socket establishment. |
@@ -3076,20 +3128,24 @@ int tcp_send_synack(struct sock *sk) | |||
3076 | { | 3128 | { |
3077 | struct sk_buff *skb; | 3129 | struct sk_buff *skb; |
3078 | 3130 | ||
3079 | skb = tcp_write_queue_head(sk); | 3131 | skb = tcp_rtx_queue_head(sk); |
3080 | if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { | 3132 | if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { |
3081 | pr_debug("%s: wrong queue state\n", __func__); | 3133 | pr_err("%s: wrong queue state\n", __func__); |
3082 | return -EFAULT; | 3134 | return -EFAULT; |
3083 | } | 3135 | } |
3084 | if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { | 3136 | if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { |
3085 | if (skb_cloned(skb)) { | 3137 | if (skb_cloned(skb)) { |
3086 | struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); | 3138 | struct sk_buff *nskb; |
3139 | |||
3140 | tcp_skb_tsorted_save(skb) { | ||
3141 | nskb = skb_copy(skb, GFP_ATOMIC); | ||
3142 | } tcp_skb_tsorted_restore(skb); | ||
3087 | if (!nskb) | 3143 | if (!nskb) |
3088 | return -ENOMEM; | 3144 | return -ENOMEM; |
3089 | tcp_unlink_write_queue(skb, sk); | 3145 | INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); |
3146 | tcp_rtx_queue_unlink_and_free(skb, sk); | ||
3090 | __skb_header_release(nskb); | 3147 | __skb_header_release(nskb); |
3091 | __tcp_add_write_queue_head(sk, nskb); | 3148 | tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); |
3092 | sk_wmem_free_skb(sk, skb); | ||
3093 | sk->sk_wmem_queued += nskb->truesize; | 3149 | sk->sk_wmem_queued += nskb->truesize; |
3094 | sk_mem_charge(sk, nskb->truesize); | 3150 | sk_mem_charge(sk, nskb->truesize); |
3095 | skb = nskb; | 3151 | skb = nskb; |
@@ -3166,8 +3222,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, | |||
3166 | md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); | 3222 | md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); |
3167 | #endif | 3223 | #endif |
3168 | skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); | 3224 | skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); |
3169 | tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + | 3225 | tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, |
3170 | sizeof(*th); | 3226 | foc) + sizeof(*th); |
3171 | 3227 | ||
3172 | skb_push(skb, tcp_header_size); | 3228 | skb_push(skb, tcp_header_size); |
3173 | skb_reset_transport_header(skb); | 3229 | skb_reset_transport_header(skb); |
@@ -3268,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) | |||
3268 | if (rcv_wnd == 0) | 3324 | if (rcv_wnd == 0) |
3269 | rcv_wnd = dst_metric(dst, RTAX_INITRWND); | 3325 | rcv_wnd = dst_metric(dst, RTAX_INITRWND); |
3270 | 3326 | ||
3271 | tcp_select_initial_window(tcp_full_space(sk), | 3327 | tcp_select_initial_window(sk, tcp_full_space(sk), |
3272 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 3328 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
3273 | &tp->rcv_wnd, | 3329 | &tp->rcv_wnd, |
3274 | &tp->window_clamp, | 3330 | &tp->window_clamp, |
@@ -3307,7 +3363,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) | |||
3307 | 3363 | ||
3308 | tcb->end_seq += skb->len; | 3364 | tcb->end_seq += skb->len; |
3309 | __skb_header_release(skb); | 3365 | __skb_header_release(skb); |
3310 | __tcp_add_write_queue_tail(sk, skb); | ||
3311 | sk->sk_wmem_queued += skb->truesize; | 3366 | sk->sk_wmem_queued += skb->truesize; |
3312 | sk_mem_charge(sk, skb->truesize); | 3367 | sk_mem_charge(sk, skb->truesize); |
3313 | tp->write_seq = tcb->end_seq; | 3368 | tp->write_seq = tcb->end_seq; |
@@ -3355,6 +3410,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
3355 | int copied = copy_from_iter(skb_put(syn_data, space), space, | 3410 | int copied = copy_from_iter(skb_put(syn_data, space), space, |
3356 | &fo->data->msg_iter); | 3411 | &fo->data->msg_iter); |
3357 | if (unlikely(!copied)) { | 3412 | if (unlikely(!copied)) { |
3413 | tcp_skb_tsorted_anchor_cleanup(syn_data); | ||
3358 | kfree_skb(syn_data); | 3414 | kfree_skb(syn_data); |
3359 | goto fallback; | 3415 | goto fallback; |
3360 | } | 3416 | } |
@@ -3385,12 +3441,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
3385 | TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; | 3441 | TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; |
3386 | if (!err) { | 3442 | if (!err) { |
3387 | tp->syn_data = (fo->copied > 0); | 3443 | tp->syn_data = (fo->copied > 0); |
3444 | tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); | ||
3388 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); | 3445 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); |
3389 | goto done; | 3446 | goto done; |
3390 | } | 3447 | } |
3391 | 3448 | ||
3392 | /* data was not sent, this is our new send_head */ | 3449 | /* data was not sent, put it in write_queue */ |
3393 | sk->sk_send_head = syn_data; | 3450 | __skb_queue_tail(&sk->sk_write_queue, syn_data); |
3394 | tp->packets_out -= tcp_skb_pcount(syn_data); | 3451 | tp->packets_out -= tcp_skb_pcount(syn_data); |
3395 | 3452 | ||
3396 | fallback: | 3453 | fallback: |
@@ -3433,6 +3490,7 @@ int tcp_connect(struct sock *sk) | |||
3433 | tp->retrans_stamp = tcp_time_stamp(tp); | 3490 | tp->retrans_stamp = tcp_time_stamp(tp); |
3434 | tcp_connect_queue_skb(sk, buff); | 3491 | tcp_connect_queue_skb(sk, buff); |
3435 | tcp_ecn_send_syn(sk, buff); | 3492 | tcp_ecn_send_syn(sk, buff); |
3493 | tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); | ||
3436 | 3494 | ||
3437 | /* Send off SYN; include data in Fast Open. */ | 3495 | /* Send off SYN; include data in Fast Open. */ |
3438 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : | 3496 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : |
@@ -3627,7 +3685,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) | |||
3627 | skb->len > mss) { | 3685 | skb->len > mss) { |
3628 | seg_size = min(seg_size, mss); | 3686 | seg_size = min(seg_size, mss); |
3629 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; | 3687 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; |
3630 | if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) | 3688 | if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, |
3689 | skb, seg_size, mss, GFP_ATOMIC)) | ||
3631 | return -1; | 3690 | return -1; |
3632 | } else if (!tcp_skb_pcount(skb)) | 3691 | } else if (!tcp_skb_pcount(skb)) |
3633 | tcp_set_skb_tso_segs(skb, mss); | 3692 | tcp_set_skb_tso_segs(skb, mss); |
@@ -3657,7 +3716,7 @@ void tcp_send_probe0(struct sock *sk) | |||
3657 | 3716 | ||
3658 | err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); | 3717 | err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); |
3659 | 3718 | ||
3660 | if (tp->packets_out || !tcp_send_head(sk)) { | 3719 | if (tp->packets_out || tcp_write_queue_empty(sk)) { |
3661 | /* Cancel probe timer, if it is not required. */ | 3720 | /* Cancel probe timer, if it is not required. */ |
3662 | icsk->icsk_probes_out = 0; | 3721 | icsk->icsk_probes_out = 0; |
3663 | icsk->icsk_backoff = 0; | 3722 | icsk->icsk_backoff = 0; |
@@ -3698,6 +3757,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) | |||
3698 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); | 3757 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); |
3699 | if (unlikely(tcp_passive_fastopen(sk))) | 3758 | if (unlikely(tcp_passive_fastopen(sk))) |
3700 | tcp_sk(sk)->total_retrans++; | 3759 | tcp_sk(sk)->total_retrans++; |
3760 | trace_tcp_retransmit_synack(sk, req); | ||
3701 | } | 3761 | } |
3702 | return res; | 3762 | return res; |
3703 | } | 3763 | } |
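
The common thread in this diff is the retransmit-queue rework: once a segment is sent, tcp_event_new_data_sent() now unlinks it from sk->sk_write_queue and inserts it into the new sk->tcp_rtx_queue rbtree, and helpers such as tcp_fragment(), tso_fragment() and tcp_insert_write_queue_after() gain an enum tcp_queue argument (TCP_FRAG_IN_WRITE_QUEUE / TCP_FRAG_IN_RTX_QUEUE) so they know which structure to relink a freshly split buffer into. The standalone C sketch below models only that bookkeeping under simplified assumptions: plain FIFO and sorted singly-linked lists stand in for the kernel's skb queue and rbtree, and all names here (struct sock_model, send_head(), ...) are illustrative, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

/* One queued segment; stands in for a struct sk_buff. */
struct seg {
	unsigned int seq;   /* start sequence number */
	unsigned int len;   /* payload length */
	struct seg *next;
};

/* Stand-in for the socket: unsent data vs. sent-but-unacked data. */
struct sock_model {
	struct seg *write_q; /* FIFO, like sk->sk_write_queue */
	struct seg *rtx_q;   /* seq-ordered; an rbtree (sk->tcp_rtx_queue) in the kernel */
};

/* Queue new data at the tail of the write queue. */
static void write_queue_tail(struct sock_model *sk, struct seg *s)
{
	struct seg **pp = &sk->write_q;

	while (*pp)
		pp = &(*pp)->next;
	s->next = NULL;
	*pp = s;
}

/* Keep the retransmit queue sorted by start sequence so retransmit/SACK
 * processing can walk it in order (tcp_rbtree_insert() in the diff).
 */
static void rtx_queue_insert(struct sock_model *sk, struct seg *s)
{
	struct seg **pp = &sk->rtx_q;

	while (*pp && (*pp)->seq < s->seq)
		pp = &(*pp)->next;
	s->next = *pp;
	*pp = s;
}

/* "Send" the head of the write queue: unlink it and move it to the
 * retransmit queue, mirroring what tcp_event_new_data_sent() now does
 * with __skb_unlink() + tcp_rbtree_insert().
 */
static struct seg *send_head(struct sock_model *sk)
{
	struct seg *s = sk->write_q;

	if (!s)
		return NULL;
	sk->write_q = s->next;   /* __skb_unlink() from the write queue */
	rtx_queue_insert(sk, s); /* lives on the rtx queue until acked */
	return s;
}

int main(void)
{
	struct sock_model sk = { NULL, NULL };
	unsigned int seq = 1000;
	struct seg *s;
	int i;

	for (i = 0; i < 3; i++) {
		s = malloc(sizeof(*s));
		if (!s)
			return 1;
		s->seq = seq;
		s->len = 1448;
		seq += s->len;
		write_queue_tail(&sk, s);
	}

	send_head(&sk); /* two segments "in flight", one still unsent */
	send_head(&sk);

	for (s = sk.rtx_q; s; s = s->next)
		printf("rtx:    seq %u len %u\n", s->seq, s->len);
	for (s = sk.write_q; s; s = s->next)
		printf("unsent: seq %u len %u\n", s->seq, s->len);
	return 0;
}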