path: root/net/ipv4/tcp_input.c
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	597
1 file changed, 266 insertions(+), 331 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 887585045b27..dabbf1d392fb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -76,25 +76,10 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
+#include <trace/events/tcp.h>
+#include <linux/static_key.h>
 
-int sysctl_tcp_fack __read_mostly;
-int sysctl_tcp_max_reordering __read_mostly = 300;
-int sysctl_tcp_dsack __read_mostly = 1;
-int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-
-/* rfc5961 challenge ack rate limiting */
-int sysctl_tcp_challenge_ack_limit = 1000;
-
-int sysctl_tcp_stdurg __read_mostly;
-int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_early_retrans __read_mostly = 3;
-int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update. */
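The sysctls removed above do not disappear; they move into struct netns_ipv4 and are read through sock_net(sk), as the later hunks show. A minimal sketch of that access pattern, assuming only fields this diff itself references (the wrapper name is illustrative, not a kernel helper):

static inline int tcp_max_reordering(const struct sock *sk)
{
	/* each network namespace now carries its own copy of the knob */
	return sock_net(sk)->ipv4.sysctl_tcp_max_reordering;
}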
@@ -335,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 	sndmem *= nr_segs * per_mss;
 
 	if (sk->sk_sndbuf < sndmem)
-		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -368,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
-	int truesize = tcp_win_from_space(skb->truesize) >> 1;
-	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
+	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
+	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
@@ -394,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 	/* Check #2. Increase window, if skb with such overhead
 	 * will fit to rcvbuf in future.
 	 */
-	if (tcp_win_from_space(skb->truesize) <= skb->len)
+	if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
 		incr = 2 * tp->advmss;
 	else
 		incr = __tcp_grow_window(sk, skb);
@@ -420,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
 	 * Allow enough cushion so that sender is not limited by our window
 	 */
-	if (sysctl_tcp_moderate_rcvbuf)
+	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
 		rcvmem <<= 2;
 
 	if (sk->sk_rcvbuf < rcvmem)
-		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+		sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 }
 
 /* 4. Try to fixup all. It is made immediately after connection enters
@@ -432,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
  */
 void tcp_init_buffer_space(struct sock *sk)
 {
+	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
 
@@ -450,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (tp->window_clamp >= maxwin) {
 		tp->window_clamp = maxwin;
 
-		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+		if (tcp_app_win && maxwin > 4 * tp->advmss)
 			tp->window_clamp = max(maxwin -
-					       (maxwin >> sysctl_tcp_app_win),
+					       (maxwin >> tcp_app_win),
 					       4 * tp->advmss);
 	}
 
 	/* Force reservation of one segment. */
-	if (sysctl_tcp_app_win &&
+	if (tcp_app_win &&
 	    tp->window_clamp > 2 * tp->advmss &&
 	    tp->window_clamp + tp->advmss > maxwin)
 		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
@@ -471,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
 
 	icsk->icsk_ack.quick = 0;
 
-	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-				    sysctl_tcp_rmem[2]);
+				    net->ipv4.sysctl_tcp_rmem[2]);
 	}
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -610,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	 * <prev RTT . ><current RTT .. ><next RTT .... >
 	 */
 
-	if (sysctl_tcp_moderate_rcvbuf &&
+	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
 		int rcvwin, rcvmem, rcvbuf;
 
@@ -634,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk)
 		}
 
 		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-		while (tcp_win_from_space(rcvmem) < tp->advmss)
+		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
 			rcvmem += 128;
 
-		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem,
+			     sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 		if (rcvbuf > sk->sk_rcvbuf) {
 			sk->sk_rcvbuf = rcvbuf;
 
@@ -781,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 	tp->srtt_us = max(1U, srtt);
 }
 
-/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
- * Note: TCP stack does not yet implement pacing.
- * FQ packet scheduler can be used to implement cheap but effective
- * TCP pacing, to smooth the burst on large writes when packets
- * in flight is significantly lower than cwnd (or rwin)
- */
-int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
-int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
-
 static void tcp_update_pacing_rate(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -807,9 +786,9 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	 * end of slow start and should slow down.
 	 */
 	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
-		rate *= sysctl_tcp_pacing_ss_ratio;
+		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
 	else
-		rate *= sysctl_tcp_pacing_ca_ratio;
+		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
 
 	rate *= max(tp->snd_cwnd, tp->packets_out);
 
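With the pacing ratios now per namespace, the scaling itself is unchanged: the raw cwnd/RTT bandwidth estimate is multiplied by 200% while in slow start and 120% afterwards. A hedged, standalone approximation of that scaling (the in-kernel computation also scales by max(cwnd, packets_out) and clamps to sk_max_pacing_rate):

#include <stdint.h>

static uint64_t approx_pacing_rate(uint32_t mss, uint32_t cwnd,
				   uint32_t srtt_us, unsigned int ratio_pct)
{
	uint64_t bytes_per_sec;

	if (!srtt_us)
		return 0;
	/* rough bandwidth estimate: one cwnd of data per smoothed RTT */
	bytes_per_sec = (uint64_t)mss * cwnd * 1000000 / srtt_us;
	return bytes_per_sec * ratio_pct / 100;	/* 200 in slow start, 120 after */
}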
@@ -863,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/*
- * Packet counting of FACK is based on in-order assumptions, therefore TCP
- * disables it when reordering is detected
- */
-void tcp_disable_fack(struct tcp_sock *tp)
-{
-	/* RFC3517 uses different metric in lost marker => reset on change */
-	if (tcp_is_fack(tp))
-		tp->lost_skb_hint = NULL;
-	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
-}
-
 /* Take a notice that peer is sending D-SACKs */
 static void tcp_dsack_seen(struct tcp_sock *tp)
 {
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+	tp->rack.dsack_seen = 1;
 }
 
-static void tcp_update_reordering(struct sock *sk, const int metric,
-				  const int ts)
+/* It's reordering when higher sequence was delivered (i.e. sacked) before
+ * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
+ * distance is approximated in full-mss packet distance ("reordering").
+ */
+static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
+				      const int ts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mib_idx;
+	const u32 mss = tp->mss_cache;
+	u32 fack, metric;
 
-	if (WARN_ON_ONCE(metric < 0))
+	fack = tcp_highest_sack_seq(tp);
+	if (!before(low_seq, fack))
 		return;
 
-	if (metric > tp->reordering) {
-		tp->reordering = min(sysctl_tcp_max_reordering, metric);
-
+	metric = fack - low_seq;
+	if ((metric > tp->reordering * mss) && mss) {
 #if FASTRETRANS_DEBUG > 1
 		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
 			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
 			 tp->reordering,
-			 tp->fackets_out,
+			 0,
 			 tp->sacked_out,
 			 tp->undo_marker ? tp->undo_retrans : 0);
 #endif
-		tcp_disable_fack(tp);
+		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
+				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
 	}
 
 	tp->rack.reord = 1;
-
 	/* This exciting event is worth to be remembered. 8) */
-	if (ts)
-		mib_idx = LINUX_MIB_TCPTSREORDER;
-	else if (tcp_is_reno(tp))
-		mib_idx = LINUX_MIB_TCPRENOREORDER;
-	else if (tcp_is_fack(tp))
-		mib_idx = LINUX_MIB_TCPFACKREORDER;
-	else
-		mib_idx = LINUX_MIB_TCPSACKREORDER;
-
-	NET_INC_STATS(sock_net(sk), mib_idx);
+	NET_INC_STATS(sock_net(sk),
+		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
 }
 
 /* This must be called before lost_out is incremented */
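tcp_check_sack_reordering() above replaces the old packet-count metric with a sequence-distance one: the gap between the highest SACKed sequence and the lowest never-retransmitted sequence below it, rounded up to whole MSS-sized packets and capped by tcp_max_reordering. A self-contained sketch of that arithmetic, using the usual wraparound-safe sequence compare (not the kernel function itself):

#include <stdint.h>

static inline int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* wraparound-safe, like before() */
}

/* Reordering degree in packets, 0 if nothing was delivered out of order. */
static uint32_t sack_reordering_degree(uint32_t highest_sacked,
				       uint32_t low_seq, uint32_t mss)
{
	uint32_t metric;

	if (!mss || !seq_before(low_seq, highest_sacked))
		return 0;
	metric = highest_sacked - low_seq;	/* distance in bytes */
	return (metric + mss - 1) / mss;	/* round up to full packets */
}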
@@ -990,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
  * 3. Loss detection event of two flavors:
  *	A. Scoreboard estimator decided the packet is lost.
  *	   A'. Reno "three dupacks" marks head of queue lost.
- *	   A''. Its FACK modification, head until snd.fack is lost.
  *	B. SACK arrives sacking SND.NXT at the moment, when the
  *	   segment was retransmitted.
  * 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1133,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 }
 
 struct tcp_sacktag_state {
-	int	reord;
-	int	fack_count;
+	u32	reord;
 	/* Timestamps for earliest and latest never-retransmitted segment
 	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
 	 * but congestion control should still get an accurate delay signal.
@@ -1143,6 +1106,7 @@ struct tcp_sacktag_state {
 	u64	last_sackt;
 	struct rate_sample *rate;
 	int	flag;
+	unsigned int mss_now;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1192,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 		if (pkt_len >= skb->len && !in_sack)
 			return 0;
 
-		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+		err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				   pkt_len, mss, GFP_ATOMIC);
 		if (err < 0)
 			return err;
 	}
@@ -1208,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk,
 			  u64 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int fack_count = state->fack_count;
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
 		if (tp->undo_marker && tp->undo_retrans > 0 &&
 		    after(end_seq, tp->undo_marker))
 			tp->undo_retrans--;
-		if (sacked & TCPCB_SACKED_ACKED)
-			state->reord = min(fack_count, state->reord);
+		if ((sacked & TCPCB_SACKED_ACKED) &&
+		    before(start_seq, state->reord))
+			state->reord = start_seq;
 	}
 
 	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
@@ -1242,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
 			 * which was in hole. It is reordering.
 			 */
 			if (before(start_seq,
-				   tcp_highest_sack_seq(tp)))
-				state->reord = min(fack_count,
-						   state->reord);
+				   tcp_highest_sack_seq(tp)) &&
+			    before(start_seq, state->reord))
+				state->reord = start_seq;
+
 			if (!after(end_seq, tp->high_seq))
 				state->flag |= FLAG_ORIG_SACK_ACKED;
 			if (state->first_sackt == 0)
@@ -1263,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		tp->sacked_out += pcount;
 		tp->delivered += pcount;	/* Out-of-order packets delivered */
 
-		fack_count += pcount;
-
 		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
-		if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+		if (tp->lost_skb_hint &&
 		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
 			tp->lost_cnt_hint += pcount;
-
-		if (fack_count > tp->fackets_out)
-			tp->fackets_out = fack_count;
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1289,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
 /* Shift newly-SACKed bytes from this skb to the immediately previous
  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
  */
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			    struct sk_buff *skb,
 			    struct tcp_sacktag_state *state,
 			    unsigned int pcount, int shifted, int mss,
 			    bool dup_sack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
 	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
 	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */
 
@@ -1364,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
 		TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
-	tcp_unlink_write_queue(skb, sk);
-	sk_wmem_free_skb(sk, skb);
+	tcp_rtx_queue_unlink_and_free(skb, sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
 
@@ -1415,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		goto fallback;
 
 	/* Can only happen with delayed DSACK + discard craziness */
-	if (unlikely(skb == tcp_write_queue_head(sk)))
+	prev = skb_rb_prev(skb);
+	if (!prev)
 		goto fallback;
-	prev = tcp_write_queue_prev(sk, skb);
 
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
@@ -1496,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 
 	if (!skb_shift(prev, skb, len))
 		goto fallback;
-	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
 		goto out;
 
 	/* Hole filled allows collapsing with the next as well, this is very
 	 * useful when hole on every nth skb pattern happens
 	 */
-	if (prev == tcp_write_queue_tail(sk))
+	skb = skb_rb_next(prev);
+	if (!skb)
 		goto out;
-	skb = tcp_write_queue_next(sk, prev);
 
 	if (!skb_can_shift(skb) ||
-	    (skb == tcp_send_head(sk)) ||
 	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
 	    (mss != tcp_skb_seglen(skb)))
 		goto out;
@@ -1515,11 +1474,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	len = skb->len;
 	if (skb_shift(prev, skb, len)) {
 		pcount += tcp_skb_pcount(skb);
-		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+		tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+				len, mss, 0);
 	}
 
 out:
-	state->fack_count += pcount;
 	return prev;
 
 noop:
@@ -1539,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *tmp;
 
-	tcp_for_write_queue_from(skb, sk) {
+	skb_rbtree_walk_from(skb) {
 		int in_sack = 0;
 		bool dup_sack = dup_sack_in;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
 		/* queue is in-order => we can short-circuit the walk early */
 		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
 			break;
@@ -1594,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 					tcp_skb_pcount(skb),
 					skb->skb_mstamp);
 			tcp_rate_skb_delivered(sk, skb, state->rate);
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+				list_del_init(&skb->tcp_tsorted_anchor);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
 				tcp_advance_highest_sack(sk, skb);
 		}
-
-		state->fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
 }
 
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+					   struct tcp_sacktag_state *state,
+					   u32 seq)
+{
+	struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+	struct sk_buff *skb;
+
+	while (*p) {
+		parent = *p;
+		skb = rb_to_skb(parent);
+		if (before(seq, TCP_SKB_CB(skb)->seq)) {
+			p = &parent->rb_left;
+			continue;
+		}
+		if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+			p = &parent->rb_right;
+			continue;
+		}
+		return skb;
+	}
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sacktag_state *state,
 					u32 skip_to_seq)
 {
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-
-		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
-			break;
+	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+		return skb;
 
-		state->fack_count += tcp_skb_pcount(skb);
-	}
-	return skb;
+	return tcp_sacktag_bsearch(sk, state, skip_to_seq);
 }
 
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
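tcp_sacktag_bsearch() descends the new retransmit rb-tree, which is keyed by start sequence, and returns the skb whose [seq, end_seq) range contains the target; tcp_sacktag_skip() now uses it instead of walking the whole write queue. The same descend-left/descend-right decision, sketched over a plain sorted array rather than rb_node pointers (illustrative analogue only, not kernel code):

#include <stddef.h>
#include <stdint.h>

struct seq_range {
	uint32_t seq;		/* first byte covered */
	uint32_t end_seq;	/* one past the last byte covered */
};

static inline int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Find the range containing @seq in an array sorted by ->seq, or NULL. */
static const struct seq_range *range_bsearch(const struct seq_range *r,
					     size_t n, uint32_t seq)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (seq_before(seq, r[mid].seq))
			hi = mid;		/* target is to the left */
		else if (!seq_before(seq, r[mid].end_seq))
			lo = mid + 1;		/* target is to the right */
		else
			return &r[mid];		/* seq falls inside this range */
	}
	return NULL;
}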
@@ -1666,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	int first_sack_index;
 
 	state->flag = 0;
-	state->reord = tp->packets_out;
+	state->reord = tp->snd_nxt;
 
-	if (!tp->sacked_out) {
-		if (WARN_ON(tp->fackets_out))
-			tp->fackets_out = 0;
+	if (!tp->sacked_out)
 		tcp_highest_sack_reset(sk);
-	}
 
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
 					 num_sacks, prior_snd_una);
@@ -1743,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		}
 	}
 
-	skb = tcp_write_queue_head(sk);
-	state->fack_count = 0;
+	state->mss_now = tcp_current_mss(sk);
+	skb = NULL;
 	i = 0;
 
 	if (!tp->sacked_out) {
@@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 			skb = tcp_highest_sack(sk);
 			if (!skb)
 				break;
-			state->fack_count = tp->fackets_out;
 			cache++;
 			goto walk;
 		}
@@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 			skb = tcp_highest_sack(sk);
 			if (!skb)
 				break;
-			state->fack_count = tp->fackets_out;
 		}
 		skb = tcp_sacktag_skip(skb, sk, state, start_seq);
 
@@ -1836,9 +1801,8 @@ advance_sp:
 	for (j = 0; j < used_sacks; j++)
 		tp->recv_sack_cache[i++] = sp[j];
 
-	if ((state->reord < tp->fackets_out) &&
-	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
-		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
+		tcp_check_sack_reordering(sk, state->reord, 0);
 
 	tcp_verify_left_out(tp);
 out:
@@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
 static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	if (tcp_limit_reno_sacked(tp))
-		tcp_update_reordering(sk, tp->packets_out + addend, 0);
+
+	if (!tcp_limit_reno_sacked(tp))
+		return;
+
+	tp->reordering = min_t(u32, tp->packets_out + addend,
+			       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
 }
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->lost_out = 0;
 	tp->undo_marker = 0;
 	tp->undo_retrans = -1;
-	tp->fackets_out = 0;
 	tp->sacked_out = 0;
 }
 
@@ -1968,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk)
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
 	if (is_reneg) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
-		tp->fackets_out = 0;
 	}
 	tcp_clear_all_retrans_hints(tp);
 
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
-
+	skb_rbtree_walk_from(skb) {
 		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 			     is_reneg);
 		if (mark_lost)
@@ -2014,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk)
 	 * falsely raise the receive window, which results in repeated
 	 * timeouts and stop-and-go behavior.
 	 */
-	tp->frto = sysctl_tcp_frto &&
+	tp->frto = net->ipv4.sysctl_tcp_frto &&
 		   (new_recovery || icsk->icsk_retransmits) &&
 		   !inet_csk(sk)->icsk_mtup.probe_size;
 }
@@ -2043,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 	return false;
 }
 
-static inline int tcp_fackets_out(const struct tcp_sock *tp)
-{
-	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
-}
-
 /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
  * counter when SACK is enabled (without SACK, sacked_out is used for
  * that purpose).
  *
- * Instead, with FACK TCP uses fackets_out that includes both SACKed
- * segments up to the highest received SACK block so far and holes in
- * between them.
- *
  * With reordering, holes may still be in flight, so RFC3517 recovery
  * uses pure sacked_out (total number of SACKed segments) even though
  * it violates the RFC that uses duplicate ACKs, often these are equal
@@ -2065,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
  */
 static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 {
-	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+	return tp->sacked_out + 1;
 }
 
-/* Linux NewReno/SACK/FACK/ECN state machine.
+/* Linux NewReno/SACK/ECN state machine.
  * --------------------------------------
  *
  * "Open"	Normal state, no dubious events, fast path.
@@ -2133,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
  * dynamically measured and adjusted. This is implemented in
  * tcp_rack_mark_lost.
  *
- * FACK (Disabled by default. Subsumbed by RACK):
- * It is the simplest heuristics. As soon as we decided
- * that something is lost, we decide that _all_ not SACKed
- * packets until the most forward SACK are lost. I.e.
- * lost_out = fackets_out - sacked_out and left_out = fackets_out.
- * It is absolutely correct estimate, if network does not reorder
- * packets. And it loses any connection to reality when reordering
- * takes place. We use FACK by default until reordering
- * is suspected on the path to this destination.
- *
  * If the receiver does not support SACK:
  *
  * NewReno (RFC6582): in Recovery we assume that one segment
@@ -2191,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 }
 
 /* Detect loss in event "A" above by marking head of queue up as lost.
- * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * For non-SACK(Reno) senders, the first "packets" number of segments
  * are considered lost. For RFC3517 SACK, a segment is considered lost if it
  * has at least tp->reordering SACKed seqments above it; "packets" refers to
  * the maximum SACKed segments to pass before reaching this limit.
@@ -2206,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 	const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
 
 	WARN_ON(packets > tp->packets_out);
-	if (tp->lost_skb_hint) {
-		skb = tp->lost_skb_hint;
-		cnt = tp->lost_cnt_hint;
+	skb = tp->lost_skb_hint;
+	if (skb) {
 		/* Head already handled? */
-		if (mark_head && skb != tcp_write_queue_head(sk))
+		if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
 			return;
+		cnt = tp->lost_cnt_hint;
 	} else {
-		skb = tcp_write_queue_head(sk);
+		skb = tcp_rtx_queue_head(sk);
 		cnt = 0;
 	}
 
-	tcp_for_write_queue_from(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
+	skb_rbtree_walk_from(skb) {
 		/* TODO: do this better */
 		/* this is not the most efficient way to do this... */
 		tp->lost_skb_hint = skb;
@@ -2229,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			break;
 
 		oldcnt = cnt;
-		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+		if (tcp_is_reno(tp) ||
 		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 			cnt += tcp_skb_pcount(skb);
 
 		if (cnt > packets) {
-			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			if (tcp_is_sack(tp) ||
 			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 			    (oldcnt >= packets))
 				break;
@@ -2243,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			/* If needed, chop off the prefix to mark as lost. */
 			lost = (packets - oldcnt) * mss;
 			if (lost < skb->len &&
-			    tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+			    tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+					 lost, mss, GFP_ATOMIC) < 0)
 				break;
 			cnt = packets;
 		}
@@ -2264,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 
 	if (tcp_is_reno(tp)) {
 		tcp_mark_head_lost(sk, 1, 1);
-	} else if (tcp_is_fack(tp)) {
-		int lost = tp->fackets_out - tp->reordering;
-		if (lost <= 0)
-			lost = 1;
-		tcp_mark_head_lost(sk, lost, 0);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
@@ -2327,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk)
 	if (tp->retrans_out)
 		return true;
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
 		return true;
 
 	return false;
 }
 
-#if FASTRETRANS_DEBUG > 1
 static void DBGUNDO(struct sock *sk, const char *msg)
 {
+#if FASTRETRANS_DEBUG > 1
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_sock *inet = inet_sk(sk);
 
@@ -2358,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 			 tp->packets_out);
 	}
 #endif
-}
-#else
-#define DBGUNDO(x...) do { } while (0)
 #endif
+}
 
 static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 {
@@ -2370,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 	if (unmark_loss) {
 		struct sk_buff *skb;
 
-		tcp_for_write_queue(skb, sk) {
-			if (skb == tcp_send_head(sk))
-				break;
+		skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 		}
 		tp->lost_out = 0;
@@ -2417,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 			mib_idx = LINUX_MIB_TCPFULLUNDO;
 
 		NET_INC_STATS(sock_net(sk), mib_idx);
+	} else if (tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_persist--;
 	}
 	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
 		/* Hold old state until something *above* high_seq
@@ -2436,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tp->undo_marker && !tp->undo_retrans) {
+		tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+					       tp->rack.reo_wnd_persist + 1);
 		DBGUNDO(sk, "D-SACK");
 		tcp_undo_cwnd_reduction(sk, false);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
@@ -2616,9 +2555,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	struct sk_buff *skb;
 	unsigned int mss = tcp_current_mss(sk);
 
-	tcp_for_write_queue(skb, sk) {
-		if (skb == tcp_send_head(sk))
-			break;
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
 		if (tcp_skb_seglen(skb) > mss &&
 		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2712,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 		 * is updated in tcp_ack()). Otherwise fall back to
 		 * the conventional recovery.
 		 */
-		if (tcp_send_head(sk) &&
+		if (!tcp_write_queue_empty(sk) &&
 		    after(tcp_wnd_end(tp), tp->snd_nxt)) {
 			*rexmit = REXMIT_NEW;
 			return;
@@ -2739,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
 }
 
 /* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tp->undo_marker && tcp_packet_delayed(tp)) {
 		/* Plain luck! Hole if filled with delayed
-		 * packet, rather than with a retransmit.
+		 * packet, rather than with a retransmit. Check reordering.
 		 */
-		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+		tcp_check_sack_reordering(sk, prior_snd_una, 1);
 
 		/* We are getting evidence that the reordering degree is higher
 		 * than we realized. If there are no retransmits out then we
@@ -2774,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2783,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
 	}
 }
 
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return after(tcp_highest_sack_seq(tp),
+		     tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
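tcp_force_fast_retransmit() is the sequence-based replacement for the old fackets_out > reordering test: fast retransmit is forced once the highest SACKed sequence runs more than reordering * mss_cache bytes ahead of snd_una. The predicate in isolation, with plain parameters standing in for the tcp_sock fields (a sketch, not the kernel function):

#include <stdint.h>

static inline int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;	/* a is later than b, modulo 2^32 */
}

static int force_fast_retransmit(uint32_t highest_sack_seq, uint32_t snd_una,
				 uint32_t reordering, uint32_t mss)
{
	return seq_after(highest_sack_seq, snd_una + reordering * mss);
}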
@@ -2795,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
  * It does _not_ decide what to send, it is made in function
  * tcp_xmit_retransmit_queue().
  */
-static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 				  bool is_dupack, int *ack_flag, int *rexmit)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fast_rexmit = 0, flag = *ack_flag;
 	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
-				     (tcp_fackets_out(tp) > tp->reordering));
+				     tcp_force_fast_retransmit(sk));
 
-	if (WARN_ON(!tp->packets_out && tp->sacked_out))
+	if (!tp->packets_out && tp->sacked_out)
 		tp->sacked_out = 0;
-	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
-		tp->fackets_out = 0;
 
 	/* Now state machine starts.
 	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
@@ -2854,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 			if (tcp_is_reno(tp) && is_dupack)
 				tcp_add_reno_sack(sk);
 		} else {
-			if (tcp_try_undo_partial(sk, acked))
+			if (tcp_try_undo_partial(sk, prior_snd_una))
 				return;
 			/* Partial ACK arrived. Force fast retransmit. */
 			do_lost = tcp_is_reno(tp) ||
-				  tcp_fackets_out(tp) > tp->reordering;
+				  tcp_force_fast_retransmit(sk);
 		}
 		if (tcp_try_undo_dsack(sk)) {
 			tcp_try_keep_open(sk);
@@ -2873,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
 		/* Change state if cwnd is undone or retransmits are lost */
+		/* fall through */
 	default:
 		if (tcp_is_reno(tp)) {
 			if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2913,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
 {
+	u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
 
 	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
 			   rtt_us ? : jiffies_to_usecs(1));
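tcp_update_rtt_min() keeps a minimum RTT over a window of sysctl_tcp_min_rtt_wlen seconds (converted to jiffies above). The kernel's minmax_running_min() is a windowed filter that keeps several samples; the windowing idea alone can be sketched with a simplified single-sample variant:

#include <stdint.h>

struct running_min {
	uint32_t val;		/* smallest sample inside the window */
	uint32_t stamp;		/* time that sample was taken */
};

/* Keep the new sample if it is at least as small, or if the stored
 * minimum has aged out of the window.
 */
static uint32_t running_min_update(struct running_min *m, uint32_t win,
				   uint32_t now, uint32_t sample)
{
	if (sample <= m->val || (uint32_t)(now - m->stamp) > win) {
		m->val = sample;
		m->stamp = now;
	}
	return m->val;
}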
@@ -3056,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 
 	shinfo = skb_shinfo(skb);
 	if (!before(shinfo->tskey, prior_snd_una) &&
-	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
-		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+		tcp_skb_tsorted_save(skb) {
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		} tcp_skb_tsorted_restore(skb);
+	}
 }
 
 /* Remove acknowledged frames from the retransmission queue. If our packet
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
  */
-static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una, int *acked,
+static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
+			       u32 prior_snd_una,
 			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u64 first_ackt, last_ackt;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
-	u32 reord = tp->packets_out;
+	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
+	struct sk_buff *skb, *next;
 	bool fully_acked = true;
 	long sack_rtt_us = -1L;
 	long seq_rtt_us = -1L;
 	long ca_rtt_us = -1L;
-	struct sk_buff *skb;
 	u32 pkts_acked = 0;
 	u32 last_in_flight = 0;
 	bool rtt_update;
@@ -3085,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 	first_ackt = 0;
 
-	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		const u32 start_seq = scb->seq;
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
@@ -3103,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				break;
 			fully_acked = false;
 		} else {
-			/* Speedup tcp_unlink_write_queue() and next loop */
-			prefetchw(skb->next);
 			acked_pcount = tcp_skb_pcount(skb);
 		}
 
@@ -3119,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				first_ackt = last_ackt;
 
 			last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
-			reord = min(pkts_acked, reord);
+			if (before(start_seq, reord))
+				reord = start_seq;
 			if (!after(scb->end_seq, tp->high_seq))
 				flag |= FLAG_ORIG_SACK_ACKED;
 		}
@@ -3156,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		if (!fully_acked)
 			break;
 
-		tcp_unlink_write_queue(skb, sk);
-		sk_wmem_free_skb(sk, skb);
+		next = skb_rb_next(skb);
 		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
 		if (unlikely(skb == tp->lost_skb_hint))
 			tp->lost_skb_hint = NULL;
+		tcp_rtx_queue_unlink_and_free(skb, sk);
 	}
 
 	if (!skb)
@@ -3197,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			int delta;
 
 			/* Non-retransmitted hole got filled? That's reordering */
-			if (reord < prior_fackets && reord <= tp->fackets_out)
-				tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+			if (before(reord, prior_fack))
+				tcp_check_sack_reordering(sk, reord, 0);
 
-			delta = tcp_is_fack(tp) ? pkts_acked :
-						  prior_sacked - tp->sacked_out;
+			delta = prior_sacked - tp->sacked_out;
 			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
 		}
-
-		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
-
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
 		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3247,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		}
 	}
 #endif
-	*acked = pkts_acked;
 	return flag;
 }
 
 static void tcp_ack_probe(struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *head = tcp_send_head(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Was it a usable window open? */
-
-	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+	if (!head)
+		return;
+	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
 		icsk->icsk_backoff = 0;
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
 		/* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3378,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 		tp->pred_flags = 0;
 		tcp_fast_path_check(sk);
 
-		if (tcp_send_head(sk))
+		if (!tcp_write_queue_empty(sk))
 			tcp_slow_start_after_idle_check(sk);
 
 		if (nwin > tp->max_window) {
@@ -3399,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
 	if (*last_oow_ack_time) {
 		s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
 
-		if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+		if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
 			NET_INC_STATS(net, mib_idx);
 			return true;	/* rate-limited: don't send yet! */
 		}
@@ -3435,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 	static u32 challenge_timestamp;
 	static unsigned int challenge_count;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	u32 count, now;
 
 	/* First check our per-socket dupack rate limit. */
-	if (__tcp_oow_rate_limited(sock_net(sk),
+	if (__tcp_oow_rate_limited(net,
 				   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
 				   &tp->last_oow_ack_time))
 		return;
@@ -3446,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 	/* Then check host-wide RFC 5961 rate limit. */
 	now = jiffies / HZ;
 	if (now != challenge_timestamp) {
-		u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
+		u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
+		u32 half = (ack_limit + 1) >> 1;
 
 		challenge_timestamp = now;
-		WRITE_ONCE(challenge_count, half +
-			   prandom_u32_max(sysctl_tcp_challenge_ack_limit));
+		WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
 	}
 	count = READ_ONCE(challenge_count);
 	if (count > 0) {
 		WRITE_ONCE(challenge_count, count - 1);
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
 		tcp_send_ack(sk);
 	}
 }
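The host-wide RFC 5961 budget works the same way it did globally, just with a per-netns limit: once per second the counter is refilled to half the limit plus a random amount up to the limit, and each challenge ACK spends one token. A compact, hedged sketch of that refill/consume logic (rand_upto() stands in for prandom_u32_max(); not the kernel code):

#include <stdint.h>
#include <stdlib.h>

struct challenge_bucket {
	uint32_t stamp;		/* second of the last refill */
	uint32_t count;		/* challenge ACKs left this second */
};

static uint32_t rand_upto(uint32_t n)
{
	return n ? (uint32_t)rand() % n : 0;
}

/* 'limit' plays the role of net->ipv4.sysctl_tcp_challenge_ack_limit. */
static int challenge_ack_allowed(struct challenge_bucket *b,
				 uint32_t now_sec, uint32_t limit)
{
	if (now_sec != b->stamp) {
		b->stamp = now_sec;
		b->count = (limit + 1) / 2 + rand_upto(limit);
	}
	if (!b->count)
		return 0;
	b->count--;
	return 1;
}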
@@ -3553,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3553 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3498 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3554 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3499 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3555 bool is_dupack = false; 3500 bool is_dupack = false;
3556 u32 prior_fackets;
3557 int prior_packets = tp->packets_out; 3501 int prior_packets = tp->packets_out;
3558 u32 delivered = tp->delivered; 3502 u32 delivered = tp->delivered;
3559 u32 lost = tp->lost; 3503 u32 lost = tp->lost;
3560 int acked = 0; /* Number of packets newly acked */
3561 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3504 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3505 u32 prior_fack;
3562 3506
3563 sack_state.first_sackt = 0; 3507 sack_state.first_sackt = 0;
3564 sack_state.rate = &rs; 3508 sack_state.rate = &rs;
3565 3509
3566 /* We very likely will need to access write queue head. */ 3510 /* We very likely will need to access rtx queue. */
3567 prefetchw(sk->sk_write_queue.next); 3511 prefetch(sk->tcp_rtx_queue.rb_node);
3568 3512
3569 /* If the ack is older than previous acks 3513 /* If the ack is older than previous acks
3570 * then we can probably ignore it. 3514 * then we can probably ignore it.
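The prefetch change and the dropped prior_fackets/acked locals both stem from other parts of this series: sent-but-unacked skbs now live in an rbtree (sk->tcp_rtx_queue) separate from sk->sk_write_queue, and FACK packet counting gives way to tracking the highest SACKed sequence (prior_fack). The accessors the new code leans on have roughly the following shape; the real definitions live in include/net/tcp.h, so treat this as an approximation:

/* After the split: sk_write_queue holds not-yet-sent skbs (a list),
 * tcp_rtx_queue holds sent-but-unacked skbs (an rbtree keyed by seq).
 */
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
        return skb_queue_empty(&sk->sk_write_queue);
}

static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
        return skb_rb_first(&sk->tcp_rtx_queue);        /* lowest seq */
}

static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
        return skb_peek(&sk->sk_write_queue);           /* next skb to send */
}
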
@@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3590 icsk->icsk_retransmits = 0; 3534 icsk->icsk_retransmits = 0;
3591 } 3535 }
3592 3536
3593 prior_fackets = tp->fackets_out; 3537 prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
3594 rs.prior_in_flight = tcp_packets_in_flight(tp); 3538 rs.prior_in_flight = tcp_packets_in_flight(tp);
3595 3539
3596 /* ts_recent update must be made after we are sure that the packet 3540 /* ts_recent update must be made after we are sure that the packet
@@ -3646,8 +3590,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3646 goto no_queue; 3590 goto no_queue;
3647 3591
3648 /* See if we can take anything off of the retransmit queue. */ 3592 /* See if we can take anything off of the retransmit queue. */
3649 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, 3593 flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3650 &sack_state); 3594
3595 tcp_rack_update_reo_wnd(sk, &rs);
3651 3596
3652 if (tp->tlp_high_seq) 3597 if (tp->tlp_high_seq)
3653 tcp_process_tlp_ack(sk, ack, flag); 3598 tcp_process_tlp_ack(sk, ack, flag);
@@ -3657,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3657 3602
3658 if (tcp_ack_is_dubious(sk, flag)) { 3603 if (tcp_ack_is_dubious(sk, flag)) {
3659 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3604 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3660 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3605 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3606 &rexmit);
3661 } 3607 }
3662 3608
3663 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3609 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
@@ -3673,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3673no_queue: 3619no_queue:
3674 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3620 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3675 if (flag & FLAG_DSACKING_ACK) 3621 if (flag & FLAG_DSACKING_ACK)
3676 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3622 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3623 &rexmit);
3677 /* If this ack opens up a zero window, clear backoff. It was 3624 /* If this ack opens up a zero window, clear backoff. It was
3678 * being used to time the probes, and is probably far higher than 3625 * being used to time the probes, and is probably far higher than
3679 * it needs to be for normal retransmission. 3626 * it needs to be for normal retransmission.
3680 */ 3627 */
3681 if (tcp_send_head(sk)) 3628 tcp_ack_probe(sk);
3682 tcp_ack_probe(sk);
3683 3629
3684 if (tp->tlp_high_seq) 3630 if (tp->tlp_high_seq)
3685 tcp_process_tlp_ack(sk, ack, flag); 3631 tcp_process_tlp_ack(sk, ack, flag);
@@ -3696,7 +3642,8 @@ old_ack:
3696 if (TCP_SKB_CB(skb)->sacked) { 3642 if (TCP_SKB_CB(skb)->sacked) {
3697 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3643 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3698 &sack_state); 3644 &sack_state);
3699 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3645 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3646 &rexmit);
3700 tcp_xmit_recovery(sk, rexmit); 3647 tcp_xmit_recovery(sk, rexmit);
3701 } 3648 }
3702 3649
@@ -3721,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3721 foc->exp = exp_opt; 3668 foc->exp = exp_opt;
3722} 3669}
3723 3670
3671static void smc_parse_options(const struct tcphdr *th,
3672 struct tcp_options_received *opt_rx,
3673 const unsigned char *ptr,
3674 int opsize)
3675{
3676#if IS_ENABLED(CONFIG_SMC)
3677 if (static_branch_unlikely(&tcp_have_smc)) {
3678 if (th->syn && !(opsize & 1) &&
3679 opsize >= TCPOLEN_EXP_SMC_BASE &&
3680 get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3681 opt_rx->smc_ok = 1;
3682 }
3683#endif
3684}
3685
3724/* Look for tcp options. Normally only called on SYN and SYNACK packets. 3686/* Look for tcp options. Normally only called on SYN and SYNACK packets.
3725 * But, this can also be called on packets in the established flow when 3687 * But, this can also be called on packets in the established flow when
3726 * the fast version below fails. 3688 * the fast version below fails.
@@ -3828,6 +3790,9 @@ void tcp_parse_options(const struct net *net,
3828 tcp_parse_fastopen_option(opsize - 3790 tcp_parse_fastopen_option(opsize -
3829 TCPOLEN_EXP_FASTOPEN_BASE, 3791 TCPOLEN_EXP_FASTOPEN_BASE,
3830 ptr + 2, th->syn, foc, true); 3792 ptr + 2, th->syn, foc, true);
3793 else
3794 smc_parse_options(th, opt_rx, ptr,
3795 opsize);
3831 break; 3796 break;
3832 3797
3833 } 3798 }
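smc_parse_options(), added above and called here for experimental options that are not TCP Fast Open, recognizes the SMC capability advertised in the SYN: CONFIG_SMC must be built in, the tcp_have_smc static key must be on, the option must have an even size of at least TCPOLEN_EXP_SMC_BASE, and its first four payload bytes must match TCPOPT_SMC_MAGIC. An assumed sketch of the wire layout being matched (RFC 6994-style shared experimental option disambiguated by a magic/ExID); the field order mirrors the checks above but is illustrative, not copied from the headers:

/* Experimental TCP option carrying the SMC capability in the SYN.
 * The parser reads 'magic' with get_unaligned_be32() because TCP options
 * come with no alignment guarantee.
 */
struct tcp_exp_smc_opt {
        u8      kind;           /* TCPOPT_EXP (experimental kind) */
        u8      len;            /* TCPOLEN_EXP_SMC_BASE */
        __be32  magic;          /* TCPOPT_SMC_MAGIC */
} __packed;
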
@@ -3995,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
3995/* When we get a reset we do this. */ 3960/* When we get a reset we do this. */
3996void tcp_reset(struct sock *sk) 3961void tcp_reset(struct sock *sk)
3997{ 3962{
3963 trace_tcp_receive_reset(sk);
3964
3998 /* We want the right error as BSD sees it (and indeed as we do). */ 3965 /* We want the right error as BSD sees it (and indeed as we do). */
3999 switch (sk->sk_state) { 3966 switch (sk->sk_state) {
4000 case TCP_SYN_SENT: 3967 case TCP_SYN_SENT:
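tcp_reset() now fires a tracepoint before the error handling, which is what the new trace/events/tcp.h include at the top of the file is for. For reference, a tracepoint of this kind is declared with TRACE_EVENT(); the declaration below is illustrative only (the real tcp_receive_reset event records the socket's addresses and ports, among other fields):

/* Illustrative TRACE_EVENT, not the kernel's exact event definition. */
TRACE_EVENT(tcp_receive_reset,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, sport)
                __field(__u16, dport)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);

                __entry->skaddr = sk;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
        ),

        TP_printk("skaddr=%p sport=%hu dport=%hu",
                  __entry->skaddr, __entry->sport, __entry->dport)
);

Once built, the event can be toggled at runtime from tracefs (events/tcp/tcp_receive_reset/enable) or consumed via perf and eBPF.
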
@@ -4117,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4117{ 4084{
4118 struct tcp_sock *tp = tcp_sk(sk); 4085 struct tcp_sock *tp = tcp_sk(sk);
4119 4086
4120 if (tcp_is_sack(tp) && sysctl_tcp_dsack) { 4087 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4121 int mib_idx; 4088 int mib_idx;
4122 4089
4123 if (before(seq, tp->rcv_nxt)) 4090 if (before(seq, tp->rcv_nxt))
@@ -4152,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4152 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4119 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4153 tcp_enter_quickack_mode(sk); 4120 tcp_enter_quickack_mode(sk);
4154 4121
4155 if (tcp_is_sack(tp) && sysctl_tcp_dsack) { 4122 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4156 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 4123 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4157 4124
4158 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) 4125 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
@@ -4268,11 +4235,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4268 tp->rx_opt.num_sacks = num_sacks; 4235 tp->rx_opt.num_sacks = num_sacks;
4269} 4236}
4270 4237
4271enum tcp_queue {
4272 OOO_QUEUE,
4273 RCV_QUEUE,
4274};
4275
4276/** 4238/**
4277 * tcp_try_coalesce - try to merge skb to prior one 4239 * tcp_try_coalesce - try to merge skb to prior one
4278 * @sk: socket 4240 * @sk: socket
@@ -4288,7 +4250,6 @@ enum tcp_queue {
4288 * Returns true if caller should free @from instead of queueing it 4250 * Returns true if caller should free @from instead of queueing it
4289 */ 4251 */
4290static bool tcp_try_coalesce(struct sock *sk, 4252static bool tcp_try_coalesce(struct sock *sk,
4291 enum tcp_queue dest,
4292 struct sk_buff *to, 4253 struct sk_buff *to,
4293 struct sk_buff *from, 4254 struct sk_buff *from,
4294 bool *fragstolen) 4255 bool *fragstolen)
@@ -4313,10 +4274,7 @@ static bool tcp_try_coalesce(struct sock *sk,
4313 4274
4314 if (TCP_SKB_CB(from)->has_rxtstamp) { 4275 if (TCP_SKB_CB(from)->has_rxtstamp) {
4315 TCP_SKB_CB(to)->has_rxtstamp = true; 4276 TCP_SKB_CB(to)->has_rxtstamp = true;
4316 if (dest == OOO_QUEUE) 4277 to->tstamp = from->tstamp;
4317 TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp;
4318 else
4319 to->tstamp = from->tstamp;
4320 } 4278 }
4321 4279
4322 return true; 4280 return true;
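Dropping the OOO_QUEUE/RCV_QUEUE argument here (and the swtstamp stash/restore in the next hunks) makes sense once skb->rbnode no longer shares storage with skb->tstamp. Before the sk_buff layout rework merged around the same time, linking an skb into the out-of-order rbtree clobbered its receive timestamp, so it had to be parked in the control block; with rbnode overlapping only the list pointers, the timestamp survives queueing and tcp_try_coalesce() no longer needs to know which queue an skb came from. An approximate before-picture of the overlapping union, assumed layout for illustration only:

/* Approximate pre-rework head of struct sk_buff: rb_node is three words
 * wide, so it overlapped next/prev *and* tstamp -- which is exactly what
 * the removed swtstamp dance worked around.
 */
struct old_skb_head {
        union {
                struct {
                        struct sk_buff  *next;
                        struct sk_buff  *prev;
                        union {
                                ktime_t tstamp;
                                u64     skb_mstamp;
                        };
                };
                struct rb_node  rbnode;
        };
        /* ... remainder of sk_buff ... */
};
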
@@ -4341,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk)
4341 4299
4342 p = rb_first(&tp->out_of_order_queue); 4300 p = rb_first(&tp->out_of_order_queue);
4343 while (p) { 4301 while (p) {
4344 skb = rb_entry(p, struct sk_buff, rbnode); 4302 skb = rb_to_skb(p);
4345 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4303 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4346 break; 4304 break;
4347 4305
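From this point on, open-coded rb_entry()/rb_next() walks over the out-of-order queue are replaced by skb-specific rbtree helpers introduced alongside this change. Their definitions in include/linux/skbuff.h are thin wrappers, approximately:

#define rb_to_skb(rb)           rb_entry_safe(rb, struct sk_buff, rbnode)

#define skb_rb_first(root)      rb_to_skb(rb_first(root))
#define skb_rb_last(root)       rb_to_skb(rb_last(root))
#define skb_rb_next(skb)        rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)        rb_to_skb(rb_prev(&(skb)->rbnode))

#define skb_rbtree_walk_from(skb) \
        for (; skb != NULL; skb = skb_rb_next(skb))

Because rb_entry_safe() maps a NULL rb_node to a NULL skb, later hunks such as tcp_collapse_ofo_queue() can drop their explicit "p may be NULL" handling around rb_last().
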
@@ -4353,9 +4311,6 @@ static void tcp_ofo_queue(struct sock *sk)
4353 } 4311 }
4354 p = rb_next(p); 4312 p = rb_next(p);
4355 rb_erase(&skb->rbnode, &tp->out_of_order_queue); 4313 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4356 /* Replace tstamp which was stomped by rbnode */
4357 if (TCP_SKB_CB(skb)->has_rxtstamp)
4358 skb->tstamp = TCP_SKB_CB(skb)->swtstamp;
4359 4314
4360 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { 4315 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4361 SOCK_DEBUG(sk, "ofo packet was already received\n"); 4316 SOCK_DEBUG(sk, "ofo packet was already received\n");
@@ -4367,8 +4322,7 @@ static void tcp_ofo_queue(struct sock *sk)
4367 TCP_SKB_CB(skb)->end_seq); 4322 TCP_SKB_CB(skb)->end_seq);
4368 4323
4369 tail = skb_peek_tail(&sk->sk_receive_queue); 4324 tail = skb_peek_tail(&sk->sk_receive_queue);
4370 eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, 4325 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4371 tail, skb, &fragstolen);
4372 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); 4326 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4373 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 4327 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4374 if (!eaten) 4328 if (!eaten)
@@ -4409,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4409static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4363static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4410{ 4364{
4411 struct tcp_sock *tp = tcp_sk(sk); 4365 struct tcp_sock *tp = tcp_sk(sk);
4412 struct rb_node **p, *q, *parent; 4366 struct rb_node **p, *parent;
4413 struct sk_buff *skb1; 4367 struct sk_buff *skb1;
4414 u32 seq, end_seq; 4368 u32 seq, end_seq;
4415 bool fragstolen; 4369 bool fragstolen;
@@ -4422,10 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4422 return; 4376 return;
4423 } 4377 }
4424 4378
4425 /* Stash tstamp to avoid being stomped on by rbnode */
4426 if (TCP_SKB_CB(skb)->has_rxtstamp)
4427 TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
4428
4429 /* Disable header prediction. */ 4379 /* Disable header prediction. */
4430 tp->pred_flags = 0; 4380 tp->pred_flags = 0;
4431 inet_csk_schedule_ack(sk); 4381 inet_csk_schedule_ack(sk);
@@ -4453,7 +4403,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4453 /* In the typical case, we are adding an skb to the end of the list. 4403 /* In the typical case, we are adding an skb to the end of the list.
4454 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 4404 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4455 */ 4405 */
4456 if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, 4406 if (tcp_try_coalesce(sk, tp->ooo_last_skb,
4457 skb, &fragstolen)) { 4407 skb, &fragstolen)) {
4458coalesce_done: 4408coalesce_done:
4459 tcp_grow_window(sk, skb); 4409 tcp_grow_window(sk, skb);
@@ -4472,7 +4422,7 @@ coalesce_done:
4472 parent = NULL; 4422 parent = NULL;
4473 while (*p) { 4423 while (*p) {
4474 parent = *p; 4424 parent = *p;
4475 skb1 = rb_entry(parent, struct sk_buff, rbnode); 4425 skb1 = rb_to_skb(parent);
4476 if (before(seq, TCP_SKB_CB(skb1)->seq)) { 4426 if (before(seq, TCP_SKB_CB(skb1)->seq)) {
4477 p = &parent->rb_left; 4427 p = &parent->rb_left;
4478 continue; 4428 continue;
@@ -4504,7 +4454,7 @@ coalesce_done:
4504 __kfree_skb(skb1); 4454 __kfree_skb(skb1);
4505 goto merge_right; 4455 goto merge_right;
4506 } 4456 }
4507 } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, 4457 } else if (tcp_try_coalesce(sk, skb1,
4508 skb, &fragstolen)) { 4458 skb, &fragstolen)) {
4509 goto coalesce_done; 4459 goto coalesce_done;
4510 } 4460 }
@@ -4517,9 +4467,7 @@ insert:
4517 4467
4518merge_right: 4468merge_right:
4519 /* Remove other segments covered by skb. */ 4469 /* Remove other segments covered by skb. */
4520 while ((q = rb_next(&skb->rbnode)) != NULL) { 4470 while ((skb1 = skb_rb_next(skb)) != NULL) {
4521 skb1 = rb_entry(q, struct sk_buff, rbnode);
4522
4523 if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) 4471 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4524 break; 4472 break;
4525 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4473 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4534,7 +4482,7 @@ merge_right:
4534 tcp_drop(sk, skb1); 4482 tcp_drop(sk, skb1);
4535 } 4483 }
4536 /* If there is no skb after us, we are the last_skb ! */ 4484 /* If there is no skb after us, we are the last_skb ! */
4537 if (!q) 4485 if (!skb1)
4538 tp->ooo_last_skb = skb; 4486 tp->ooo_last_skb = skb;
4539 4487
4540add_sack: 4488add_sack:
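The merge loop above decides coverage with TCP's wrap-safe sequence comparisons: a following segment is dropped as fully covered once the new skb's end_seq is after the queued segment's seq and not before its end_seq. For reference, the helpers are defined in include/net/tcp.h essentially as below; they are only meaningful while the two sequence numbers are less than 2^31 apart:

static inline bool before(__u32 seq1, __u32 seq2)
{
        return (__s32)(seq1 - seq2) < 0;
}
#define after(seq2, seq1)       before(seq1, seq2)
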
@@ -4556,7 +4504,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4556 4504
4557 __skb_pull(skb, hdrlen); 4505 __skb_pull(skb, hdrlen);
4558 eaten = (tail && 4506 eaten = (tail &&
4559 tcp_try_coalesce(sk, RCV_QUEUE, tail, 4507 tcp_try_coalesce(sk, tail,
4560 skb, fragstolen)) ? 1 : 0; 4508 skb, fragstolen)) ? 1 : 0;
4561 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); 4509 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4562 if (!eaten) { 4510 if (!eaten) {
@@ -4720,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
4720 if (list) 4668 if (list)
4721 return !skb_queue_is_last(list, skb) ? skb->next : NULL; 4669 return !skb_queue_is_last(list, skb) ? skb->next : NULL;
4722 4670
4723 return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); 4671 return skb_rb_next(skb);
4724} 4672}
4725 4673
4726static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 4674static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4741,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4741} 4689}
4742 4690
4743/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ 4691/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
4744static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) 4692void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4745{ 4693{
4746 struct rb_node **p = &root->rb_node; 4694 struct rb_node **p = &root->rb_node;
4747 struct rb_node *parent = NULL; 4695 struct rb_node *parent = NULL;
@@ -4749,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4749 4697
4750 while (*p) { 4698 while (*p) {
4751 parent = *p; 4699 parent = *p;
4752 skb1 = rb_entry(parent, struct sk_buff, rbnode); 4700 skb1 = rb_to_skb(parent);
4753 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) 4701 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
4754 p = &parent->rb_left; 4702 p = &parent->rb_left;
4755 else 4703 else
@@ -4796,7 +4744,7 @@ restart:
4796 * overlaps to the next one. 4744 * overlaps to the next one.
4797 */ 4745 */
4798 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && 4746 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4799 (tcp_win_from_space(skb->truesize) > skb->len || 4747 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
4800 before(TCP_SKB_CB(skb)->seq, start))) { 4748 before(TCP_SKB_CB(skb)->seq, start))) {
4801 end_of_skbs = false; 4749 end_of_skbs = false;
4802 break; 4750 break;
@@ -4868,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4868{ 4816{
4869 struct tcp_sock *tp = tcp_sk(sk); 4817 struct tcp_sock *tp = tcp_sk(sk);
4870 struct sk_buff *skb, *head; 4818 struct sk_buff *skb, *head;
4871 struct rb_node *p;
4872 u32 start, end; 4819 u32 start, end;
4873 4820
4874 p = rb_first(&tp->out_of_order_queue); 4821 skb = skb_rb_first(&tp->out_of_order_queue);
4875 skb = rb_entry_safe(p, struct sk_buff, rbnode);
4876new_range: 4822new_range:
4877 if (!skb) { 4823 if (!skb) {
4878 p = rb_last(&tp->out_of_order_queue); 4824 tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
4879 /* Note: This is possible p is NULL here. We do not
4880 * use rb_entry_safe(), as ooo_last_skb is valid only
4881 * if rbtree is not empty.
4882 */
4883 tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
4884 return; 4825 return;
4885 } 4826 }
4886 start = TCP_SKB_CB(skb)->seq; 4827 start = TCP_SKB_CB(skb)->seq;
4887 end = TCP_SKB_CB(skb)->end_seq; 4828 end = TCP_SKB_CB(skb)->end_seq;
4888 4829
4889 for (head = skb;;) { 4830 for (head = skb;;) {
4890 skb = tcp_skb_next(skb, NULL); 4831 skb = skb_rb_next(skb);
4891 4832
4892 /* Range is terminated when we see a gap or when 4833 /* Range is terminated when we see a gap or when
4893 * we are at the queue end. 4834 * we are at the queue end.
@@ -4930,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
4930 do { 4871 do {
4931 prev = rb_prev(node); 4872 prev = rb_prev(node);
4932 rb_erase(node, &tp->out_of_order_queue); 4873 rb_erase(node, &tp->out_of_order_queue);
4933 tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); 4874 tcp_drop(sk, rb_to_skb(node));
4934 sk_mem_reclaim(sk); 4875 sk_mem_reclaim(sk);
4935 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && 4876 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4936 !tcp_under_memory_pressure(sk)) 4877 !tcp_under_memory_pressure(sk))
4937 break; 4878 break;
4938 node = prev; 4879 node = prev;
4939 } while (node); 4880 } while (node);
4940 tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); 4881 tp->ooo_last_skb = rb_to_skb(prev);
4941 4882
4942 /* Reset SACK state. A conforming SACK implementation will 4883 /* Reset SACK state. A conforming SACK implementation will
4943 * do the same at a timeout based retransmit. When a connection 4884 * do the same at a timeout based retransmit. When a connection
@@ -5112,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5112 struct tcp_sock *tp = tcp_sk(sk); 5053 struct tcp_sock *tp = tcp_sk(sk);
5113 u32 ptr = ntohs(th->urg_ptr); 5054 u32 ptr = ntohs(th->urg_ptr);
5114 5055
5115 if (ptr && !sysctl_tcp_stdurg) 5056 if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5116 ptr--; 5057 ptr--;
5117 ptr += ntohl(th->seq); 5058 ptr += ntohl(th->seq);
5118 5059
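Like the sysctl_tcp_dsack and sysctl_tcp_invalid_ratelimit conversions earlier in this diff, this hunk is part of moving TCP sysctls from file-scope globals into struct netns_ipv4 so each network namespace can tune them independently. The usual shape of such a conversion, sketched with a hypothetical accessor (the registration details live in net/ipv4/sysctl_net_ipv4.c):

/* The global 'int sysctl_tcp_stdurg' becomes a field of struct netns_ipv4
 * (include/net/netns/ipv4.h), and readers resolve it through the socket's
 * namespace. Hypothetical helper for illustration:
 */
static inline bool tcp_stdurg_enabled(const struct sock *sk)
{
        return sock_net(sk)->ipv4.sysctl_tcp_stdurg;
}

/* The ctl_table entry keeps pointing at init_net's copy and is rebased to
 * each namespace's copy when the per-netns table is registered, roughly:
 *
 *      table[i].data += (void *)net - (void *)&init_net;
 */
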
@@ -5532,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5532 security_inet_conn_established(sk, skb); 5473 security_inet_conn_established(sk, skb);
5533 } 5474 }
5534 5475
5535 /* Make sure socket is routed, for correct metrics. */ 5476 tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5536 icsk->icsk_af_ops->rebuild_header(sk);
5537
5538 tcp_init_metrics(sk);
5539 tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5540 tcp_init_congestion_control(sk);
5541 5477
5542 /* Prevent spurious tcp_cwnd_restart() on first data 5478 /* Prevent spurious tcp_cwnd_restart() on first data
5543 * packet. 5479 * packet.
5544 */ 5480 */
5545 tp->lsndtime = tcp_jiffies32; 5481 tp->lsndtime = tcp_jiffies32;
5546 5482
5547 tcp_init_buffer_space(sk);
5548
5549 if (sock_flag(sk, SOCK_KEEPOPEN)) 5483 if (sock_flag(sk, SOCK_KEEPOPEN))
5550 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); 5484 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5551 5485
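tcp_finish_connect() (and, further down, the passive path in tcp_rcv_state_process()) now funnels establishment-time setup through tcp_init_transfer(), parameterized only by the BPF sock_ops callback to invoke. Judging from the open-coded sequence it replaces here, the helper is expected to bundle roughly the steps below; this is an inferred sketch, not the function's actual body:

/* Inferred consolidation of the setup previously duplicated at the active
 * and passive establishment sites.
 */
static void tcp_init_transfer_sketch(struct sock *sk, int bpf_op)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        /* Make sure socket is routed, for correct metrics. */
        icsk->icsk_af_ops->rebuild_header(sk);

        tcp_mtup_init(sk);
        tcp_init_metrics(sk);
        tcp_call_bpf(sk, bpf_op);
        tcp_init_congestion_control(sk);
        tcp_init_buffer_space(sk);
}
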
@@ -5559,7 +5493,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5559 struct tcp_fastopen_cookie *cookie) 5493 struct tcp_fastopen_cookie *cookie)
5560{ 5494{
5561 struct tcp_sock *tp = tcp_sk(sk); 5495 struct tcp_sock *tp = tcp_sk(sk);
5562 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; 5496 struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
5563 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; 5497 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5564 bool syn_drop = false; 5498 bool syn_drop = false;
5565 5499
@@ -5594,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5594 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); 5528 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5595 5529
5596 if (data) { /* Retransmit unacked data in SYN */ 5530 if (data) { /* Retransmit unacked data in SYN */
5597 tcp_for_write_queue_from(data, sk) { 5531 skb_rbtree_walk_from(data) {
5598 if (data == tcp_send_head(sk) || 5532 if (__tcp_retransmit_skb(sk, data, 1))
5599 __tcp_retransmit_skb(sk, data, 1))
5600 break; 5533 break;
5601 } 5534 }
5602 tcp_rearm_rto(sk); 5535 tcp_rearm_rto(sk);
@@ -5614,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5614 return false; 5547 return false;
5615} 5548}
5616 5549
5550static void smc_check_reset_syn(struct tcp_sock *tp)
5551{
5552#if IS_ENABLED(CONFIG_SMC)
5553 if (static_branch_unlikely(&tcp_have_smc)) {
5554 if (tp->syn_smc && !tp->rx_opt.smc_ok)
5555 tp->syn_smc = 0;
5556 }
5557#endif
5558}
5559
5617static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5560static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5618 const struct tcphdr *th) 5561 const struct tcphdr *th)
5619{ 5562{
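smc_check_reset_syn() follows the same pattern as smc_parse_options() above: compiled out entirely without CONFIG_SMC and, when built in, hidden behind the tcp_have_smc static key so non-SMC workloads only pay for a patched-out branch. For reference, the static-branch API used here works as in the sketch below; the key and helpers are illustrative, not the SMC code:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_feature_key);   /* default: disabled */

static void do_optional_feature(void)
{
        /* feature-specific work */
}

static void hot_path(void)
{
        /* Compiles down to a no-op jump until the key is switched on. */
        if (static_branch_unlikely(&example_feature_key))
                do_optional_feature();
}

/* Control path, wherever the feature is turned on: */
static void enable_feature(void)
{
        static_branch_enable(&example_feature_key);
}
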
@@ -5709,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5709 tp->tcp_header_len = sizeof(struct tcphdr); 5652 tp->tcp_header_len = sizeof(struct tcphdr);
5710 } 5653 }
5711 5654
5712 if (tcp_is_sack(tp) && sysctl_tcp_fack)
5713 tcp_enable_fack(tp);
5714
5715 tcp_mtup_init(sk);
5716 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5655 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5717 tcp_initialize_rcv_mss(sk); 5656 tcp_initialize_rcv_mss(sk);
5718 5657
@@ -5721,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5721 * is initialized. */ 5660 * is initialized. */
5722 tp->copied_seq = tp->rcv_nxt; 5661 tp->copied_seq = tp->rcv_nxt;
5723 5662
5663 smc_check_reset_syn(tp);
5664
5724 smp_mb(); 5665 smp_mb();
5725 5666
5726 tcp_finish_connect(sk, skb); 5667 tcp_finish_connect(sk, skb);
@@ -5938,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5938 if (req) { 5879 if (req) {
5939 inet_csk(sk)->icsk_retransmits = 0; 5880 inet_csk(sk)->icsk_retransmits = 0;
5940 reqsk_fastopen_remove(sk, req, false); 5881 reqsk_fastopen_remove(sk, req, false);
5882 /* Re-arm the timer because data may have been sent out.
5883 * This is similar to the regular data transmission case
5884 * when new data has just been ack'ed.
5885 *
5886 * (TFO) - we could try to be more aggressive and
5887 * retransmitting any data sooner based on when they
5888 * are sent out.
5889 */
5890 tcp_rearm_rto(sk);
5941 } else { 5891 } else {
5942 /* Make sure socket is routed, for correct metrics. */ 5892 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
5943 icsk->icsk_af_ops->rebuild_header(sk);
5944 tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
5945 tcp_init_congestion_control(sk);
5946
5947 tcp_mtup_init(sk);
5948 tp->copied_seq = tp->rcv_nxt; 5893 tp->copied_seq = tp->rcv_nxt;
5949 tcp_init_buffer_space(sk);
5950 } 5894 }
5951 smp_mb(); 5895 smp_mb();
5952 tcp_set_state(sk, TCP_ESTABLISHED); 5896 tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5966,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5966 if (tp->rx_opt.tstamp_ok) 5910 if (tp->rx_opt.tstamp_ok)
5967 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5911 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5968 5912
5969 if (req) {
5970 /* Re-arm the timer because data may have been sent out.
5971 * This is similar to the regular data transmission case
5972 * when new data has just been ack'ed.
5973 *
5974 * (TFO) - we could try to be more aggressive and
5975 * retransmitting any data sooner based on when they
5976 * are sent out.
5977 */
5978 tcp_rearm_rto(sk);
5979 } else
5980 tcp_init_metrics(sk);
5981
5982 if (!inet_csk(sk)->icsk_ca_ops->cong_control) 5913 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
5983 tcp_update_pacing_rate(sk); 5914 tcp_update_pacing_rate(sk);
5984 5915
@@ -6075,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6075 case TCP_LAST_ACK: 6006 case TCP_LAST_ACK:
6076 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 6007 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6077 break; 6008 break;
6009 /* fall through */
6078 case TCP_FIN_WAIT1: 6010 case TCP_FIN_WAIT1:
6079 case TCP_FIN_WAIT2: 6011 case TCP_FIN_WAIT2:
6080 /* RFC 793 says to queue data in these states, 6012 /* RFC 793 says to queue data in these states,
@@ -6183,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req,
6183 ireq->ir_rmt_port = tcp_hdr(skb)->source; 6115 ireq->ir_rmt_port = tcp_hdr(skb)->source;
6184 ireq->ir_num = ntohs(tcp_hdr(skb)->dest); 6116 ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
6185 ireq->ir_mark = inet_request_mark(sk, skb); 6117 ireq->ir_mark = inet_request_mark(sk, skb);
6118#if IS_ENABLED(CONFIG_SMC)
6119 ireq->smc_ok = rx_opt->smc_ok;
6120#endif
6186} 6121}
6187 6122
6188struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, 6123struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
@@ -6358,7 +6293,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6358 tcp_openreq_init_rwin(req, sk, dst); 6293 tcp_openreq_init_rwin(req, sk, dst);
6359 if (!want_cookie) { 6294 if (!want_cookie) {
6360 tcp_reqsk_record_syn(sk, req, skb); 6295 tcp_reqsk_record_syn(sk, req, skb);
6361 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); 6296 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
6362 } 6297 }
6363 if (fastopen_sk) { 6298 if (fastopen_sk) {
6364 af_ops->send_synack(fastopen_sk, dst, &fl, req, 6299 af_ops->send_synack(fastopen_sk, dst, &fl, req,