Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  597
1 file changed, 266 insertions(+), 331 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 887585045b27..dabbf1d392fb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -76,25 +76,10 @@ | |||
76 | #include <linux/ipsec.h> | 76 | #include <linux/ipsec.h> |
77 | #include <asm/unaligned.h> | 77 | #include <asm/unaligned.h> |
78 | #include <linux/errqueue.h> | 78 | #include <linux/errqueue.h> |
79 | #include <trace/events/tcp.h> | ||
80 | #include <linux/static_key.h> | ||
79 | 81 | ||
80 | int sysctl_tcp_fack __read_mostly; | ||
81 | int sysctl_tcp_max_reordering __read_mostly = 300; | ||
82 | int sysctl_tcp_dsack __read_mostly = 1; | ||
83 | int sysctl_tcp_app_win __read_mostly = 31; | ||
84 | int sysctl_tcp_adv_win_scale __read_mostly = 1; | ||
85 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); | ||
86 | |||
87 | /* rfc5961 challenge ack rate limiting */ | ||
88 | int sysctl_tcp_challenge_ack_limit = 1000; | ||
89 | |||
90 | int sysctl_tcp_stdurg __read_mostly; | ||
91 | int sysctl_tcp_rfc1337 __read_mostly; | ||
92 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 82 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
93 | int sysctl_tcp_frto __read_mostly = 2; | ||
94 | int sysctl_tcp_min_rtt_wlen __read_mostly = 300; | ||
95 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | ||
96 | int sysctl_tcp_early_retrans __read_mostly = 3; | ||
97 | int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; | ||
98 | 83 | ||
99 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 84 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
100 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 85 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -335,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) | |||
335 | sndmem *= nr_segs * per_mss; | 320 | sndmem *= nr_segs * per_mss; |
336 | 321 | ||
337 | if (sk->sk_sndbuf < sndmem) | 322 | if (sk->sk_sndbuf < sndmem) |
338 | sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); | 323 | sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); |
339 | } | 324 | } |
340 | 325 | ||
341 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) | 326 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
@@ -368,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) | |||
368 | { | 353 | { |
369 | struct tcp_sock *tp = tcp_sk(sk); | 354 | struct tcp_sock *tp = tcp_sk(sk); |
370 | /* Optimize this! */ | 355 | /* Optimize this! */ |
371 | int truesize = tcp_win_from_space(skb->truesize) >> 1; | 356 | int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; |
372 | int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; | 357 | int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
373 | 358 | ||
374 | while (tp->rcv_ssthresh <= window) { | 359 | while (tp->rcv_ssthresh <= window) { |
375 | if (truesize <= skb->len) | 360 | if (truesize <= skb->len) |
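The two changed lines above pass the socket into tcp_win_from_space() so the helper can read the per-netns tcp_adv_win_scale and tcp_rmem[2] rather than the old globals. As a hedged illustration of what that helper computes (a standalone userspace sketch of the usual formula, not the kernel function itself): with a positive scale the usable window is space - (space >> scale), with a zero or negative scale it is space >> -scale.

#include <stdio.h>

/* Sketch of the tcp_win_from_space() arithmetic: how much of an skb's
 * truesize (or of tcp_rmem[2]) may be offered as receive window for a
 * given tcp_adv_win_scale.  Illustration only, not the kernel helper.
 */
static int win_from_space(int space, int adv_win_scale)
{
        return adv_win_scale <= 0 ? space >> -adv_win_scale
                                  : space - (space >> adv_win_scale);
}

int main(void)
{
        /* tcp_adv_win_scale = 1 (the old global's default): half the space */
        printf("%d\n", win_from_space(6291456, 1));     /* 3145728 */
        /* tcp_adv_win_scale = 2: three quarters of the space */
        printf("%d\n", win_from_space(6291456, 2));     /* 4718592 */
        return 0;
}

With the default scale of 1, only half of every buffered byte counts as advertisable window; the rest is assumed to be skb and metadata overhead.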
@@ -394,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | |||
394 | /* Check #2. Increase window, if skb with such overhead | 379 | /* Check #2. Increase window, if skb with such overhead |
395 | * will fit to rcvbuf in future. | 380 | * will fit to rcvbuf in future. |
396 | */ | 381 | */ |
397 | if (tcp_win_from_space(skb->truesize) <= skb->len) | 382 | if (tcp_win_from_space(sk, skb->truesize) <= skb->len) |
398 | incr = 2 * tp->advmss; | 383 | incr = 2 * tp->advmss; |
399 | else | 384 | else |
400 | incr = __tcp_grow_window(sk, skb); | 385 | incr = __tcp_grow_window(sk, skb); |
@@ -420,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk) | |||
420 | /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency | 405 | /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency |
421 | * Allow enough cushion so that sender is not limited by our window | 406 | * Allow enough cushion so that sender is not limited by our window |
422 | */ | 407 | */ |
423 | if (sysctl_tcp_moderate_rcvbuf) | 408 | if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) |
424 | rcvmem <<= 2; | 409 | rcvmem <<= 2; |
425 | 410 | ||
426 | if (sk->sk_rcvbuf < rcvmem) | 411 | if (sk->sk_rcvbuf < rcvmem) |
427 | sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); | 412 | sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
428 | } | 413 | } |
429 | 414 | ||
430 | /* 4. Try to fixup all. It is made immediately after connection enters | 415 | /* 4. Try to fixup all. It is made immediately after connection enters |
@@ -432,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) | |||
432 | */ | 417 | */ |
433 | void tcp_init_buffer_space(struct sock *sk) | 418 | void tcp_init_buffer_space(struct sock *sk) |
434 | { | 419 | { |
420 | int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; | ||
435 | struct tcp_sock *tp = tcp_sk(sk); | 421 | struct tcp_sock *tp = tcp_sk(sk); |
436 | int maxwin; | 422 | int maxwin; |
437 | 423 | ||
@@ -450,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk) | |||
450 | if (tp->window_clamp >= maxwin) { | 436 | if (tp->window_clamp >= maxwin) { |
451 | tp->window_clamp = maxwin; | 437 | tp->window_clamp = maxwin; |
452 | 438 | ||
453 | if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) | 439 | if (tcp_app_win && maxwin > 4 * tp->advmss) |
454 | tp->window_clamp = max(maxwin - | 440 | tp->window_clamp = max(maxwin - |
455 | (maxwin >> sysctl_tcp_app_win), | 441 | (maxwin >> tcp_app_win), |
456 | 4 * tp->advmss); | 442 | 4 * tp->advmss); |
457 | } | 443 | } |
458 | 444 | ||
459 | /* Force reservation of one segment. */ | 445 | /* Force reservation of one segment. */ |
460 | if (sysctl_tcp_app_win && | 446 | if (tcp_app_win && |
461 | tp->window_clamp > 2 * tp->advmss && | 447 | tp->window_clamp > 2 * tp->advmss && |
462 | tp->window_clamp + tp->advmss > maxwin) | 448 | tp->window_clamp + tp->advmss > maxwin) |
463 | tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); | 449 | tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); |
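tcp_app_win is now read once from the per-netns sysctl, but the arithmetic is unchanged: hold back maxwin >> tcp_app_win bytes for the application (never squeezing below four segments), then force at least one segment of headroom. A self-contained sketch of that clamping with invented numbers follows; the default tcp_app_win of 31 (see the removed global above) makes the shifted reservation essentially zero, so a smaller value is used here to make the effect visible.

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

/* Illustration of the window_clamp trimming in tcp_init_buffer_space(). */
int main(void)
{
        unsigned int maxwin = 262144;   /* hypothetical maximum window */
        unsigned int advmss = 1460;
        int tcp_app_win = 2;            /* non-default, to make the shift visible */
        unsigned int clamp = maxwin;

        if (tcp_app_win && maxwin > 4 * advmss)
                clamp = max_u(maxwin - (maxwin >> tcp_app_win), 4 * advmss);
        /* force reservation of one segment */
        if (tcp_app_win && clamp > 2 * advmss && clamp + advmss > maxwin)
                clamp = max_u(2 * advmss, maxwin - advmss);

        printf("window_clamp = %u\n", clamp);   /* 196608: one quarter held back */
        return 0;
}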
@@ -471,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) | |||
471 | { | 457 | { |
472 | struct tcp_sock *tp = tcp_sk(sk); | 458 | struct tcp_sock *tp = tcp_sk(sk); |
473 | struct inet_connection_sock *icsk = inet_csk(sk); | 459 | struct inet_connection_sock *icsk = inet_csk(sk); |
460 | struct net *net = sock_net(sk); | ||
474 | 461 | ||
475 | icsk->icsk_ack.quick = 0; | 462 | icsk->icsk_ack.quick = 0; |
476 | 463 | ||
477 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && | 464 | if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && |
478 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && | 465 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
479 | !tcp_under_memory_pressure(sk) && | 466 | !tcp_under_memory_pressure(sk) && |
480 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { | 467 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { |
481 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), | 468 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
482 | sysctl_tcp_rmem[2]); | 469 | net->ipv4.sysctl_tcp_rmem[2]); |
483 | } | 470 | } |
484 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) | 471 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
485 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); | 472 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); |
@@ -610,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
610 | * <prev RTT . ><current RTT .. ><next RTT .... > | 597 | * <prev RTT . ><current RTT .. ><next RTT .... > |
611 | */ | 598 | */ |
612 | 599 | ||
613 | if (sysctl_tcp_moderate_rcvbuf && | 600 | if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && |
614 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { | 601 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { |
615 | int rcvwin, rcvmem, rcvbuf; | 602 | int rcvwin, rcvmem, rcvbuf; |
616 | 603 | ||
@@ -634,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
634 | } | 621 | } |
635 | 622 | ||
636 | rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); | 623 | rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); |
637 | while (tcp_win_from_space(rcvmem) < tp->advmss) | 624 | while (tcp_win_from_space(sk, rcvmem) < tp->advmss) |
638 | rcvmem += 128; | 625 | rcvmem += 128; |
639 | 626 | ||
640 | rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); | 627 | rcvbuf = min(rcvwin / tp->advmss * rcvmem, |
628 | sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); | ||
641 | if (rcvbuf > sk->sk_rcvbuf) { | 629 | if (rcvbuf > sk->sk_rcvbuf) { |
642 | sk->sk_rcvbuf = rcvbuf; | 630 | sk->sk_rcvbuf = rcvbuf; |
643 | 631 | ||
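This is the Dynamic Right-Sizing step: the window the receiver wants to offer (rcvwin) is converted from payload bytes into socket-buffer bytes by charging one rcvmem truesize per advmss worth of data, and the result is now clamped against the per-netns tcp_rmem[2]. A hedged, standalone sketch of that sizing loop (the truesize estimate and the limits are made-up values):

#include <stdio.h>

static int win_from_space(int space, int adv_win_scale)
{
        return adv_win_scale <= 0 ? space >> -adv_win_scale
                                  : space - (space >> adv_win_scale);
}

/* Rough model of the rcvbuf computation above; illustration only. */
int main(void)
{
        int advmss = 1460;
        int rcvwin = 10 * advmss;       /* window DRS wants to advertise */
        int rcvmem = 2048;              /* assumed truesize of one MSS-sized skb */
        int rmem_max = 6291456;         /* stand-in for tcp_rmem[2] */
        int rcvbuf;

        /* grow the per-segment cost until its usable share covers one MSS */
        while (win_from_space(rcvmem, 1) < advmss)
                rcvmem += 128;

        rcvbuf = rcvwin / advmss * rcvmem;
        if (rcvbuf > rmem_max)
                rcvbuf = rmem_max;
        printf("rcvmem=%d rcvbuf=%d\n", rcvmem, rcvbuf);        /* 2944, 29440 */
        return 0;
}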
@@ -781,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) | |||
781 | tp->srtt_us = max(1U, srtt); | 769 | tp->srtt_us = max(1U, srtt); |
782 | } | 770 | } |
783 | 771 | ||
784 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | ||
785 | * Note: TCP stack does not yet implement pacing. | ||
786 | * FQ packet scheduler can be used to implement cheap but effective | ||
787 | * TCP pacing, to smooth the burst on large writes when packets | ||
788 | * in flight is significantly lower than cwnd (or rwin) | ||
789 | */ | ||
790 | int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; | ||
791 | int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; | ||
792 | |||
793 | static void tcp_update_pacing_rate(struct sock *sk) | 772 | static void tcp_update_pacing_rate(struct sock *sk) |
794 | { | 773 | { |
795 | const struct tcp_sock *tp = tcp_sk(sk); | 774 | const struct tcp_sock *tp = tcp_sk(sk); |
@@ -807,9 +786,9 @@ static void tcp_update_pacing_rate(struct sock *sk) | |||
807 | * end of slow start and should slow down. | 786 | * end of slow start and should slow down. |
808 | */ | 787 | */ |
809 | if (tp->snd_cwnd < tp->snd_ssthresh / 2) | 788 | if (tp->snd_cwnd < tp->snd_ssthresh / 2) |
810 | rate *= sysctl_tcp_pacing_ss_ratio; | 789 | rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; |
811 | else | 790 | else |
812 | rate *= sysctl_tcp_pacing_ca_ratio; | 791 | rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; |
813 | 792 | ||
814 | rate *= max(tp->snd_cwnd, tp->packets_out); | 793 | rate *= max(tp->snd_cwnd, tp->packets_out); |
815 | 794 | ||
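Only the knobs move in this hunk: the 200% slow-start and 120% congestion-avoidance pacing ratios become per-netns sysctls. The surrounding function still computes, roughly, ratio per cent of the current delivery rate cwnd * mss / srtt. A back-of-the-envelope worked example with invented values (the kernel operates on the shifted srtt_us representation and additionally honours sk_max_pacing_rate):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t mss = 1448, cwnd = 10, ratio = 120;    /* congestion avoidance */
        double srtt_sec = 0.100;                        /* 100 ms smoothed RTT */

        double rate = (double)cwnd * mss / srtt_sec;    /* current delivery rate */
        rate = rate * ratio / 100.0;                    /* pacing headroom */
        printf("pacing rate ~= %.0f bytes/sec\n", rate);        /* ~173760 */
        return 0;
}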
@@ -863,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | |||
863 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 842 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
864 | } | 843 | } |
865 | 844 | ||
866 | /* | ||
867 | * Packet counting of FACK is based on in-order assumptions, therefore TCP | ||
868 | * disables it when reordering is detected | ||
869 | */ | ||
870 | void tcp_disable_fack(struct tcp_sock *tp) | ||
871 | { | ||
872 | /* RFC3517 uses different metric in lost marker => reset on change */ | ||
873 | if (tcp_is_fack(tp)) | ||
874 | tp->lost_skb_hint = NULL; | ||
875 | tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; | ||
876 | } | ||
877 | |||
878 | /* Take a notice that peer is sending D-SACKs */ | 845 | /* Take a notice that peer is sending D-SACKs */ |
879 | static void tcp_dsack_seen(struct tcp_sock *tp) | 846 | static void tcp_dsack_seen(struct tcp_sock *tp) |
880 | { | 847 | { |
881 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; | 848 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
849 | tp->rack.dsack_seen = 1; | ||
882 | } | 850 | } |
883 | 851 | ||
884 | static void tcp_update_reordering(struct sock *sk, const int metric, | 852 | /* It's reordering when higher sequence was delivered (i.e. sacked) before |
885 | const int ts) | 853 | * some lower never-retransmitted sequence ("low_seq"). The maximum reordering |
854 | * distance is approximated in full-mss packet distance ("reordering"). | ||
855 | */ | ||
856 | static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, | ||
857 | const int ts) | ||
886 | { | 858 | { |
887 | struct tcp_sock *tp = tcp_sk(sk); | 859 | struct tcp_sock *tp = tcp_sk(sk); |
888 | int mib_idx; | 860 | const u32 mss = tp->mss_cache; |
861 | u32 fack, metric; | ||
889 | 862 | ||
890 | if (WARN_ON_ONCE(metric < 0)) | 863 | fack = tcp_highest_sack_seq(tp); |
864 | if (!before(low_seq, fack)) | ||
891 | return; | 865 | return; |
892 | 866 | ||
893 | if (metric > tp->reordering) { | 867 | metric = fack - low_seq; |
894 | tp->reordering = min(sysctl_tcp_max_reordering, metric); | 868 | if ((metric > tp->reordering * mss) && mss) { |
895 | |||
896 | #if FASTRETRANS_DEBUG > 1 | 869 | #if FASTRETRANS_DEBUG > 1 |
897 | pr_debug("Disorder%d %d %u f%u s%u rr%d\n", | 870 | pr_debug("Disorder%d %d %u f%u s%u rr%d\n", |
898 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, | 871 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, |
899 | tp->reordering, | 872 | tp->reordering, |
900 | tp->fackets_out, | 873 | 0, |
901 | tp->sacked_out, | 874 | tp->sacked_out, |
902 | tp->undo_marker ? tp->undo_retrans : 0); | 875 | tp->undo_marker ? tp->undo_retrans : 0); |
903 | #endif | 876 | #endif |
904 | tcp_disable_fack(tp); | 877 | tp->reordering = min_t(u32, (metric + mss - 1) / mss, |
878 | sock_net(sk)->ipv4.sysctl_tcp_max_reordering); | ||
905 | } | 879 | } |
906 | 880 | ||
907 | tp->rack.reord = 1; | 881 | tp->rack.reord = 1; |
908 | |||
909 | /* This exciting event is worth to be remembered. 8) */ | 882 | /* This exciting event is worth to be remembered. 8) */ |
910 | if (ts) | 883 | NET_INC_STATS(sock_net(sk), |
911 | mib_idx = LINUX_MIB_TCPTSREORDER; | 884 | ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); |
912 | else if (tcp_is_reno(tp)) | ||
913 | mib_idx = LINUX_MIB_TCPRENOREORDER; | ||
914 | else if (tcp_is_fack(tp)) | ||
915 | mib_idx = LINUX_MIB_TCPFACKREORDER; | ||
916 | else | ||
917 | mib_idx = LINUX_MIB_TCPSACKREORDER; | ||
918 | |||
919 | NET_INC_STATS(sock_net(sk), mib_idx); | ||
920 | } | 885 | } |
921 | 886 | ||
922 | /* This must be called before lost_out is incremented */ | 887 | /* This must be called before lost_out is incremented */ |
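tcp_check_sack_reordering() is the FACK-free replacement for tcp_update_reordering(): instead of a FACK packet count, it measures how far (in sequence space) the highest SACKed byte sits above a lower, never-retransmitted sequence that has just been delivered, converts that distance into whole-MSS packets, and caps it at the per-netns tcp_max_reordering (default 300, as in the removed global earlier in this diff). A standalone worked example with invented numbers:

#include <stdio.h>

int main(void)
{
        unsigned int mss = 1000;
        unsigned int low_seq = 5000;    /* never-retransmitted data just delivered */
        unsigned int fack = 12500;      /* highest sequence already SACKed */
        unsigned int max_reordering = 300;
        unsigned int reordering = 3;    /* current estimate */

        if (low_seq < fack) {           /* kernel uses wraparound-safe before() */
                unsigned int metric = fack - low_seq;

                if (metric > reordering * mss) {
                        unsigned int reord = (metric + mss - 1) / mss;

                        if (reord > max_reordering)
                                reord = max_reordering;
                        reordering = reord;
                }
        }
        printf("reordering = %u packets\n", reordering);        /* 8 */
        return 0;
}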
@@ -990,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) | |||
990 | * 3. Loss detection event of two flavors: | 955 | * 3. Loss detection event of two flavors: |
991 | * A. Scoreboard estimator decided the packet is lost. | 956 | * A. Scoreboard estimator decided the packet is lost. |
992 | * A'. Reno "three dupacks" marks head of queue lost. | 957 | * A'. Reno "three dupacks" marks head of queue lost. |
993 | * A''. Its FACK modification, head until snd.fack is lost. | ||
994 | * B. SACK arrives sacking SND.NXT at the moment, when the | 958 | * B. SACK arrives sacking SND.NXT at the moment, when the |
995 | * segment was retransmitted. | 959 | * segment was retransmitted. |
996 | * 4. D-SACK added new rule: D-SACK changes any tag to S. | 960 | * 4. D-SACK added new rule: D-SACK changes any tag to S. |
@@ -1133,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | |||
1133 | } | 1097 | } |
1134 | 1098 | ||
1135 | struct tcp_sacktag_state { | 1099 | struct tcp_sacktag_state { |
1136 | int reord; | 1100 | u32 reord; |
1137 | int fack_count; | ||
1138 | /* Timestamps for earliest and latest never-retransmitted segment | 1101 | /* Timestamps for earliest and latest never-retransmitted segment |
1139 | * that was SACKed. RTO needs the earliest RTT to stay conservative, | 1102 | * that was SACKed. RTO needs the earliest RTT to stay conservative, |
1140 | * but congestion control should still get an accurate delay signal. | 1103 | * but congestion control should still get an accurate delay signal. |
@@ -1143,6 +1106,7 @@ struct tcp_sacktag_state { | |||
1143 | u64 last_sackt; | 1106 | u64 last_sackt; |
1144 | struct rate_sample *rate; | 1107 | struct rate_sample *rate; |
1145 | int flag; | 1108 | int flag; |
1109 | unsigned int mss_now; | ||
1146 | }; | 1110 | }; |
1147 | 1111 | ||
1148 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, | 1112 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
@@ -1192,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
1192 | if (pkt_len >= skb->len && !in_sack) | 1156 | if (pkt_len >= skb->len && !in_sack) |
1193 | return 0; | 1157 | return 0; |
1194 | 1158 | ||
1195 | err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); | 1159 | err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
1160 | pkt_len, mss, GFP_ATOMIC); | ||
1196 | if (err < 0) | 1161 | if (err < 0) |
1197 | return err; | 1162 | return err; |
1198 | } | 1163 | } |
@@ -1208,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1208 | u64 xmit_time) | 1173 | u64 xmit_time) |
1209 | { | 1174 | { |
1210 | struct tcp_sock *tp = tcp_sk(sk); | 1175 | struct tcp_sock *tp = tcp_sk(sk); |
1211 | int fack_count = state->fack_count; | ||
1212 | 1176 | ||
1213 | /* Account D-SACK for retransmitted packet. */ | 1177 | /* Account D-SACK for retransmitted packet. */ |
1214 | if (dup_sack && (sacked & TCPCB_RETRANS)) { | 1178 | if (dup_sack && (sacked & TCPCB_RETRANS)) { |
1215 | if (tp->undo_marker && tp->undo_retrans > 0 && | 1179 | if (tp->undo_marker && tp->undo_retrans > 0 && |
1216 | after(end_seq, tp->undo_marker)) | 1180 | after(end_seq, tp->undo_marker)) |
1217 | tp->undo_retrans--; | 1181 | tp->undo_retrans--; |
1218 | if (sacked & TCPCB_SACKED_ACKED) | 1182 | if ((sacked & TCPCB_SACKED_ACKED) && |
1219 | state->reord = min(fack_count, state->reord); | 1183 | before(start_seq, state->reord)) |
1184 | state->reord = start_seq; | ||
1220 | } | 1185 | } |
1221 | 1186 | ||
1222 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ | 1187 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ |
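The sacktag state now records the lowest reordered start_seq rather than a FACK packet count, so the updated checks above are wraparound-safe 32-bit sequence comparisons. A minimal sketch of how before()/after() style comparisons behave, including across the 2^32 wrap (illustration only, not the kernel macros):

#include <stdio.h>
#include <stdint.h>

static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

static int seq_after(uint32_t a, uint32_t b)
{
        return seq_before(b, a);
}

int main(void)
{
        /* works across the wrap: 10 is "after" 0xfffffff0 */
        printf("%d\n", seq_before(0xfffffff0u, 10u));   /* 1 */
        printf("%d\n", seq_after(10u, 0xfffffff0u));    /* 1 */

        /* tracking the lowest reordered sequence, as the sacktag code now does */
        uint32_t reord = 0x90000000u;           /* initialised to snd_nxt */
        uint32_t start_seq = 0x8fffff00u;

        if (seq_before(start_seq, reord))
                reord = start_seq;
        printf("reord=%#x\n", (unsigned int)reord);     /* 0x8fffff00 */
        return 0;
}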
@@ -1242,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1242 | * which was in hole. It is reordering. | 1207 | * which was in hole. It is reordering. |
1243 | */ | 1208 | */ |
1244 | if (before(start_seq, | 1209 | if (before(start_seq, |
1245 | tcp_highest_sack_seq(tp))) | 1210 | tcp_highest_sack_seq(tp)) && |
1246 | state->reord = min(fack_count, | 1211 | before(start_seq, state->reord)) |
1247 | state->reord); | 1212 | state->reord = start_seq; |
1213 | |||
1248 | if (!after(end_seq, tp->high_seq)) | 1214 | if (!after(end_seq, tp->high_seq)) |
1249 | state->flag |= FLAG_ORIG_SACK_ACKED; | 1215 | state->flag |= FLAG_ORIG_SACK_ACKED; |
1250 | if (state->first_sackt == 0) | 1216 | if (state->first_sackt == 0) |
@@ -1263,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1263 | tp->sacked_out += pcount; | 1229 | tp->sacked_out += pcount; |
1264 | tp->delivered += pcount; /* Out-of-order packets delivered */ | 1230 | tp->delivered += pcount; /* Out-of-order packets delivered */ |
1265 | 1231 | ||
1266 | fack_count += pcount; | ||
1267 | |||
1268 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ | 1232 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
1269 | if (!tcp_is_fack(tp) && tp->lost_skb_hint && | 1233 | if (tp->lost_skb_hint && |
1270 | before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) | 1234 | before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) |
1271 | tp->lost_cnt_hint += pcount; | 1235 | tp->lost_cnt_hint += pcount; |
1272 | |||
1273 | if (fack_count > tp->fackets_out) | ||
1274 | tp->fackets_out = fack_count; | ||
1275 | } | 1236 | } |
1276 | 1237 | ||
1277 | /* D-SACK. We can detect redundant retransmission in S|R and plain R | 1238 | /* D-SACK. We can detect redundant retransmission in S|R and plain R |
@@ -1289,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1289 | /* Shift newly-SACKed bytes from this skb to the immediately previous | 1250 | /* Shift newly-SACKed bytes from this skb to the immediately previous |
1290 | * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. | 1251 | * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. |
1291 | */ | 1252 | */ |
1292 | static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | 1253 | static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, |
1254 | struct sk_buff *skb, | ||
1293 | struct tcp_sacktag_state *state, | 1255 | struct tcp_sacktag_state *state, |
1294 | unsigned int pcount, int shifted, int mss, | 1256 | unsigned int pcount, int shifted, int mss, |
1295 | bool dup_sack) | 1257 | bool dup_sack) |
1296 | { | 1258 | { |
1297 | struct tcp_sock *tp = tcp_sk(sk); | 1259 | struct tcp_sock *tp = tcp_sk(sk); |
1298 | struct sk_buff *prev = tcp_write_queue_prev(sk, skb); | ||
1299 | u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ | 1260 | u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ |
1300 | u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ | 1261 | u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ |
1301 | 1262 | ||
@@ -1364,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1364 | if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) | 1325 | if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) |
1365 | TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; | 1326 | TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; |
1366 | 1327 | ||
1367 | tcp_unlink_write_queue(skb, sk); | 1328 | tcp_rtx_queue_unlink_and_free(skb, sk); |
1368 | sk_wmem_free_skb(sk, skb); | ||
1369 | 1329 | ||
1370 | NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); | 1330 | NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); |
1371 | 1331 | ||
@@ -1415,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | |||
1415 | goto fallback; | 1375 | goto fallback; |
1416 | 1376 | ||
1417 | /* Can only happen with delayed DSACK + discard craziness */ | 1377 | /* Can only happen with delayed DSACK + discard craziness */ |
1418 | if (unlikely(skb == tcp_write_queue_head(sk))) | 1378 | prev = skb_rb_prev(skb); |
1379 | if (!prev) | ||
1419 | goto fallback; | 1380 | goto fallback; |
1420 | prev = tcp_write_queue_prev(sk, skb); | ||
1421 | 1381 | ||
1422 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) | 1382 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
1423 | goto fallback; | 1383 | goto fallback; |
@@ -1496,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | |||
1496 | 1456 | ||
1497 | if (!skb_shift(prev, skb, len)) | 1457 | if (!skb_shift(prev, skb, len)) |
1498 | goto fallback; | 1458 | goto fallback; |
1499 | if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) | 1459 | if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) |
1500 | goto out; | 1460 | goto out; |
1501 | 1461 | ||
1502 | /* Hole filled allows collapsing with the next as well, this is very | 1462 | /* Hole filled allows collapsing with the next as well, this is very |
1503 | * useful when hole on every nth skb pattern happens | 1463 | * useful when hole on every nth skb pattern happens |
1504 | */ | 1464 | */ |
1505 | if (prev == tcp_write_queue_tail(sk)) | 1465 | skb = skb_rb_next(prev); |
1466 | if (!skb) | ||
1506 | goto out; | 1467 | goto out; |
1507 | skb = tcp_write_queue_next(sk, prev); | ||
1508 | 1468 | ||
1509 | if (!skb_can_shift(skb) || | 1469 | if (!skb_can_shift(skb) || |
1510 | (skb == tcp_send_head(sk)) || | ||
1511 | ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || | 1470 | ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || |
1512 | (mss != tcp_skb_seglen(skb))) | 1471 | (mss != tcp_skb_seglen(skb))) |
1513 | goto out; | 1472 | goto out; |
@@ -1515,11 +1474,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | |||
1515 | len = skb->len; | 1474 | len = skb->len; |
1516 | if (skb_shift(prev, skb, len)) { | 1475 | if (skb_shift(prev, skb, len)) { |
1517 | pcount += tcp_skb_pcount(skb); | 1476 | pcount += tcp_skb_pcount(skb); |
1518 | tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); | 1477 | tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), |
1478 | len, mss, 0); | ||
1519 | } | 1479 | } |
1520 | 1480 | ||
1521 | out: | 1481 | out: |
1522 | state->fack_count += pcount; | ||
1523 | return prev; | 1482 | return prev; |
1524 | 1483 | ||
1525 | noop: | 1484 | noop: |
@@ -1539,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1539 | struct tcp_sock *tp = tcp_sk(sk); | 1498 | struct tcp_sock *tp = tcp_sk(sk); |
1540 | struct sk_buff *tmp; | 1499 | struct sk_buff *tmp; |
1541 | 1500 | ||
1542 | tcp_for_write_queue_from(skb, sk) { | 1501 | skb_rbtree_walk_from(skb) { |
1543 | int in_sack = 0; | 1502 | int in_sack = 0; |
1544 | bool dup_sack = dup_sack_in; | 1503 | bool dup_sack = dup_sack_in; |
1545 | 1504 | ||
1546 | if (skb == tcp_send_head(sk)) | ||
1547 | break; | ||
1548 | |||
1549 | /* queue is in-order => we can short-circuit the walk early */ | 1505 | /* queue is in-order => we can short-circuit the walk early */ |
1550 | if (!before(TCP_SKB_CB(skb)->seq, end_seq)) | 1506 | if (!before(TCP_SKB_CB(skb)->seq, end_seq)) |
1551 | break; | 1507 | break; |
@@ -1594,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1594 | tcp_skb_pcount(skb), | 1550 | tcp_skb_pcount(skb), |
1595 | skb->skb_mstamp); | 1551 | skb->skb_mstamp); |
1596 | tcp_rate_skb_delivered(sk, skb, state->rate); | 1552 | tcp_rate_skb_delivered(sk, skb, state->rate); |
1553 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) | ||
1554 | list_del_init(&skb->tcp_tsorted_anchor); | ||
1597 | 1555 | ||
1598 | if (!before(TCP_SKB_CB(skb)->seq, | 1556 | if (!before(TCP_SKB_CB(skb)->seq, |
1599 | tcp_highest_sack_seq(tp))) | 1557 | tcp_highest_sack_seq(tp))) |
1600 | tcp_advance_highest_sack(sk, skb); | 1558 | tcp_advance_highest_sack(sk, skb); |
1601 | } | 1559 | } |
1602 | |||
1603 | state->fack_count += tcp_skb_pcount(skb); | ||
1604 | } | 1560 | } |
1605 | return skb; | 1561 | return skb; |
1606 | } | 1562 | } |
1607 | 1563 | ||
1608 | /* Avoid all extra work that is being done by sacktag while walking in | 1564 | static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, |
1609 | * a normal way | 1565 | struct tcp_sacktag_state *state, |
1610 | */ | 1566 | u32 seq) |
1567 | { | ||
1568 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; | ||
1569 | struct sk_buff *skb; | ||
1570 | |||
1571 | while (*p) { | ||
1572 | parent = *p; | ||
1573 | skb = rb_to_skb(parent); | ||
1574 | if (before(seq, TCP_SKB_CB(skb)->seq)) { | ||
1575 | p = &parent->rb_left; | ||
1576 | continue; | ||
1577 | } | ||
1578 | if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { | ||
1579 | p = &parent->rb_right; | ||
1580 | continue; | ||
1581 | } | ||
1582 | return skb; | ||
1583 | } | ||
1584 | return NULL; | ||
1585 | } | ||
1586 | |||
1611 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, | 1587 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
1612 | struct tcp_sacktag_state *state, | 1588 | struct tcp_sacktag_state *state, |
1613 | u32 skip_to_seq) | 1589 | u32 skip_to_seq) |
1614 | { | 1590 | { |
1615 | tcp_for_write_queue_from(skb, sk) { | 1591 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
1616 | if (skb == tcp_send_head(sk)) | 1592 | return skb; |
1617 | break; | ||
1618 | |||
1619 | if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) | ||
1620 | break; | ||
1621 | 1593 | ||
1622 | state->fack_count += tcp_skb_pcount(skb); | 1594 | return tcp_sacktag_bsearch(sk, state, skip_to_seq); |
1623 | } | ||
1624 | return skb; | ||
1625 | } | 1595 | } |
1626 | 1596 | ||
1627 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, | 1597 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
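Because the retransmit queue is now an rbtree keyed by start sequence, tcp_sacktag_skip() can descend the tree and return the skb whose [seq, end_seq) range covers the target instead of walking the whole queue. The same comparison logic is sketched below over a plain sorted array (an illustration of the search, not the kernel data structure):

#include <stdio.h>
#include <stdint.h>

struct seg { uint32_t seq, end_seq; };  /* stand-in for one queued skb */

static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

/* Binary search for the segment covering "seq", mirroring the descent done
 * by tcp_sacktag_bsearch() above.  Illustrative sketch only.
 */
static const struct seg *find_seg(const struct seg *q, int n, uint32_t seq)
{
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;

                if (seq_before(seq, q[mid].seq))
                        hi = mid - 1;                   /* go left */
                else if (!seq_before(seq, q[mid].end_seq))
                        lo = mid + 1;                   /* go right */
                else
                        return &q[mid];                 /* seq in [seq, end_seq) */
        }
        return NULL;
}

int main(void)
{
        const struct seg q[] = { {1000, 2000}, {2000, 3000}, {3000, 4400} };
        const struct seg *s = find_seg(q, 3, 2500);

        printf("%u-%u\n", s ? (unsigned int)s->seq : 0u,
               s ? (unsigned int)s->end_seq : 0u);      /* 2000-3000 */
        return 0;
}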
@@ -1666,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1666 | int first_sack_index; | 1636 | int first_sack_index; |
1667 | 1637 | ||
1668 | state->flag = 0; | 1638 | state->flag = 0; |
1669 | state->reord = tp->packets_out; | 1639 | state->reord = tp->snd_nxt; |
1670 | 1640 | ||
1671 | if (!tp->sacked_out) { | 1641 | if (!tp->sacked_out) |
1672 | if (WARN_ON(tp->fackets_out)) | ||
1673 | tp->fackets_out = 0; | ||
1674 | tcp_highest_sack_reset(sk); | 1642 | tcp_highest_sack_reset(sk); |
1675 | } | ||
1676 | 1643 | ||
1677 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, | 1644 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
1678 | num_sacks, prior_snd_una); | 1645 | num_sacks, prior_snd_una); |
@@ -1743,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1743 | } | 1710 | } |
1744 | } | 1711 | } |
1745 | 1712 | ||
1746 | skb = tcp_write_queue_head(sk); | 1713 | state->mss_now = tcp_current_mss(sk); |
1747 | state->fack_count = 0; | 1714 | skb = NULL; |
1748 | i = 0; | 1715 | i = 0; |
1749 | 1716 | ||
1750 | if (!tp->sacked_out) { | 1717 | if (!tp->sacked_out) { |
@@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1801 | skb = tcp_highest_sack(sk); | 1768 | skb = tcp_highest_sack(sk); |
1802 | if (!skb) | 1769 | if (!skb) |
1803 | break; | 1770 | break; |
1804 | state->fack_count = tp->fackets_out; | ||
1805 | cache++; | 1771 | cache++; |
1806 | goto walk; | 1772 | goto walk; |
1807 | } | 1773 | } |
@@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1816 | skb = tcp_highest_sack(sk); | 1782 | skb = tcp_highest_sack(sk); |
1817 | if (!skb) | 1783 | if (!skb) |
1818 | break; | 1784 | break; |
1819 | state->fack_count = tp->fackets_out; | ||
1820 | } | 1785 | } |
1821 | skb = tcp_sacktag_skip(skb, sk, state, start_seq); | 1786 | skb = tcp_sacktag_skip(skb, sk, state, start_seq); |
1822 | 1787 | ||
@@ -1836,9 +1801,8 @@ advance_sp: | |||
1836 | for (j = 0; j < used_sacks; j++) | 1801 | for (j = 0; j < used_sacks; j++) |
1837 | tp->recv_sack_cache[i++] = sp[j]; | 1802 | tp->recv_sack_cache[i++] = sp[j]; |
1838 | 1803 | ||
1839 | if ((state->reord < tp->fackets_out) && | 1804 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) |
1840 | ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) | 1805 | tcp_check_sack_reordering(sk, state->reord, 0); |
1841 | tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); | ||
1842 | 1806 | ||
1843 | tcp_verify_left_out(tp); | 1807 | tcp_verify_left_out(tp); |
1844 | out: | 1808 | out: |
@@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) | |||
1876 | static void tcp_check_reno_reordering(struct sock *sk, const int addend) | 1840 | static void tcp_check_reno_reordering(struct sock *sk, const int addend) |
1877 | { | 1841 | { |
1878 | struct tcp_sock *tp = tcp_sk(sk); | 1842 | struct tcp_sock *tp = tcp_sk(sk); |
1879 | if (tcp_limit_reno_sacked(tp)) | 1843 | |
1880 | tcp_update_reordering(sk, tp->packets_out + addend, 0); | 1844 | if (!tcp_limit_reno_sacked(tp)) |
1845 | return; | ||
1846 | |||
1847 | tp->reordering = min_t(u32, tp->packets_out + addend, | ||
1848 | sock_net(sk)->ipv4.sysctl_tcp_max_reordering); | ||
1849 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); | ||
1881 | } | 1850 | } |
1882 | 1851 | ||
1883 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ | 1852 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
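For SACK-less (Reno) flows the reordering update is now done inline: once tcp_limit_reno_sacked() finds more dupacks than the packets in flight can explain, tp->reordering jumps straight to packets_out + addend, capped at the per-netns tcp_max_reordering, and the TCPRENOREORDER counter is bumped. A tiny numeric sketch of that update:

#include <stdio.h>

int main(void)
{
        unsigned int packets_out = 20, addend = 1;
        unsigned int max_reordering = 300;
        unsigned int reordering;

        /* only reached when the dupack count exceeds what the current
         * flight can account for */
        reordering = packets_out + addend;
        if (reordering > max_reordering)
                reordering = max_reordering;
        printf("reordering = %u\n", reordering);        /* 21 */
        return 0;
}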
@@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) | |||
1923 | tp->lost_out = 0; | 1892 | tp->lost_out = 0; |
1924 | tp->undo_marker = 0; | 1893 | tp->undo_marker = 0; |
1925 | tp->undo_retrans = -1; | 1894 | tp->undo_retrans = -1; |
1926 | tp->fackets_out = 0; | ||
1927 | tp->sacked_out = 0; | 1895 | tp->sacked_out = 0; |
1928 | } | 1896 | } |
1929 | 1897 | ||
@@ -1968,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk) | |||
1968 | if (tcp_is_reno(tp)) | 1936 | if (tcp_is_reno(tp)) |
1969 | tcp_reset_reno_sack(tp); | 1937 | tcp_reset_reno_sack(tp); |
1970 | 1938 | ||
1971 | skb = tcp_write_queue_head(sk); | 1939 | skb = tcp_rtx_queue_head(sk); |
1972 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); | 1940 | is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); |
1973 | if (is_reneg) { | 1941 | if (is_reneg) { |
1974 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); | 1942 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); |
1975 | tp->sacked_out = 0; | 1943 | tp->sacked_out = 0; |
1976 | tp->fackets_out = 0; | ||
1977 | } | 1944 | } |
1978 | tcp_clear_all_retrans_hints(tp); | 1945 | tcp_clear_all_retrans_hints(tp); |
1979 | 1946 | ||
1980 | tcp_for_write_queue(skb, sk) { | 1947 | skb_rbtree_walk_from(skb) { |
1981 | if (skb == tcp_send_head(sk)) | ||
1982 | break; | ||
1983 | |||
1984 | mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || | 1948 | mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
1985 | is_reneg); | 1949 | is_reneg); |
1986 | if (mark_lost) | 1950 | if (mark_lost) |
@@ -2014,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk) | |||
2014 | * falsely raise the receive window, which results in repeated | 1978 | * falsely raise the receive window, which results in repeated |
2015 | * timeouts and stop-and-go behavior. | 1979 | * timeouts and stop-and-go behavior. |
2016 | */ | 1980 | */ |
2017 | tp->frto = sysctl_tcp_frto && | 1981 | tp->frto = net->ipv4.sysctl_tcp_frto && |
2018 | (new_recovery || icsk->icsk_retransmits) && | 1982 | (new_recovery || icsk->icsk_retransmits) && |
2019 | !inet_csk(sk)->icsk_mtup.probe_size; | 1983 | !inet_csk(sk)->icsk_mtup.probe_size; |
2020 | } | 1984 | } |
@@ -2043,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) | |||
2043 | return false; | 2007 | return false; |
2044 | } | 2008 | } |
2045 | 2009 | ||
2046 | static inline int tcp_fackets_out(const struct tcp_sock *tp) | ||
2047 | { | ||
2048 | return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; | ||
2049 | } | ||
2050 | |||
2051 | /* Heuristics to calculate number of duplicate ACKs. There's no dupACKs | 2010 |
2052 | * counter when SACK is enabled (without SACK, sacked_out is used for | 2011 | * counter when SACK is enabled (without SACK, sacked_out is used for |
2053 | * that purpose). | 2012 | * that purpose). |
2054 | * | 2013 | * |
2055 | * Instead, with FACK TCP uses fackets_out that includes both SACKed | ||
2056 | * segments up to the highest received SACK block so far and holes in | ||
2057 | * between them. | ||
2058 | * | ||
2059 | * With reordering, holes may still be in flight, so RFC3517 recovery | 2014 | * With reordering, holes may still be in flight, so RFC3517 recovery |
2060 | * uses pure sacked_out (total number of SACKed segments) even though | 2015 | * uses pure sacked_out (total number of SACKed segments) even though |
2061 | * it violates the RFC that uses duplicate ACKs, often these are equal | 2016 | * it violates the RFC that uses duplicate ACKs, often these are equal |
@@ -2065,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) | |||
2065 | */ | 2020 | */ |
2066 | static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) | 2021 | static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) |
2067 | { | 2022 | { |
2068 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | 2023 | return tp->sacked_out + 1; |
2069 | } | 2024 | } |
2070 | 2025 | ||
2071 | /* Linux NewReno/SACK/FACK/ECN state machine. | 2026 | /* Linux NewReno/SACK/ECN state machine. |
2072 | * -------------------------------------- | 2027 | * -------------------------------------- |
2073 | * | 2028 | * |
2074 | * "Open" Normal state, no dubious events, fast path. | 2029 | * "Open" Normal state, no dubious events, fast path. |
@@ -2133,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) | |||
2133 | * dynamically measured and adjusted. This is implemented in | 2088 | * dynamically measured and adjusted. This is implemented in |
2134 | * tcp_rack_mark_lost. | 2089 | * tcp_rack_mark_lost. |
2135 | * | 2090 | * |
2136 | * FACK (Disabled by default. Subsumbed by RACK): | ||
2137 | * It is the simplest heuristics. As soon as we decided | ||
2138 | * that something is lost, we decide that _all_ not SACKed | ||
2139 | * packets until the most forward SACK are lost. I.e. | ||
2140 | * lost_out = fackets_out - sacked_out and left_out = fackets_out. | ||
2141 | * It is absolutely correct estimate, if network does not reorder | ||
2142 | * packets. And it loses any connection to reality when reordering | ||
2143 | * takes place. We use FACK by default until reordering | ||
2144 | * is suspected on the path to this destination. | ||
2145 | * | ||
2146 | * If the receiver does not support SACK: | 2091 | * If the receiver does not support SACK: |
2147 | * | 2092 | * |
2148 | * NewReno (RFC6582): in Recovery we assume that one segment | 2093 | * NewReno (RFC6582): in Recovery we assume that one segment |
@@ -2191,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2191 | } | 2136 | } |
2192 | 2137 | ||
2193 | /* Detect loss in event "A" above by marking head of queue up as lost. | 2138 | /* Detect loss in event "A" above by marking head of queue up as lost. |
2194 | * For FACK or non-SACK(Reno) senders, the first "packets" number of segments | 2139 | * For non-SACK(Reno) senders, the first "packets" number of segments |
2195 | * are considered lost. For RFC3517 SACK, a segment is considered lost if it | 2140 | * are considered lost. For RFC3517 SACK, a segment is considered lost if it |
2196 | * has at least tp->reordering SACKed segments above it; "packets" refers to | 2141 |
2197 | * the maximum SACKed segments to pass before reaching this limit. | 2142 | * the maximum SACKed segments to pass before reaching this limit. |
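The comment above states the rule tcp_mark_head_lost() implements: walk from the head of the (now rbtree-backed) retransmit queue and, for SACK users, stop once more than "packets" SACKed segments have been counted; anything un-SACKed before that point is marked lost. A toy one-packet-per-segment version of that walk (the real code also handles GSO segments, hints and fragmentation):

#include <stdio.h>

#define SACKED  1
#define LOST    2

int main(void)
{
        /* head ... tail of the retransmit queue, invented SACK pattern */
        unsigned char q[] = { 0, SACKED, 0, SACKED, SACKED, 0, 0 };
        int n = (int)sizeof(q), packets = 2, cnt = 0, i;

        for (i = 0; i < n; i++) {
                if (q[i] & SACKED)
                        cnt++;
                if (cnt > packets)
                        break;          /* enough SACKed segments seen above */
                if (!(q[i] & SACKED))
                        q[i] |= LOST;
        }
        for (i = 0; i < n; i++)
                putchar((q[i] & LOST) ? 'L' : (q[i] & SACKED) ? 'S' : '.');
        putchar('\n');                  /* prints "LSLSS.." */
        return 0;
}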
@@ -2206,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2206 | const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; | 2151 | const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; |
2207 | 2152 | ||
2208 | WARN_ON(packets > tp->packets_out); | 2153 | WARN_ON(packets > tp->packets_out); |
2209 | if (tp->lost_skb_hint) { | 2154 | skb = tp->lost_skb_hint; |
2210 | skb = tp->lost_skb_hint; | 2155 | if (skb) { |
2211 | cnt = tp->lost_cnt_hint; | ||
2212 | /* Head already handled? */ | 2156 | /* Head already handled? */ |
2213 | if (mark_head && skb != tcp_write_queue_head(sk)) | 2157 | if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) |
2214 | return; | 2158 | return; |
2159 | cnt = tp->lost_cnt_hint; | ||
2215 | } else { | 2160 | } else { |
2216 | skb = tcp_write_queue_head(sk); | 2161 | skb = tcp_rtx_queue_head(sk); |
2217 | cnt = 0; | 2162 | cnt = 0; |
2218 | } | 2163 | } |
2219 | 2164 | ||
2220 | tcp_for_write_queue_from(skb, sk) { | 2165 | skb_rbtree_walk_from(skb) { |
2221 | if (skb == tcp_send_head(sk)) | ||
2222 | break; | ||
2223 | /* TODO: do this better */ | 2166 | /* TODO: do this better */ |
2224 | /* this is not the most efficient way to do this... */ | 2167 | /* this is not the most efficient way to do this... */ |
2225 | tp->lost_skb_hint = skb; | 2168 | tp->lost_skb_hint = skb; |
@@ -2229,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2229 | break; | 2172 | break; |
2230 | 2173 | ||
2231 | oldcnt = cnt; | 2174 | oldcnt = cnt; |
2232 | if (tcp_is_fack(tp) || tcp_is_reno(tp) || | 2175 | if (tcp_is_reno(tp) || |
2233 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 2176 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
2234 | cnt += tcp_skb_pcount(skb); | 2177 | cnt += tcp_skb_pcount(skb); |
2235 | 2178 | ||
2236 | if (cnt > packets) { | 2179 | if (cnt > packets) { |
2237 | if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || | 2180 | if (tcp_is_sack(tp) || |
2238 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || | 2181 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
2239 | (oldcnt >= packets)) | 2182 | (oldcnt >= packets)) |
2240 | break; | 2183 | break; |
@@ -2243,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2243 | /* If needed, chop off the prefix to mark as lost. */ | 2186 | /* If needed, chop off the prefix to mark as lost. */ |
2244 | lost = (packets - oldcnt) * mss; | 2187 | lost = (packets - oldcnt) * mss; |
2245 | if (lost < skb->len && | 2188 | if (lost < skb->len && |
2246 | tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) | 2189 | tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
2190 | lost, mss, GFP_ATOMIC) < 0) | ||
2247 | break; | 2191 | break; |
2248 | cnt = packets; | 2192 | cnt = packets; |
2249 | } | 2193 | } |
@@ -2264,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) | |||
2264 | 2208 | ||
2265 | if (tcp_is_reno(tp)) { | 2209 | if (tcp_is_reno(tp)) { |
2266 | tcp_mark_head_lost(sk, 1, 1); | 2210 | tcp_mark_head_lost(sk, 1, 1); |
2267 | } else if (tcp_is_fack(tp)) { | ||
2268 | int lost = tp->fackets_out - tp->reordering; | ||
2269 | if (lost <= 0) | ||
2270 | lost = 1; | ||
2271 | tcp_mark_head_lost(sk, lost, 0); | ||
2272 | } else { | 2211 | } else { |
2273 | int sacked_upto = tp->sacked_out - tp->reordering; | 2212 | int sacked_upto = tp->sacked_out - tp->reordering; |
2274 | if (sacked_upto >= 0) | 2213 | if (sacked_upto >= 0) |
@@ -2327,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) | |||
2327 | if (tp->retrans_out) | 2266 | if (tp->retrans_out) |
2328 | return true; | 2267 | return true; |
2329 | 2268 | ||
2330 | skb = tcp_write_queue_head(sk); | 2269 | skb = tcp_rtx_queue_head(sk); |
2331 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | 2270 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) |
2332 | return true; | 2271 | return true; |
2333 | 2272 | ||
2334 | return false; | 2273 | return false; |
2335 | } | 2274 | } |
2336 | 2275 | ||
2337 | #if FASTRETRANS_DEBUG > 1 | ||
2338 | static void DBGUNDO(struct sock *sk, const char *msg) | 2276 | static void DBGUNDO(struct sock *sk, const char *msg) |
2339 | { | 2277 | { |
2278 | #if FASTRETRANS_DEBUG > 1 | ||
2340 | struct tcp_sock *tp = tcp_sk(sk); | 2279 | struct tcp_sock *tp = tcp_sk(sk); |
2341 | struct inet_sock *inet = inet_sk(sk); | 2280 | struct inet_sock *inet = inet_sk(sk); |
2342 | 2281 | ||
@@ -2358,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg) | |||
2358 | tp->packets_out); | 2297 | tp->packets_out); |
2359 | } | 2298 | } |
2360 | #endif | 2299 | #endif |
2361 | } | ||
2362 | #else | ||
2363 | #define DBGUNDO(x...) do { } while (0) | ||
2364 | #endif | 2300 | #endif |
2301 | } | ||
2365 | 2302 | ||
2366 | static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) | 2303 | static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) |
2367 | { | 2304 | { |
@@ -2370,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) | |||
2370 | if (unmark_loss) { | 2307 | if (unmark_loss) { |
2371 | struct sk_buff *skb; | 2308 | struct sk_buff *skb; |
2372 | 2309 | ||
2373 | tcp_for_write_queue(skb, sk) { | 2310 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
2374 | if (skb == tcp_send_head(sk)) | ||
2375 | break; | ||
2376 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 2311 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
2377 | } | 2312 | } |
2378 | tp->lost_out = 0; | 2313 | tp->lost_out = 0; |
@@ -2417,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) | |||
2417 | mib_idx = LINUX_MIB_TCPFULLUNDO; | 2352 | mib_idx = LINUX_MIB_TCPFULLUNDO; |
2418 | 2353 | ||
2419 | NET_INC_STATS(sock_net(sk), mib_idx); | 2354 | NET_INC_STATS(sock_net(sk), mib_idx); |
2355 | } else if (tp->rack.reo_wnd_persist) { | ||
2356 | tp->rack.reo_wnd_persist--; | ||
2420 | } | 2357 | } |
2421 | if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { | 2358 | if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
2422 | /* Hold old state until something *above* high_seq | 2359 | /* Hold old state until something *above* high_seq |
@@ -2436,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) | |||
2436 | struct tcp_sock *tp = tcp_sk(sk); | 2373 | struct tcp_sock *tp = tcp_sk(sk); |
2437 | 2374 | ||
2438 | if (tp->undo_marker && !tp->undo_retrans) { | 2375 | if (tp->undo_marker && !tp->undo_retrans) { |
2376 | tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, | ||
2377 | tp->rack.reo_wnd_persist + 1); | ||
2439 | DBGUNDO(sk, "D-SACK"); | 2378 | DBGUNDO(sk, "D-SACK"); |
2440 | tcp_undo_cwnd_reduction(sk, false); | 2379 | tcp_undo_cwnd_reduction(sk, false); |
2441 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); | 2380 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); |
@@ -2616,9 +2555,7 @@ void tcp_simple_retransmit(struct sock *sk) | |||
2616 | struct sk_buff *skb; | 2555 | struct sk_buff *skb; |
2617 | unsigned int mss = tcp_current_mss(sk); | 2556 | unsigned int mss = tcp_current_mss(sk); |
2618 | 2557 | ||
2619 | tcp_for_write_queue(skb, sk) { | 2558 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
2620 | if (skb == tcp_send_head(sk)) | ||
2621 | break; | ||
2622 | if (tcp_skb_seglen(skb) > mss && | 2559 | if (tcp_skb_seglen(skb) > mss && |
2623 | !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | 2560 | !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
2624 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { | 2561 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
@@ -2712,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, | |||
2712 | * is updated in tcp_ack()). Otherwise fall back to | 2649 | * is updated in tcp_ack()). Otherwise fall back to |
2713 | * the conventional recovery. | 2650 | * the conventional recovery. |
2714 | */ | 2651 | */ |
2715 | if (tcp_send_head(sk) && | 2652 | if (!tcp_write_queue_empty(sk) && |
2716 | after(tcp_wnd_end(tp), tp->snd_nxt)) { | 2653 | after(tcp_wnd_end(tp), tp->snd_nxt)) { |
2717 | *rexmit = REXMIT_NEW; | 2654 | *rexmit = REXMIT_NEW; |
2718 | return; | 2655 | return; |
@@ -2739,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, | |||
2739 | } | 2676 | } |
2740 | 2677 | ||
2741 | /* Undo during fast recovery after partial ACK. */ | 2678 | /* Undo during fast recovery after partial ACK. */ |
2742 | static bool tcp_try_undo_partial(struct sock *sk, const int acked) | 2679 | static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) |
2743 | { | 2680 | { |
2744 | struct tcp_sock *tp = tcp_sk(sk); | 2681 | struct tcp_sock *tp = tcp_sk(sk); |
2745 | 2682 | ||
2746 | if (tp->undo_marker && tcp_packet_delayed(tp)) { | 2683 | if (tp->undo_marker && tcp_packet_delayed(tp)) { |
2747 | /* Plain luck! Hole is filled with delayed | 2684 | /* Plain luck! Hole is filled with delayed |
2748 | * packet, rather than with a retransmit. | 2685 | * packet, rather than with a retransmit. Check reordering. |
2749 | */ | 2686 | */ |
2750 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); | 2687 | tcp_check_sack_reordering(sk, prior_snd_una, 1); |
2751 | 2688 | ||
2752 | /* We are getting evidence that the reordering degree is higher | 2689 | /* We are getting evidence that the reordering degree is higher |
2753 | * than we realized. If there are no retransmits out then we | 2690 | * than we realized. If there are no retransmits out then we |
@@ -2774,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) | |||
2774 | struct tcp_sock *tp = tcp_sk(sk); | 2711 | struct tcp_sock *tp = tcp_sk(sk); |
2775 | 2712 | ||
2776 | /* Use RACK to detect loss */ | 2713 | /* Use RACK to detect loss */ |
2777 | if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { | 2714 | if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { |
2778 | u32 prior_retrans = tp->retrans_out; | 2715 | u32 prior_retrans = tp->retrans_out; |
2779 | 2716 | ||
2780 | tcp_rack_mark_lost(sk); | 2717 | tcp_rack_mark_lost(sk); |
@@ -2783,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) | |||
2783 | } | 2720 | } |
2784 | } | 2721 | } |
2785 | 2722 | ||
2723 | static bool tcp_force_fast_retransmit(struct sock *sk) | ||
2724 | { | ||
2725 | struct tcp_sock *tp = tcp_sk(sk); | ||
2726 | |||
2727 | return after(tcp_highest_sack_seq(tp), | ||
2728 | tp->snd_una + tp->reordering * tp->mss_cache); | ||
2729 | } | ||
2730 | |||
2786 | /* Process an event, which can update packets-in-flight not trivially. | 2731 | /* Process an event, which can update packets-in-flight not trivially. |
2787 | * Main goal of this function is to calculate new estimate for left_out, | 2732 | * Main goal of this function is to calculate new estimate for left_out, |
2788 | * taking into account both packets sitting in receiver's buffer and | 2733 | * taking into account both packets sitting in receiver's buffer and |
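tcp_force_fast_retransmit() is the sequence-space stand-in for the old "fackets_out > reordering" test: fast retransmit is forced once the highest SACKed sequence lies more than reordering * mss_cache bytes beyond snd_una. A worked example with invented numbers (the kernel uses its wraparound-safe after() here):

#include <stdio.h>
#include <stdint.h>

static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;
}

int main(void)
{
        uint32_t snd_una = 100000;
        uint32_t highest_sack_seq = 104500;
        uint32_t reordering = 3, mss_cache = 1448;
        uint32_t threshold = snd_una + reordering * mss_cache;
        int force = seq_after(highest_sack_seq, threshold);

        printf("threshold=%u force=%d\n", (unsigned int)threshold, force);
        return 0;                       /* threshold=104344 force=1 */
}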
@@ -2795,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) | |||
2795 | * It does _not_ decide what to send, it is made in function | 2740 | * It does _not_ decide what to send, it is made in function |
2796 | * tcp_xmit_retransmit_queue(). | 2741 | * tcp_xmit_retransmit_queue(). |
2797 | */ | 2742 | */ |
2798 | static void tcp_fastretrans_alert(struct sock *sk, const int acked, | 2743 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
2799 | bool is_dupack, int *ack_flag, int *rexmit) | 2744 | bool is_dupack, int *ack_flag, int *rexmit) |
2800 | { | 2745 | { |
2801 | struct inet_connection_sock *icsk = inet_csk(sk); | 2746 | struct inet_connection_sock *icsk = inet_csk(sk); |
2802 | struct tcp_sock *tp = tcp_sk(sk); | 2747 | struct tcp_sock *tp = tcp_sk(sk); |
2803 | int fast_rexmit = 0, flag = *ack_flag; | 2748 | int fast_rexmit = 0, flag = *ack_flag; |
2804 | bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && | 2749 | bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
2805 | (tcp_fackets_out(tp) > tp->reordering)); | 2750 | tcp_force_fast_retransmit(sk)); |
2806 | 2751 | ||
2807 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) | 2752 | if (!tp->packets_out && tp->sacked_out) |
2808 | tp->sacked_out = 0; | 2753 | tp->sacked_out = 0; |
2809 | if (WARN_ON(!tp->sacked_out && tp->fackets_out)) | ||
2810 | tp->fackets_out = 0; | ||
2811 | 2754 | ||
2812 | /* Now state machine starts. | 2755 | /* Now state machine starts. |
2813 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ | 2756 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ |
@@ -2854,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2854 | if (tcp_is_reno(tp) && is_dupack) | 2797 | if (tcp_is_reno(tp) && is_dupack) |
2855 | tcp_add_reno_sack(sk); | 2798 | tcp_add_reno_sack(sk); |
2856 | } else { | 2799 | } else { |
2857 | if (tcp_try_undo_partial(sk, acked)) | 2800 | if (tcp_try_undo_partial(sk, prior_snd_una)) |
2858 | return; | 2801 | return; |
2859 | /* Partial ACK arrived. Force fast retransmit. */ | 2802 | /* Partial ACK arrived. Force fast retransmit. */ |
2860 | do_lost = tcp_is_reno(tp) || | 2803 | do_lost = tcp_is_reno(tp) || |
2861 | tcp_fackets_out(tp) > tp->reordering; | 2804 | tcp_force_fast_retransmit(sk); |
2862 | } | 2805 | } |
2863 | if (tcp_try_undo_dsack(sk)) { | 2806 | if (tcp_try_undo_dsack(sk)) { |
2864 | tcp_try_keep_open(sk); | 2807 | tcp_try_keep_open(sk); |
@@ -2873,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2873 | (*ack_flag & FLAG_LOST_RETRANS))) | 2816 | (*ack_flag & FLAG_LOST_RETRANS))) |
2874 | return; | 2817 | return; |
2875 | /* Change state if cwnd is undone or retransmits are lost */ | 2818 | /* Change state if cwnd is undone or retransmits are lost */ |
2819 | /* fall through */ | ||
2876 | default: | 2820 | default: |
2877 | if (tcp_is_reno(tp)) { | 2821 | if (tcp_is_reno(tp)) { |
2878 | if (flag & FLAG_SND_UNA_ADVANCED) | 2822 | if (flag & FLAG_SND_UNA_ADVANCED) |
@@ -2913,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2913 | 2857 | ||
2914 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) | 2858 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) |
2915 | { | 2859 | { |
2860 | u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; | ||
2916 | struct tcp_sock *tp = tcp_sk(sk); | 2861 | struct tcp_sock *tp = tcp_sk(sk); |
2917 | u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; | ||
2918 | 2862 | ||
2919 | minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, | 2863 | minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, |
2920 | rtt_us ? : jiffies_to_usecs(1)); | 2864 | rtt_us ? : jiffies_to_usecs(1)); |
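Only the window length changes hands here: tcp_min_rtt_wlen becomes a per-netns sysctl (seconds, default 300 per the removed global earlier in this diff), and the filter keeps the minimum RTT observed during that window. A brute-force, standalone illustration of such a windowed minimum (the kernel's minmax_running_min() gets the same answer while storing only three samples):

#include <stdio.h>

struct sample { double t_sec; unsigned int rtt_us; };

/* Toy windowed minimum over the last wlen seconds; illustration only. */
static unsigned int windowed_min(const struct sample *s, int n,
                                 double now, double wlen)
{
        unsigned int min = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (s[i].t_sec < now - wlen)
                        continue;       /* sample aged out of the window */
                if (!min || s[i].rtt_us < min)
                        min = s[i].rtt_us;
        }
        return min;
}

int main(void)
{
        const struct sample s[] = {
                { 10.0, 5000 }, { 200.0, 9000 }, { 350.0, 12000 },
        };

        /* at t=400s with a 300s window, the 5000us sample has expired */
        printf("%u us\n", windowed_min(s, 3, 400.0, 300.0));    /* 9000 */
        return 0;
}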
@@ -3056,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, | |||
3056 | 3000 | ||
3057 | shinfo = skb_shinfo(skb); | 3001 | shinfo = skb_shinfo(skb); |
3058 | if (!before(shinfo->tskey, prior_snd_una) && | 3002 | if (!before(shinfo->tskey, prior_snd_una) && |
3059 | before(shinfo->tskey, tcp_sk(sk)->snd_una)) | 3003 | before(shinfo->tskey, tcp_sk(sk)->snd_una)) { |
3060 | __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); | 3004 | tcp_skb_tsorted_save(skb) { |
3005 | __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); | ||
3006 | } tcp_skb_tsorted_restore(skb); | ||
3007 | } | ||
3061 | } | 3008 | } |
3062 | 3009 | ||
3063 | /* Remove acknowledged frames from the retransmission queue. If our packet | 3010 | /* Remove acknowledged frames from the retransmission queue. If our packet |
3064 | * is before the ack sequence we can discard it as it's confirmed to have | 3011 | * is before the ack sequence we can discard it as it's confirmed to have |
3065 | * arrived at the other end. | 3012 | * arrived at the other end. |
3066 | */ | 3013 | */ |
3067 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | 3014 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
3068 | u32 prior_snd_una, int *acked, | 3015 | u32 prior_snd_una, |
3069 | struct tcp_sacktag_state *sack) | 3016 | struct tcp_sacktag_state *sack) |
3070 | { | 3017 | { |
3071 | const struct inet_connection_sock *icsk = inet_csk(sk); | 3018 | const struct inet_connection_sock *icsk = inet_csk(sk); |
3072 | u64 first_ackt, last_ackt; | 3019 | u64 first_ackt, last_ackt; |
3073 | struct tcp_sock *tp = tcp_sk(sk); | 3020 | struct tcp_sock *tp = tcp_sk(sk); |
3074 | u32 prior_sacked = tp->sacked_out; | 3021 | u32 prior_sacked = tp->sacked_out; |
3075 | u32 reord = tp->packets_out; | 3022 | u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ |
3023 | struct sk_buff *skb, *next; | ||
3076 | bool fully_acked = true; | 3024 | bool fully_acked = true; |
3077 | long sack_rtt_us = -1L; | 3025 | long sack_rtt_us = -1L; |
3078 | long seq_rtt_us = -1L; | 3026 | long seq_rtt_us = -1L; |
3079 | long ca_rtt_us = -1L; | 3027 | long ca_rtt_us = -1L; |
3080 | struct sk_buff *skb; | ||
3081 | u32 pkts_acked = 0; | 3028 | u32 pkts_acked = 0; |
3082 | u32 last_in_flight = 0; | 3029 | u32 last_in_flight = 0; |
3083 | bool rtt_update; | 3030 | bool rtt_update; |
@@ -3085,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3085 | 3032 | ||
3086 | first_ackt = 0; | 3033 | first_ackt = 0; |
3087 | 3034 | ||
3088 | while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { | 3035 | for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { |
3089 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); | 3036 | struct tcp_skb_cb *scb = TCP_SKB_CB(skb); |
3037 | const u32 start_seq = scb->seq; | ||
3090 | u8 sacked = scb->sacked; | 3038 | u8 sacked = scb->sacked; |
3091 | u32 acked_pcount; | 3039 | u32 acked_pcount; |
3092 | 3040 | ||
@@ -3103,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3103 | break; | 3051 | break; |
3104 | fully_acked = false; | 3052 | fully_acked = false; |
3105 | } else { | 3053 | } else { |
3106 | /* Speedup tcp_unlink_write_queue() and next loop */ | ||
3107 | prefetchw(skb->next); | ||
3108 | acked_pcount = tcp_skb_pcount(skb); | 3054 | acked_pcount = tcp_skb_pcount(skb); |
3109 | } | 3055 | } |
3110 | 3056 | ||
@@ -3119,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3119 | first_ackt = last_ackt; | 3065 | first_ackt = last_ackt; |
3120 | 3066 | ||
3121 | last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; | 3067 | last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; |
3122 | reord = min(pkts_acked, reord); | 3068 | if (before(start_seq, reord)) |
3069 | reord = start_seq; | ||
3123 | if (!after(scb->end_seq, tp->high_seq)) | 3070 | if (!after(scb->end_seq, tp->high_seq)) |
3124 | flag |= FLAG_ORIG_SACK_ACKED; | 3071 | flag |= FLAG_ORIG_SACK_ACKED; |
3125 | } | 3072 | } |
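The hunk above replaces the fackets-based reordering counter with a sequence watermark: `reord` starts at `tp->snd_nxt` and is pulled down to the lowest newly-acked, never-retransmitted sequence via `before()`. That comparison has to survive 32-bit sequence wraparound. Below is a minimal standalone sketch of the wraparound-safe compare; `seq_before()` is a local stand-in for the kernel's `before()` macro, and the numbers are invented.

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch: wraparound-safe sequence comparison in the style
 * of the kernel's before()/after() helpers.  seq_before() is a local
 * stand-in, not the kernel macro. */
static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

int main(void)
{
        uint32_t reord = 10;              /* pretend snd_nxt just wrapped */
        uint32_t start_seq = 4294967290u; /* segment sent before the wrap */

        /* numerically larger, yet earlier in sequence space */
        if (seq_before(start_seq, reord))
                reord = start_seq;
        printf("lowest newly-acked seq: %u\n", reord);
        return 0;
}
```

The same signed-difference trick is what lets `if (before(start_seq, reord))` in the hunk keep tracking the lowest sequence even across a wrap.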
@@ -3156,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3156 | if (!fully_acked) | 3103 | if (!fully_acked) |
3157 | break; | 3104 | break; |
3158 | 3105 | ||
3159 | tcp_unlink_write_queue(skb, sk); | 3106 | next = skb_rb_next(skb); |
3160 | sk_wmem_free_skb(sk, skb); | ||
3161 | if (unlikely(skb == tp->retransmit_skb_hint)) | 3107 | if (unlikely(skb == tp->retransmit_skb_hint)) |
3162 | tp->retransmit_skb_hint = NULL; | 3108 | tp->retransmit_skb_hint = NULL; |
3163 | if (unlikely(skb == tp->lost_skb_hint)) | 3109 | if (unlikely(skb == tp->lost_skb_hint)) |
3164 | tp->lost_skb_hint = NULL; | 3110 | tp->lost_skb_hint = NULL; |
3111 | tcp_rtx_queue_unlink_and_free(skb, sk); | ||
3165 | } | 3112 | } |
3166 | 3113 | ||
3167 | if (!skb) | 3114 | if (!skb) |
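With the retransmit queue now kept in an rbtree, the cleanup loop above fetches `skb_rb_next(skb)` before calling `tcp_rtx_queue_unlink_and_free()`, because the node cannot be dereferenced once it is freed. A toy user-space version of that pattern is sketched below; a singly linked list stands in for the rbtree and the field names (`end_seq`, `next`) are made up for the example.

```c
#include <stdio.h>
#include <stdlib.h>

/* Sketch of the removal loop: grab the successor first, stop at the
 * first packet that is not fully acknowledged, free the rest. */
struct pkt {
        unsigned int end_seq;
        struct pkt *next;
};

static struct pkt *clean_acked(struct pkt *head, unsigned int snd_una)
{
        struct pkt *skb, *next;

        for (skb = head; skb; skb = next) {
                next = skb->next;               /* grab successor first */
                if (skb->end_seq > snd_una)     /* not fully acked: stop */
                        break;
                free(skb);
                head = next;
        }
        return head;
}

int main(void)
{
        struct pkt *head = NULL, **tail = &head;
        unsigned int seqs[] = { 100, 200, 300, 400 };

        for (int i = 0; i < 4; i++) {
                struct pkt *p = malloc(sizeof(*p));

                p->end_seq = seqs[i];
                p->next = NULL;
                *tail = p;
                tail = &p->next;
        }
        head = clean_acked(head, 250);          /* acks the first two */
        for (struct pkt *p = head; p; p = p->next)
                printf("%u ", p->end_seq);      /* prints: 300 400 */
        printf("\n");
        return 0;
}
```

Stopping at the first segment that is not fully covered by the ACK mirrors the `!fully_acked` break in the hunk.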
@@ -3197,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3197 | int delta; | 3144 | int delta; |
3198 | 3145 | ||
3199 | /* Non-retransmitted hole got filled? That's reordering */ | 3146 | /* Non-retransmitted hole got filled? That's reordering */ |
3200 | if (reord < prior_fackets && reord <= tp->fackets_out) | 3147 | if (before(reord, prior_fack)) |
3201 | tcp_update_reordering(sk, tp->fackets_out - reord, 0); | 3148 | tcp_check_sack_reordering(sk, reord, 0); |
3202 | 3149 | ||
3203 | delta = tcp_is_fack(tp) ? pkts_acked : | 3150 | delta = prior_sacked - tp->sacked_out; |
3204 | prior_sacked - tp->sacked_out; | ||
3205 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); | 3151 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
3206 | } | 3152 | } |
3207 | |||
3208 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); | ||
3209 | |||
3210 | } else if (skb && rtt_update && sack_rtt_us >= 0 && | 3153 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
3211 | sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { | 3154 | sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { |
3212 | /* Do not re-arm RTO if the sack RTT is measured from data sent | 3155 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
@@ -3247,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3247 | } | 3190 | } |
3248 | } | 3191 | } |
3249 | #endif | 3192 | #endif |
3250 | *acked = pkts_acked; | ||
3251 | return flag; | 3193 | return flag; |
3252 | } | 3194 | } |
3253 | 3195 | ||
3254 | static void tcp_ack_probe(struct sock *sk) | 3196 | static void tcp_ack_probe(struct sock *sk) |
3255 | { | 3197 | { |
3256 | const struct tcp_sock *tp = tcp_sk(sk); | ||
3257 | struct inet_connection_sock *icsk = inet_csk(sk); | 3198 | struct inet_connection_sock *icsk = inet_csk(sk); |
3199 | struct sk_buff *head = tcp_send_head(sk); | ||
3200 | const struct tcp_sock *tp = tcp_sk(sk); | ||
3258 | 3201 | ||
3259 | /* Was it a usable window open? */ | 3202 | /* Was it a usable window open? */ |
3260 | 3203 | if (!head) | |
3261 | if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { | 3204 | return; |
3205 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { | ||
3262 | icsk->icsk_backoff = 0; | 3206 | icsk->icsk_backoff = 0; |
3263 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); | 3207 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
3264 | /* Socket must be waked up by subsequent tcp_data_snd_check(). | 3208 | /* Socket must be waked up by subsequent tcp_data_snd_check(). |
@@ -3378,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 | |||
3378 | tp->pred_flags = 0; | 3322 | tp->pred_flags = 0; |
3379 | tcp_fast_path_check(sk); | 3323 | tcp_fast_path_check(sk); |
3380 | 3324 | ||
3381 | if (tcp_send_head(sk)) | 3325 | if (!tcp_write_queue_empty(sk)) |
3382 | tcp_slow_start_after_idle_check(sk); | 3326 | tcp_slow_start_after_idle_check(sk); |
3383 | 3327 | ||
3384 | if (nwin > tp->max_window) { | 3328 | if (nwin > tp->max_window) { |
@@ -3399,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, | |||
3399 | if (*last_oow_ack_time) { | 3343 | if (*last_oow_ack_time) { |
3400 | s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); | 3344 | s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
3401 | 3345 | ||
3402 | if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { | 3346 | if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
3403 | NET_INC_STATS(net, mib_idx); | 3347 | NET_INC_STATS(net, mib_idx); |
3404 | return true; /* rate-limited: don't send yet! */ | 3348 | return true; /* rate-limited: don't send yet! */ |
3405 | } | 3349 | } |
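This hunk only switches `sysctl_tcp_invalid_ratelimit` to its per-netns copy; the rate-limiting idea itself is unchanged: remember when the last out-of-window ACK went out and suppress further ones until the configured interval has elapsed. A self-contained sketch of that check, with plain integers standing in for jiffies and the sysctl value:

```c
#include <stdio.h>

/* Per-socket out-of-window ACK rate limit, simplified: the signed
 * subtraction keeps the elapsed-time check wraparound-safe. */
static unsigned int last_oow_ack_time;          /* 0 = never sent */
static const int invalid_ratelimit = 500;       /* ticks, roughly HZ/2 */

static int oow_rate_limited(unsigned int now)
{
        if (last_oow_ack_time) {
                int elapsed = (int)(now - last_oow_ack_time);

                if (elapsed >= 0 && elapsed < invalid_ratelimit)
                        return 1;               /* don't send yet */
        }
        last_oow_ack_time = now;
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               oow_rate_limited(1000),          /* first: allowed */
               oow_rate_limited(1200),          /* too soon: limited */
               oow_rate_limited(1600));         /* interval passed: allowed */
        return 0;
}
```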
@@ -3435,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) | |||
3435 | static u32 challenge_timestamp; | 3379 | static u32 challenge_timestamp; |
3436 | static unsigned int challenge_count; | 3380 | static unsigned int challenge_count; |
3437 | struct tcp_sock *tp = tcp_sk(sk); | 3381 | struct tcp_sock *tp = tcp_sk(sk); |
3382 | struct net *net = sock_net(sk); | ||
3438 | u32 count, now; | 3383 | u32 count, now; |
3439 | 3384 | ||
3440 | /* First check our per-socket dupack rate limit. */ | 3385 | /* First check our per-socket dupack rate limit. */ |
3441 | if (__tcp_oow_rate_limited(sock_net(sk), | 3386 | if (__tcp_oow_rate_limited(net, |
3442 | LINUX_MIB_TCPACKSKIPPEDCHALLENGE, | 3387 | LINUX_MIB_TCPACKSKIPPEDCHALLENGE, |
3443 | &tp->last_oow_ack_time)) | 3388 | &tp->last_oow_ack_time)) |
3444 | return; | 3389 | return; |
@@ -3446,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) | |||
3446 | /* Then check host-wide RFC 5961 rate limit. */ | 3391 | /* Then check host-wide RFC 5961 rate limit. */ |
3447 | now = jiffies / HZ; | 3392 | now = jiffies / HZ; |
3448 | if (now != challenge_timestamp) { | 3393 | if (now != challenge_timestamp) { |
3449 | u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; | 3394 | u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
3395 | u32 half = (ack_limit + 1) >> 1; | ||
3450 | 3396 | ||
3451 | challenge_timestamp = now; | 3397 | challenge_timestamp = now; |
3452 | WRITE_ONCE(challenge_count, half + | 3398 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
3453 | prandom_u32_max(sysctl_tcp_challenge_ack_limit)); | ||
3454 | } | 3399 | } |
3455 | count = READ_ONCE(challenge_count); | 3400 | count = READ_ONCE(challenge_count); |
3456 | if (count > 0) { | 3401 | if (count > 0) { |
3457 | WRITE_ONCE(challenge_count, count - 1); | 3402 | WRITE_ONCE(challenge_count, count - 1); |
3458 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); | 3403 | NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); |
3459 | tcp_send_ack(sk); | 3404 | tcp_send_ack(sk); |
3460 | } | 3405 | } |
3461 | } | 3406 | } |
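The challenge-ACK hunk likewise only reads the limit from `net->ipv4`; the host-wide RFC 5961 budget works as before: once per second the counter is refilled to half the limit plus a random extra, and each challenge ACK spends one token. A standalone approximation follows, with `time()` and `rand()` replacing jiffies, `prandom_u32_max()` and the `READ_ONCE`/`WRITE_ONCE` accessors.

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Rough user-space model of the host-wide challenge-ACK budget. */
static unsigned int ack_limit = 1000;   /* sysctl_tcp_challenge_ack_limit */
static time_t challenge_timestamp;
static unsigned int challenge_count;

static int send_challenge_ack(void)
{
        time_t now = time(NULL);

        if (now != challenge_timestamp) {
                unsigned int half = (ack_limit + 1) / 2;

                challenge_timestamp = now;
                challenge_count = half + (unsigned int)rand() % ack_limit;
        }
        if (challenge_count > 0) {
                challenge_count--;
                return 1;               /* ACK may be sent */
        }
        return 0;                       /* rate limited */
}

int main(void)
{
        int sent = 0;

        srand((unsigned int)time(NULL));
        for (int i = 0; i < 5000; i++)
                sent += send_challenge_ack();
        printf("sent %d of 5000 challenge ACKs\n", sent);
        return 0;
}
```

Randomizing the refill keeps an off-path attacker from learning the exact budget by counting responses, which is the point of the `prandom_u32_max()` term in the real code.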
@@ -3553,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3553 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3498 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3554 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3499 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3555 | bool is_dupack = false; | 3500 | bool is_dupack = false; |
3556 | u32 prior_fackets; | ||
3557 | int prior_packets = tp->packets_out; | 3501 | int prior_packets = tp->packets_out; |
3558 | u32 delivered = tp->delivered; | 3502 | u32 delivered = tp->delivered; |
3559 | u32 lost = tp->lost; | 3503 | u32 lost = tp->lost; |
3560 | int acked = 0; /* Number of packets newly acked */ | ||
3561 | int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ | 3504 | int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ |
3505 | u32 prior_fack; | ||
3562 | 3506 | ||
3563 | sack_state.first_sackt = 0; | 3507 | sack_state.first_sackt = 0; |
3564 | sack_state.rate = &rs; | 3508 | sack_state.rate = &rs; |
3565 | 3509 | ||
3566 | /* We very likely will need to access write queue head. */ | 3510 | /* We very likely will need to access rtx queue. */ |
3567 | prefetchw(sk->sk_write_queue.next); | 3511 | prefetch(sk->tcp_rtx_queue.rb_node); |
3568 | 3512 | ||
3569 | /* If the ack is older than previous acks | 3513 | /* If the ack is older than previous acks |
3570 | * then we can probably ignore it. | 3514 | * then we can probably ignore it. |
@@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3590 | icsk->icsk_retransmits = 0; | 3534 | icsk->icsk_retransmits = 0; |
3591 | } | 3535 | } |
3592 | 3536 | ||
3593 | prior_fackets = tp->fackets_out; | 3537 | prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; |
3594 | rs.prior_in_flight = tcp_packets_in_flight(tp); | 3538 | rs.prior_in_flight = tcp_packets_in_flight(tp); |
3595 | 3539 | ||
3596 | /* ts_recent update must be made after we are sure that the packet | 3540 | /* ts_recent update must be made after we are sure that the packet |
@@ -3646,8 +3590,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3646 | goto no_queue; | 3590 | goto no_queue; |
3647 | 3591 | ||
3648 | /* See if we can take anything off of the retransmit queue. */ | 3592 | /* See if we can take anything off of the retransmit queue. */ |
3649 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, | 3593 | flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); |
3650 | &sack_state); | 3594 | |
3595 | tcp_rack_update_reo_wnd(sk, &rs); | ||
3651 | 3596 | ||
3652 | if (tp->tlp_high_seq) | 3597 | if (tp->tlp_high_seq) |
3653 | tcp_process_tlp_ack(sk, ack, flag); | 3598 | tcp_process_tlp_ack(sk, ack, flag); |
@@ -3657,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3657 | 3602 | ||
3658 | if (tcp_ack_is_dubious(sk, flag)) { | 3603 | if (tcp_ack_is_dubious(sk, flag)) { |
3659 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3604 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
3660 | tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); | 3605 | tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
3606 | &rexmit); | ||
3661 | } | 3607 | } |
3662 | 3608 | ||
3663 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) | 3609 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) |
@@ -3673,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3673 | no_queue: | 3619 | no_queue: |
3674 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ | 3620 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
3675 | if (flag & FLAG_DSACKING_ACK) | 3621 | if (flag & FLAG_DSACKING_ACK) |
3676 | tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); | 3622 | tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
3623 | &rexmit); | ||
3677 | /* If this ack opens up a zero window, clear backoff. It was | 3624 | /* If this ack opens up a zero window, clear backoff. It was |
3678 | * being used to time the probes, and is probably far higher than | 3625 | * being used to time the probes, and is probably far higher than |
3679 | * it needs to be for normal retransmission. | 3626 | * it needs to be for normal retransmission. |
3680 | */ | 3627 | */ |
3681 | if (tcp_send_head(sk)) | 3628 | tcp_ack_probe(sk); |
3682 | tcp_ack_probe(sk); | ||
3683 | 3629 | ||
3684 | if (tp->tlp_high_seq) | 3630 | if (tp->tlp_high_seq) |
3685 | tcp_process_tlp_ack(sk, ack, flag); | 3631 | tcp_process_tlp_ack(sk, ack, flag); |
@@ -3696,7 +3642,8 @@ old_ack: | |||
3696 | if (TCP_SKB_CB(skb)->sacked) { | 3642 | if (TCP_SKB_CB(skb)->sacked) { |
3697 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3643 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3698 | &sack_state); | 3644 | &sack_state); |
3699 | tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); | 3645 | tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
3646 | &rexmit); | ||
3700 | tcp_xmit_recovery(sk, rexmit); | 3647 | tcp_xmit_recovery(sk, rexmit); |
3701 | } | 3648 | } |
3702 | 3649 | ||
@@ -3721,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, | |||
3721 | foc->exp = exp_opt; | 3668 | foc->exp = exp_opt; |
3722 | } | 3669 | } |
3723 | 3670 | ||
3671 | static void smc_parse_options(const struct tcphdr *th, | ||
3672 | struct tcp_options_received *opt_rx, | ||
3673 | const unsigned char *ptr, | ||
3674 | int opsize) | ||
3675 | { | ||
3676 | #if IS_ENABLED(CONFIG_SMC) | ||
3677 | if (static_branch_unlikely(&tcp_have_smc)) { | ||
3678 | if (th->syn && !(opsize & 1) && | ||
3679 | opsize >= TCPOLEN_EXP_SMC_BASE && | ||
3680 | get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) | ||
3681 | opt_rx->smc_ok = 1; | ||
3682 | } | ||
3683 | #endif | ||
3684 | } | ||
3685 | |||
3724 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. | 3686 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
3725 | * But, this can also be called on packets in the established flow when | 3687 | * But, this can also be called on packets in the established flow when |
3726 | * the fast version below fails. | 3688 | * the fast version below fails. |
@@ -3828,6 +3790,9 @@ void tcp_parse_options(const struct net *net, | |||
3828 | tcp_parse_fastopen_option(opsize - | 3790 | tcp_parse_fastopen_option(opsize - |
3829 | TCPOLEN_EXP_FASTOPEN_BASE, | 3791 | TCPOLEN_EXP_FASTOPEN_BASE, |
3830 | ptr + 2, th->syn, foc, true); | 3792 | ptr + 2, th->syn, foc, true); |
3793 | else | ||
3794 | smc_parse_options(th, opt_rx, ptr, | ||
3795 | opsize); | ||
3831 | break; | 3796 | break; |
3832 | 3797 | ||
3833 | } | 3798 | } |
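`smc_parse_options()`, added above and wired into the experimental-option branch of the parser, recognizes the SMC option by reading a 32-bit magic from the option payload with `get_unaligned_be32()`. The sketch below shows only that check; the byte-wise load is a stand-in for `get_unaligned_be32()`, and the magic value is a made-up placeholder, not the real `TCPOPT_SMC_MAGIC`.

```c
#include <stdint.h>
#include <stdio.h>

#define FAKE_SMC_MAGIC 0x12345678u      /* placeholder only */

/* Byte-wise big-endian load; works regardless of alignment. */
static uint32_t get_be32(const unsigned char *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  | (uint32_t)p[3];
}

int main(void)
{
        /* experimental option payload: 32-bit magic, then option data */
        unsigned char opt[] = { 0x12, 0x34, 0x56, 0x78 };
        int smc_ok = (get_be32(opt) == FAKE_SMC_MAGIC);

        printf("smc_ok = %d\n", smc_ok);
        return 0;
}
```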
@@ -3995,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) | |||
3995 | /* When we get a reset we do this. */ | 3960 | /* When we get a reset we do this. */ |
3996 | void tcp_reset(struct sock *sk) | 3961 | void tcp_reset(struct sock *sk) |
3997 | { | 3962 | { |
3963 | trace_tcp_receive_reset(sk); | ||
3964 | |||
3998 | /* We want the right error as BSD sees it (and indeed as we do). */ | 3965 | /* We want the right error as BSD sees it (and indeed as we do). */ |
3999 | switch (sk->sk_state) { | 3966 | switch (sk->sk_state) { |
4000 | case TCP_SYN_SENT: | 3967 | case TCP_SYN_SENT: |
@@ -4117,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) | |||
4117 | { | 4084 | { |
4118 | struct tcp_sock *tp = tcp_sk(sk); | 4085 | struct tcp_sock *tp = tcp_sk(sk); |
4119 | 4086 | ||
4120 | if (tcp_is_sack(tp) && sysctl_tcp_dsack) { | 4087 | if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
4121 | int mib_idx; | 4088 | int mib_idx; |
4122 | 4089 | ||
4123 | if (before(seq, tp->rcv_nxt)) | 4090 | if (before(seq, tp->rcv_nxt)) |
@@ -4152,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) | |||
4152 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); | 4119 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
4153 | tcp_enter_quickack_mode(sk); | 4120 | tcp_enter_quickack_mode(sk); |
4154 | 4121 | ||
4155 | if (tcp_is_sack(tp) && sysctl_tcp_dsack) { | 4122 | if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
4156 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 4123 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
4157 | 4124 | ||
4158 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) | 4125 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
@@ -4268,11 +4235,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) | |||
4268 | tp->rx_opt.num_sacks = num_sacks; | 4235 | tp->rx_opt.num_sacks = num_sacks; |
4269 | } | 4236 | } |
4270 | 4237 | ||
4271 | enum tcp_queue { | ||
4272 | OOO_QUEUE, | ||
4273 | RCV_QUEUE, | ||
4274 | }; | ||
4275 | |||
4276 | /** | 4238 | /** |
4277 | * tcp_try_coalesce - try to merge skb to prior one | 4239 | * tcp_try_coalesce - try to merge skb to prior one |
4278 | * @sk: socket | 4240 | * @sk: socket |
@@ -4288,7 +4250,6 @@ enum tcp_queue { | |||
4288 | * Returns true if caller should free @from instead of queueing it | 4250 | * Returns true if caller should free @from instead of queueing it |
4289 | */ | 4251 | */ |
4290 | static bool tcp_try_coalesce(struct sock *sk, | 4252 | static bool tcp_try_coalesce(struct sock *sk, |
4291 | enum tcp_queue dest, | ||
4292 | struct sk_buff *to, | 4253 | struct sk_buff *to, |
4293 | struct sk_buff *from, | 4254 | struct sk_buff *from, |
4294 | bool *fragstolen) | 4255 | bool *fragstolen) |
@@ -4313,10 +4274,7 @@ static bool tcp_try_coalesce(struct sock *sk, | |||
4313 | 4274 | ||
4314 | if (TCP_SKB_CB(from)->has_rxtstamp) { | 4275 | if (TCP_SKB_CB(from)->has_rxtstamp) { |
4315 | TCP_SKB_CB(to)->has_rxtstamp = true; | 4276 | TCP_SKB_CB(to)->has_rxtstamp = true; |
4316 | if (dest == OOO_QUEUE) | 4277 | to->tstamp = from->tstamp; |
4317 | TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; | ||
4318 | else | ||
4319 | to->tstamp = from->tstamp; | ||
4320 | } | 4278 | } |
4321 | 4279 | ||
4322 | return true; | 4280 | return true; |
@@ -4341,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4341 | 4299 | ||
4342 | p = rb_first(&tp->out_of_order_queue); | 4300 | p = rb_first(&tp->out_of_order_queue); |
4343 | while (p) { | 4301 | while (p) { |
4344 | skb = rb_entry(p, struct sk_buff, rbnode); | 4302 | skb = rb_to_skb(p); |
4345 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) | 4303 | if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
4346 | break; | 4304 | break; |
4347 | 4305 | ||
@@ -4353,9 +4311,6 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4353 | } | 4311 | } |
4354 | p = rb_next(p); | 4312 | p = rb_next(p); |
4355 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); | 4313 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
4356 | /* Replace tstamp which was stomped by rbnode */ | ||
4357 | if (TCP_SKB_CB(skb)->has_rxtstamp) | ||
4358 | skb->tstamp = TCP_SKB_CB(skb)->swtstamp; | ||
4359 | 4314 | ||
4360 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { | 4315 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
4361 | SOCK_DEBUG(sk, "ofo packet was already received\n"); | 4316 | SOCK_DEBUG(sk, "ofo packet was already received\n"); |
@@ -4367,8 +4322,7 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4367 | TCP_SKB_CB(skb)->end_seq); | 4322 | TCP_SKB_CB(skb)->end_seq); |
4368 | 4323 | ||
4369 | tail = skb_peek_tail(&sk->sk_receive_queue); | 4324 | tail = skb_peek_tail(&sk->sk_receive_queue); |
4370 | eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, | 4325 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
4371 | tail, skb, &fragstolen); | ||
4372 | tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); | 4326 | tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); |
4373 | fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; | 4327 | fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; |
4374 | if (!eaten) | 4328 | if (!eaten) |
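`tcp_ofo_queue()`, now using the `skb_rb_first()`/`rb_to_skb()` helpers, still does the same job: once `rcv_nxt` advances, every queued segment that starts at or before `rcv_nxt` is delivered (or dropped if it is entirely old) and `rcv_nxt` jumps to its end. A simplified sketch with a sorted array in place of the rbtree, plain comparisons instead of `before()`/`after()`, and invented sequence numbers:

```c
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };

int main(void)
{
        struct seg ofo[] = { { 2000, 2500 }, { 2500, 3000 }, { 4000, 4500 } };
        size_t n = sizeof(ofo) / sizeof(ofo[0]), i = 0;
        uint32_t rcv_nxt = 2000;        /* the missing segment just arrived */

        while (i < n && ofo[i].seq <= rcv_nxt) {
                /* entirely old segments are simply dropped */
                if (ofo[i].end_seq > rcv_nxt)
                        rcv_nxt = ofo[i].end_seq;   /* delivered in order */
                i++;
        }
        printf("rcv_nxt = %u, %zu segment(s) still out of order\n",
               rcv_nxt, n - i);         /* rcv_nxt = 3000, 1 left */
        return 0;
}
```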
@@ -4409,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, | |||
4409 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | 4363 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) |
4410 | { | 4364 | { |
4411 | struct tcp_sock *tp = tcp_sk(sk); | 4365 | struct tcp_sock *tp = tcp_sk(sk); |
4412 | struct rb_node **p, *q, *parent; | 4366 | struct rb_node **p, *parent; |
4413 | struct sk_buff *skb1; | 4367 | struct sk_buff *skb1; |
4414 | u32 seq, end_seq; | 4368 | u32 seq, end_seq; |
4415 | bool fragstolen; | 4369 | bool fragstolen; |
@@ -4422,10 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4422 | return; | 4376 | return; |
4423 | } | 4377 | } |
4424 | 4378 | ||
4425 | /* Stash tstamp to avoid being stomped on by rbnode */ | ||
4426 | if (TCP_SKB_CB(skb)->has_rxtstamp) | ||
4427 | TCP_SKB_CB(skb)->swtstamp = skb->tstamp; | ||
4428 | |||
4429 | /* Disable header prediction. */ | 4379 | /* Disable header prediction. */ |
4430 | tp->pred_flags = 0; | 4380 | tp->pred_flags = 0; |
4431 | inet_csk_schedule_ack(sk); | 4381 | inet_csk_schedule_ack(sk); |
@@ -4453,7 +4403,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | |||
4453 | /* In the typical case, we are adding an skb to the end of the list. | 4403 | /* In the typical case, we are adding an skb to the end of the list. |
4454 | * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. | 4404 | * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. |
4455 | */ | 4405 | */ |
4456 | if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, | 4406 | if (tcp_try_coalesce(sk, tp->ooo_last_skb, |
4457 | skb, &fragstolen)) { | 4407 | skb, &fragstolen)) { |
4458 | coalesce_done: | 4408 | coalesce_done: |
4459 | tcp_grow_window(sk, skb); | 4409 | tcp_grow_window(sk, skb); |
@@ -4472,7 +4422,7 @@ coalesce_done: | |||
4472 | parent = NULL; | 4422 | parent = NULL; |
4473 | while (*p) { | 4423 | while (*p) { |
4474 | parent = *p; | 4424 | parent = *p; |
4475 | skb1 = rb_entry(parent, struct sk_buff, rbnode); | 4425 | skb1 = rb_to_skb(parent); |
4476 | if (before(seq, TCP_SKB_CB(skb1)->seq)) { | 4426 | if (before(seq, TCP_SKB_CB(skb1)->seq)) { |
4477 | p = &parent->rb_left; | 4427 | p = &parent->rb_left; |
4478 | continue; | 4428 | continue; |
@@ -4504,7 +4454,7 @@ coalesce_done: | |||
4504 | __kfree_skb(skb1); | 4454 | __kfree_skb(skb1); |
4505 | goto merge_right; | 4455 | goto merge_right; |
4506 | } | 4456 | } |
4507 | } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, | 4457 | } else if (tcp_try_coalesce(sk, skb1, |
4508 | skb, &fragstolen)) { | 4458 | skb, &fragstolen)) { |
4509 | goto coalesce_done; | 4459 | goto coalesce_done; |
4510 | } | 4460 | } |
@@ -4517,9 +4467,7 @@ insert: | |||
4517 | 4467 | ||
4518 | merge_right: | 4468 | merge_right: |
4519 | /* Remove other segments covered by skb. */ | 4469 | /* Remove other segments covered by skb. */ |
4520 | while ((q = rb_next(&skb->rbnode)) != NULL) { | 4470 | while ((skb1 = skb_rb_next(skb)) != NULL) { |
4521 | skb1 = rb_entry(q, struct sk_buff, rbnode); | ||
4522 | |||
4523 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) | 4471 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) |
4524 | break; | 4472 | break; |
4525 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | 4473 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { |
@@ -4534,7 +4482,7 @@ merge_right: | |||
4534 | tcp_drop(sk, skb1); | 4482 | tcp_drop(sk, skb1); |
4535 | } | 4483 | } |
4536 | /* If there is no skb after us, we are the last_skb ! */ | 4484 | /* If there is no skb after us, we are the last_skb ! */ |
4537 | if (!q) | 4485 | if (!skb1) |
4538 | tp->ooo_last_skb = skb; | 4486 | tp->ooo_last_skb = skb; |
4539 | 4487 | ||
4540 | add_sack: | 4488 | add_sack: |
@@ -4556,7 +4504,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int | |||
4556 | 4504 | ||
4557 | __skb_pull(skb, hdrlen); | 4505 | __skb_pull(skb, hdrlen); |
4558 | eaten = (tail && | 4506 | eaten = (tail && |
4559 | tcp_try_coalesce(sk, RCV_QUEUE, tail, | 4507 | tcp_try_coalesce(sk, tail, |
4560 | skb, fragstolen)) ? 1 : 0; | 4508 | skb, fragstolen)) ? 1 : 0; |
4561 | tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); | 4509 | tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); |
4562 | if (!eaten) { | 4510 | if (!eaten) { |
@@ -4720,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li | |||
4720 | if (list) | 4668 | if (list) |
4721 | return !skb_queue_is_last(list, skb) ? skb->next : NULL; | 4669 | return !skb_queue_is_last(list, skb) ? skb->next : NULL; |
4722 | 4670 | ||
4723 | return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); | 4671 | return skb_rb_next(skb); |
4724 | } | 4672 | } |
4725 | 4673 | ||
4726 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, | 4674 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, |
@@ -4741,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, | |||
4741 | } | 4689 | } |
4742 | 4690 | ||
4743 | /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ | 4691 | /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ |
4744 | static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) | 4692 | void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) |
4745 | { | 4693 | { |
4746 | struct rb_node **p = &root->rb_node; | 4694 | struct rb_node **p = &root->rb_node; |
4747 | struct rb_node *parent = NULL; | 4695 | struct rb_node *parent = NULL; |
@@ -4749,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) | |||
4749 | 4697 | ||
4750 | while (*p) { | 4698 | while (*p) { |
4751 | parent = *p; | 4699 | parent = *p; |
4752 | skb1 = rb_entry(parent, struct sk_buff, rbnode); | 4700 | skb1 = rb_to_skb(parent); |
4753 | if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) | 4701 | if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) |
4754 | p = &parent->rb_left; | 4702 | p = &parent->rb_left; |
4755 | else | 4703 | else |
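`tcp_rbtree_insert()`, no longer static after this hunk, orders skbs by `TCP_SKB_CB(skb)->seq` with the usual parent/link descent. The sketch below reproduces that descent on a plain unbalanced binary search tree (no rebalancing, no `struct sk_buff`), just to make the walk concrete; all names are local to the example.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct seg {
        uint32_t seq;
        struct seg *left, *right;
};

/* Descend left when the new segment's seq is lower, right otherwise,
 * then hook the node into the empty link we ended on. */
static void seg_insert(struct seg **root, struct seg *new)
{
        struct seg **p = root;

        while (*p) {
                if (new->seq < (*p)->seq)       /* before() in the kernel */
                        p = &(*p)->left;
                else
                        p = &(*p)->right;
        }
        new->left = new->right = NULL;
        *p = new;
}

static void seg_walk(const struct seg *s)
{
        if (!s)
                return;
        seg_walk(s->left);
        printf("%u ", s->seq);
        seg_walk(s->right);
}

int main(void)
{
        uint32_t seqs[] = { 3000, 1000, 2000, 4000 };
        struct seg *root = NULL;

        for (size_t i = 0; i < sizeof(seqs) / sizeof(seqs[0]); i++) {
                struct seg *s = malloc(sizeof(*s));

                s->seq = seqs[i];
                seg_insert(&root, s);
        }
        seg_walk(root);                 /* prints segments in seq order */
        printf("\n");
        return 0;
}
```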
@@ -4796,7 +4744,7 @@ restart: | |||
4796 | * overlaps to the next one. | 4744 | * overlaps to the next one. |
4797 | */ | 4745 | */ |
4798 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && | 4746 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
4799 | (tcp_win_from_space(skb->truesize) > skb->len || | 4747 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
4800 | before(TCP_SKB_CB(skb)->seq, start))) { | 4748 | before(TCP_SKB_CB(skb)->seq, start))) { |
4801 | end_of_skbs = false; | 4749 | end_of_skbs = false; |
4802 | break; | 4750 | break; |
@@ -4868,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) | |||
4868 | { | 4816 | { |
4869 | struct tcp_sock *tp = tcp_sk(sk); | 4817 | struct tcp_sock *tp = tcp_sk(sk); |
4870 | struct sk_buff *skb, *head; | 4818 | struct sk_buff *skb, *head; |
4871 | struct rb_node *p; | ||
4872 | u32 start, end; | 4819 | u32 start, end; |
4873 | 4820 | ||
4874 | p = rb_first(&tp->out_of_order_queue); | 4821 | skb = skb_rb_first(&tp->out_of_order_queue); |
4875 | skb = rb_entry_safe(p, struct sk_buff, rbnode); | ||
4876 | new_range: | 4822 | new_range: |
4877 | if (!skb) { | 4823 | if (!skb) { |
4878 | p = rb_last(&tp->out_of_order_queue); | 4824 | tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); |
4879 | /* Note: This is possible p is NULL here. We do not | ||
4880 | * use rb_entry_safe(), as ooo_last_skb is valid only | ||
4881 | * if rbtree is not empty. | ||
4882 | */ | ||
4883 | tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); | ||
4884 | return; | 4825 | return; |
4885 | } | 4826 | } |
4886 | start = TCP_SKB_CB(skb)->seq; | 4827 | start = TCP_SKB_CB(skb)->seq; |
4887 | end = TCP_SKB_CB(skb)->end_seq; | 4828 | end = TCP_SKB_CB(skb)->end_seq; |
4888 | 4829 | ||
4889 | for (head = skb;;) { | 4830 | for (head = skb;;) { |
4890 | skb = tcp_skb_next(skb, NULL); | 4831 | skb = skb_rb_next(skb); |
4891 | 4832 | ||
4892 | /* Range is terminated when we see a gap or when | 4833 | /* Range is terminated when we see a gap or when |
4893 | * we are at the queue end. | 4834 | * we are at the queue end. |
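As the hunk above shows, `tcp_collapse_ofo_queue()` walks the tree in sequence order and cuts a range wherever the next segment starts beyond the current end, i.e. at a gap. A compact standalone illustration of that range detection, with a sorted array instead of `skb_rb_next()` and plain comparisons instead of `after()`:

```c
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };

int main(void)
{
        struct seg q[] = {
                { 1000, 1500 }, { 1500, 2000 },   /* contiguous range */
                { 3000, 3500 },                    /* gap before this one */
        };
        size_t n = sizeof(q) / sizeof(q[0]);
        uint32_t start = q[0].seq, end = q[0].end_seq;

        for (size_t i = 1; i <= n; i++) {
                if (i == n || q[i].seq > end) {   /* gap or queue end */
                        printf("range [%u, %u) could be collapsed\n",
                               start, end);
                        if (i == n)
                                break;
                        start = q[i].seq;
                        end = q[i].end_seq;
                        continue;
                }
                if (q[i].end_seq > end)
                        end = q[i].end_seq;
        }
        return 0;
}
```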
@@ -4930,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) | |||
4930 | do { | 4871 | do { |
4931 | prev = rb_prev(node); | 4872 | prev = rb_prev(node); |
4932 | rb_erase(node, &tp->out_of_order_queue); | 4873 | rb_erase(node, &tp->out_of_order_queue); |
4933 | tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); | 4874 | tcp_drop(sk, rb_to_skb(node)); |
4934 | sk_mem_reclaim(sk); | 4875 | sk_mem_reclaim(sk); |
4935 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && | 4876 | if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && |
4936 | !tcp_under_memory_pressure(sk)) | 4877 | !tcp_under_memory_pressure(sk)) |
4937 | break; | 4878 | break; |
4938 | node = prev; | 4879 | node = prev; |
4939 | } while (node); | 4880 | } while (node); |
4940 | tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); | 4881 | tp->ooo_last_skb = rb_to_skb(prev); |
4941 | 4882 | ||
4942 | /* Reset SACK state. A conforming SACK implementation will | 4883 | /* Reset SACK state. A conforming SACK implementation will |
4943 | * do the same at a timeout based retransmit. When a connection | 4884 | * do the same at a timeout based retransmit. When a connection |
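`tcp_prune_ofo_queue()` (changed here only to use the `rb_to_skb()` helper) frees out-of-order segments from the highest sequence backwards until receive memory fits again. A rough sketch of that backwards pruning, with invented sizes and a plain array standing in for the `rb_prev()` walk:

```c
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq; unsigned int truesize; };

int main(void)
{
        struct seg ofo[] = { { 1000, 2048 }, { 2000, 2048 }, { 3000, 2048 } };
        int last = 2;                           /* index of highest seq */
        unsigned int rmem_alloc = 6144, rcvbuf = 3000;

        /* Drop the segments least likely to be needed soon (highest
         * sequence first) until usage is back under the budget. */
        while (last >= 0 && rmem_alloc > rcvbuf) {
                rmem_alloc -= ofo[last].truesize;
                last--;
        }
        printf("kept %d segment(s), rmem_alloc=%u\n", last + 1, rmem_alloc);
        return 0;
}
```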
@@ -5112,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) | |||
5112 | struct tcp_sock *tp = tcp_sk(sk); | 5053 | struct tcp_sock *tp = tcp_sk(sk); |
5113 | u32 ptr = ntohs(th->urg_ptr); | 5054 | u32 ptr = ntohs(th->urg_ptr); |
5114 | 5055 | ||
5115 | if (ptr && !sysctl_tcp_stdurg) | 5056 | if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
5116 | ptr--; | 5057 | ptr--; |
5117 | ptr += ntohl(th->seq); | 5058 | ptr += ntohl(th->seq); |
5118 | 5059 | ||
@@ -5532,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | |||
5532 | security_inet_conn_established(sk, skb); | 5473 | security_inet_conn_established(sk, skb); |
5533 | } | 5474 | } |
5534 | 5475 | ||
5535 | /* Make sure socket is routed, for correct metrics. */ | 5476 | tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); |
5536 | icsk->icsk_af_ops->rebuild_header(sk); | ||
5537 | |||
5538 | tcp_init_metrics(sk); | ||
5539 | tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); | ||
5540 | tcp_init_congestion_control(sk); | ||
5541 | 5477 | ||
5542 | /* Prevent spurious tcp_cwnd_restart() on first data | 5478 | /* Prevent spurious tcp_cwnd_restart() on first data |
5543 | * packet. | 5479 | * packet. |
5544 | */ | 5480 | */ |
5545 | tp->lsndtime = tcp_jiffies32; | 5481 | tp->lsndtime = tcp_jiffies32; |
5546 | 5482 | ||
5547 | tcp_init_buffer_space(sk); | ||
5548 | |||
5549 | if (sock_flag(sk, SOCK_KEEPOPEN)) | 5483 | if (sock_flag(sk, SOCK_KEEPOPEN)) |
5550 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); | 5484 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); |
5551 | 5485 | ||
@@ -5559,7 +5493,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5559 | struct tcp_fastopen_cookie *cookie) | 5493 | struct tcp_fastopen_cookie *cookie) |
5560 | { | 5494 | { |
5561 | struct tcp_sock *tp = tcp_sk(sk); | 5495 | struct tcp_sock *tp = tcp_sk(sk); |
5562 | struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; | 5496 | struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; |
5563 | u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; | 5497 | u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; |
5564 | bool syn_drop = false; | 5498 | bool syn_drop = false; |
5565 | 5499 | ||
@@ -5594,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5594 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); | 5528 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
5595 | 5529 | ||
5596 | if (data) { /* Retransmit unacked data in SYN */ | 5530 | if (data) { /* Retransmit unacked data in SYN */ |
5597 | tcp_for_write_queue_from(data, sk) { | 5531 | skb_rbtree_walk_from(data) { |
5598 | if (data == tcp_send_head(sk) || | 5532 | if (__tcp_retransmit_skb(sk, data, 1)) |
5599 | __tcp_retransmit_skb(sk, data, 1)) | ||
5600 | break; | 5533 | break; |
5601 | } | 5534 | } |
5602 | tcp_rearm_rto(sk); | 5535 | tcp_rearm_rto(sk); |
@@ -5614,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | |||
5614 | return false; | 5547 | return false; |
5615 | } | 5548 | } |
5616 | 5549 | ||
5550 | static void smc_check_reset_syn(struct tcp_sock *tp) | ||
5551 | { | ||
5552 | #if IS_ENABLED(CONFIG_SMC) | ||
5553 | if (static_branch_unlikely(&tcp_have_smc)) { | ||
5554 | if (tp->syn_smc && !tp->rx_opt.smc_ok) | ||
5555 | tp->syn_smc = 0; | ||
5556 | } | ||
5557 | #endif | ||
5558 | } | ||
5559 | |||
5617 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5560 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
5618 | const struct tcphdr *th) | 5561 | const struct tcphdr *th) |
5619 | { | 5562 | { |
@@ -5709,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5709 | tp->tcp_header_len = sizeof(struct tcphdr); | 5652 | tp->tcp_header_len = sizeof(struct tcphdr); |
5710 | } | 5653 | } |
5711 | 5654 | ||
5712 | if (tcp_is_sack(tp) && sysctl_tcp_fack) | ||
5713 | tcp_enable_fack(tp); | ||
5714 | |||
5715 | tcp_mtup_init(sk); | ||
5716 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 5655 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
5717 | tcp_initialize_rcv_mss(sk); | 5656 | tcp_initialize_rcv_mss(sk); |
5718 | 5657 | ||
@@ -5721,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5721 | * is initialized. */ | 5660 | * is initialized. */ |
5722 | tp->copied_seq = tp->rcv_nxt; | 5661 | tp->copied_seq = tp->rcv_nxt; |
5723 | 5662 | ||
5663 | smc_check_reset_syn(tp); | ||
5664 | |||
5724 | smp_mb(); | 5665 | smp_mb(); |
5725 | 5666 | ||
5726 | tcp_finish_connect(sk, skb); | 5667 | tcp_finish_connect(sk, skb); |
@@ -5938,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) | |||
5938 | if (req) { | 5879 | if (req) { |
5939 | inet_csk(sk)->icsk_retransmits = 0; | 5880 | inet_csk(sk)->icsk_retransmits = 0; |
5940 | reqsk_fastopen_remove(sk, req, false); | 5881 | reqsk_fastopen_remove(sk, req, false); |
5882 | /* Re-arm the timer because data may have been sent out. | ||
5883 | * This is similar to the regular data transmission case | ||
5884 | * when new data has just been ack'ed. | ||
5885 | * | ||
5886 | * (TFO) - we could try to be more aggressive and | ||
5887 | * retransmitting any data sooner based on when they | ||
5888 | * are sent out. | ||
5889 | */ | ||
5890 | tcp_rearm_rto(sk); | ||
5941 | } else { | 5891 | } else { |
5942 | /* Make sure socket is routed, for correct metrics. */ | 5892 | tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); |
5943 | icsk->icsk_af_ops->rebuild_header(sk); | ||
5944 | tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); | ||
5945 | tcp_init_congestion_control(sk); | ||
5946 | |||
5947 | tcp_mtup_init(sk); | ||
5948 | tp->copied_seq = tp->rcv_nxt; | 5893 | tp->copied_seq = tp->rcv_nxt; |
5949 | tcp_init_buffer_space(sk); | ||
5950 | } | 5894 | } |
5951 | smp_mb(); | 5895 | smp_mb(); |
5952 | tcp_set_state(sk, TCP_ESTABLISHED); | 5896 | tcp_set_state(sk, TCP_ESTABLISHED); |
@@ -5966,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) | |||
5966 | if (tp->rx_opt.tstamp_ok) | 5910 | if (tp->rx_opt.tstamp_ok) |
5967 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5911 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
5968 | 5912 | ||
5969 | if (req) { | ||
5970 | /* Re-arm the timer because data may have been sent out. | ||
5971 | * This is similar to the regular data transmission case | ||
5972 | * when new data has just been ack'ed. | ||
5973 | * | ||
5974 | * (TFO) - we could try to be more aggressive and | ||
5975 | * retransmitting any data sooner based on when they | ||
5976 | * are sent out. | ||
5977 | */ | ||
5978 | tcp_rearm_rto(sk); | ||
5979 | } else | ||
5980 | tcp_init_metrics(sk); | ||
5981 | |||
5982 | if (!inet_csk(sk)->icsk_ca_ops->cong_control) | 5913 | if (!inet_csk(sk)->icsk_ca_ops->cong_control) |
5983 | tcp_update_pacing_rate(sk); | 5914 | tcp_update_pacing_rate(sk); |
5984 | 5915 | ||
@@ -6075,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) | |||
6075 | case TCP_LAST_ACK: | 6006 | case TCP_LAST_ACK: |
6076 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) | 6007 | if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
6077 | break; | 6008 | break; |
6009 | /* fall through */ | ||
6078 | case TCP_FIN_WAIT1: | 6010 | case TCP_FIN_WAIT1: |
6079 | case TCP_FIN_WAIT2: | 6011 | case TCP_FIN_WAIT2: |
6080 | /* RFC 793 says to queue data in these states, | 6012 | /* RFC 793 says to queue data in these states, |
@@ -6183,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req, | |||
6183 | ireq->ir_rmt_port = tcp_hdr(skb)->source; | 6115 | ireq->ir_rmt_port = tcp_hdr(skb)->source; |
6184 | ireq->ir_num = ntohs(tcp_hdr(skb)->dest); | 6116 | ireq->ir_num = ntohs(tcp_hdr(skb)->dest); |
6185 | ireq->ir_mark = inet_request_mark(sk, skb); | 6117 | ireq->ir_mark = inet_request_mark(sk, skb); |
6118 | #if IS_ENABLED(CONFIG_SMC) | ||
6119 | ireq->smc_ok = rx_opt->smc_ok; | ||
6120 | #endif | ||
6186 | } | 6121 | } |
6187 | 6122 | ||
6188 | struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, | 6123 | struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, |
@@ -6358,7 +6293,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, | |||
6358 | tcp_openreq_init_rwin(req, sk, dst); | 6293 | tcp_openreq_init_rwin(req, sk, dst); |
6359 | if (!want_cookie) { | 6294 | if (!want_cookie) { |
6360 | tcp_reqsk_record_syn(sk, req, skb); | 6295 | tcp_reqsk_record_syn(sk, req, skb); |
6361 | fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); | 6296 | fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); |
6362 | } | 6297 | } |
6363 | if (fastopen_sk) { | 6298 | if (fastopen_sk) { |
6364 | af_ops->send_synack(fastopen_sk, dst, &fl, req, | 6299 | af_ops->send_synack(fastopen_sk, dst, &fl, req, |