author     Linus Torvalds <torvalds@linux-foundation.org>   2017-11-15 14:56:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-11-15 14:56:19 -0500
commit     5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch)
tree       3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /net/ipv4/tcp_input.c
parent     892204e06cb9e89fbc4b299a678f9ca358e97cac (diff)
parent     50895b9de1d3e0258e015e8e55128d835d9a9f19 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Highlights:

  1) Maintain the TCP retransmit queue using an rbtree, with 1GB windows
     at 100Gb this really has become necessary. From Eric Dumazet.

  2) Multi-program support for cgroup+bpf, from Alexei Starovoitov.

  3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew Lunn.

  4) Add meter action support to openvswitch, from Andy Zhou.

  5) Add a data meta pointer for BPF accessible packets, from Daniel Borkmann.

  6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet.

  7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli.

  8) More work to move the RTNL mutex down, from Florian Westphal.

  9) Add 'bpftool' utility, to help with bpf program introspection. From
     Jakub Kicinski.

 10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper Dangaard
     Brouer.

 11) Support 'blocks' of transformations in the packet scheduler which can
     span multiple network devices, from Jiri Pirko.

 12) TC flower offload support in cxgb4, from Kumar Sanghvi.

 13) Priority based stream scheduler for SCTP, from Marcelo Ricardo Leitner.

 14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg.

 15) Add RED qdisc offloadability, and use it in mlxsw driver. From
     Nogah Frankel.

 16) eBPF based device controller for cgroup v2, from Roman Gushchin.

 17) Add some fundamental tracepoints for TCP, from Song Liu.

 18) Remove garbage collection from ipv6 route layer, this is a
     significant accomplishment. From Wei Wang.

 19) Add multicast route offload support to mlxsw, from Yotam Gigi"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits)
  tcp: highest_sack fix
  geneve: fix fill_info when link down
  bpf: fix lockdep splat
  net: cdc_ncm: GetNtbFormat endian fix
  openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start
  netem: remove unnecessary 64 bit modulus
  netem: use 64 bit divide by rate
  tcp: Namespace-ify sysctl_tcp_default_congestion_control
  net: Protect iterations over net::fib_notifier_ops in fib_seq_sum()
  ipv6: set all.accept_dad to 0 by default
  uapi: fix linux/tls.h userspace compilation error
  usbnet: ipheth: prevent TX queue timeouts when device not ready
  vhost_net: conditionally enable tx polling
  uapi: fix linux/rxrpc.h userspace compilation errors
  net: stmmac: fix LPI transitioning for dwmac4
  atm: horizon: Fix irq release error
  net-sysfs: trigger netlink notification on ifalias change via sysfs
  openvswitch: Using kfree_rcu() to simplify the code
  openvswitch: Make local function ovs_nsh_key_attr_size() static
  openvswitch: Fix return value check in ovs_meter_cmd_features()
  ...
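
Two of these highlights account for most of the churn in net/ipv4/tcp_input.c below: the retransmit queue becoming an rbtree (tcp_write_queue_* walkers are replaced by skb_rbtree_walk*/tcp_rtx_queue_* helpers) and the namespace-ification of the TCP sysctls (file-scope sysctl_tcp_* globals become fields reached through sock_net(sk)->ipv4). What follows is a standalone userspace sketch of the second pattern only; struct sock, struct net and sock_net() are mocked with invented layouts, and the values are the usual defaults rather than anything read from a kernel.

#include <stdio.h>

/* Mock-ups of the kernel objects involved; the field names mirror the
 * patch, the layouts are invented for this illustration.
 */
struct netns_ipv4 {
	int sysctl_tcp_app_win;
	int sysctl_tcp_rmem[3];
};

struct net {
	struct netns_ipv4 ipv4;
};

struct sock {
	struct net *sk_net;
};

/* The kernel's sock_net(sk) boils down to "which namespace owns this
 * socket"; a former global knob is now looked up through it.
 */
static struct net *sock_net(const struct sock *sk)
{
	return sk->sk_net;
}

int main(void)
{
	struct net init_net = {
		.ipv4 = {
			.sysctl_tcp_app_win = 31,
			.sysctl_tcp_rmem = { 4096, 87380, 6291456 },
		},
	};
	struct sock sk = { .sk_net = &init_net };

	/* Before: sysctl_tcp_app_win, sysctl_tcp_rmem[2] (globals).
	 * After:  resolved per network namespace, as in the hunks below.
	 */
	printf("tcp_app_win=%d tcp_rmem[2]=%d\n",
	       sock_net(&sk)->ipv4.sysctl_tcp_app_win,
	       sock_net(&sk)->ipv4.sysctl_tcp_rmem[2]);
	return 0;
}
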
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  597
1 file changed, 266 insertions(+), 331 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 887585045b27..dabbf1d392fb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -76,25 +76,10 @@
76#include <linux/ipsec.h> 76#include <linux/ipsec.h>
77#include <asm/unaligned.h> 77#include <asm/unaligned.h>
78#include <linux/errqueue.h> 78#include <linux/errqueue.h>
79#include <trace/events/tcp.h>
80#include <linux/static_key.h>
79 81
80int sysctl_tcp_fack __read_mostly;
81int sysctl_tcp_max_reordering __read_mostly = 300;
82int sysctl_tcp_dsack __read_mostly = 1;
83int sysctl_tcp_app_win __read_mostly = 31;
84int sysctl_tcp_adv_win_scale __read_mostly = 1;
85EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
86
87/* rfc5961 challenge ack rate limiting */
88int sysctl_tcp_challenge_ack_limit = 1000;
89
90int sysctl_tcp_stdurg __read_mostly;
91int sysctl_tcp_rfc1337 __read_mostly;
92int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 82int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
93int sysctl_tcp_frto __read_mostly = 2;
94int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
95int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
96int sysctl_tcp_early_retrans __read_mostly = 3;
97int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
98 83
99#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 84#define FLAG_DATA 0x01 /* Incoming frame contained data. */
100#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 85#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -335,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
335 sndmem *= nr_segs * per_mss; 320 sndmem *= nr_segs * per_mss;
336 321
337 if (sk->sk_sndbuf < sndmem) 322 if (sk->sk_sndbuf < sndmem)
338 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); 323 sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
339} 324}
340 325
341/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 326/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -368,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
368{ 353{
369 struct tcp_sock *tp = tcp_sk(sk); 354 struct tcp_sock *tp = tcp_sk(sk);
370 /* Optimize this! */ 355 /* Optimize this! */
371 int truesize = tcp_win_from_space(skb->truesize) >> 1; 356 int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
372 int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; 357 int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
373 358
374 while (tp->rcv_ssthresh <= window) { 359 while (tp->rcv_ssthresh <= window) {
375 if (truesize <= skb->len) 360 if (truesize <= skb->len)
@@ -394,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
394 /* Check #2. Increase window, if skb with such overhead 379 /* Check #2. Increase window, if skb with such overhead
395 * will fit to rcvbuf in future. 380 * will fit to rcvbuf in future.
396 */ 381 */
397 if (tcp_win_from_space(skb->truesize) <= skb->len) 382 if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
398 incr = 2 * tp->advmss; 383 incr = 2 * tp->advmss;
399 else 384 else
400 incr = __tcp_grow_window(sk, skb); 385 incr = __tcp_grow_window(sk, skb);
@@ -420,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
420 /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency 405 /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
421 * Allow enough cushion so that sender is not limited by our window 406 * Allow enough cushion so that sender is not limited by our window
422 */ 407 */
423 if (sysctl_tcp_moderate_rcvbuf) 408 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
424 rcvmem <<= 2; 409 rcvmem <<= 2;
425 410
426 if (sk->sk_rcvbuf < rcvmem) 411 if (sk->sk_rcvbuf < rcvmem)
427 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); 412 sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
428} 413}
429 414
430/* 4. Try to fixup all. It is made immediately after connection enters 415/* 4. Try to fixup all. It is made immediately after connection enters
@@ -432,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
432 */ 417 */
433void tcp_init_buffer_space(struct sock *sk) 418void tcp_init_buffer_space(struct sock *sk)
434{ 419{
420 int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
435 struct tcp_sock *tp = tcp_sk(sk); 421 struct tcp_sock *tp = tcp_sk(sk);
436 int maxwin; 422 int maxwin;
437 423
@@ -450,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk)
450 if (tp->window_clamp >= maxwin) { 436 if (tp->window_clamp >= maxwin) {
451 tp->window_clamp = maxwin; 437 tp->window_clamp = maxwin;
452 438
453 if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) 439 if (tcp_app_win && maxwin > 4 * tp->advmss)
454 tp->window_clamp = max(maxwin - 440 tp->window_clamp = max(maxwin -
455 (maxwin >> sysctl_tcp_app_win), 441 (maxwin >> tcp_app_win),
456 4 * tp->advmss); 442 4 * tp->advmss);
457 } 443 }
458 444
459 /* Force reservation of one segment. */ 445 /* Force reservation of one segment. */
460 if (sysctl_tcp_app_win && 446 if (tcp_app_win &&
461 tp->window_clamp > 2 * tp->advmss && 447 tp->window_clamp > 2 * tp->advmss &&
462 tp->window_clamp + tp->advmss > maxwin) 448 tp->window_clamp + tp->advmss > maxwin)
463 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); 449 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
@@ -471,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
471{ 457{
472 struct tcp_sock *tp = tcp_sk(sk); 458 struct tcp_sock *tp = tcp_sk(sk);
473 struct inet_connection_sock *icsk = inet_csk(sk); 459 struct inet_connection_sock *icsk = inet_csk(sk);
460 struct net *net = sock_net(sk);
474 461
475 icsk->icsk_ack.quick = 0; 462 icsk->icsk_ack.quick = 0;
476 463
477 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 464 if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
478 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 465 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
479 !tcp_under_memory_pressure(sk) && 466 !tcp_under_memory_pressure(sk) &&
480 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { 467 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
481 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 468 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
482 sysctl_tcp_rmem[2]); 469 net->ipv4.sysctl_tcp_rmem[2]);
483 } 470 }
484 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) 471 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
485 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); 472 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -610,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
610 * <prev RTT . ><current RTT .. ><next RTT .... > 597 * <prev RTT . ><current RTT .. ><next RTT .... >
611 */ 598 */
612 599
613 if (sysctl_tcp_moderate_rcvbuf && 600 if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
614 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 601 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
615 int rcvwin, rcvmem, rcvbuf; 602 int rcvwin, rcvmem, rcvbuf;
616 603
@@ -634,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk)
634 } 621 }
635 622
636 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 623 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
637 while (tcp_win_from_space(rcvmem) < tp->advmss) 624 while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
638 rcvmem += 128; 625 rcvmem += 128;
639 626
640 rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); 627 rcvbuf = min(rcvwin / tp->advmss * rcvmem,
628 sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
641 if (rcvbuf > sk->sk_rcvbuf) { 629 if (rcvbuf > sk->sk_rcvbuf) {
642 sk->sk_rcvbuf = rcvbuf; 630 sk->sk_rcvbuf = rcvbuf;
643 631
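
For reference, a toy model (not from the patch) of the rcvbuf computation in the hunk above: rcvmem is grown until one advmss of payload fits in the usable share of the per-segment truesize, then scaled by the number of segments in the wanted window and clamped at tcp_rmem[2]. tcp_win_from_space() is approximated with its tcp_adv_win_scale = 1 branch, and every constant below is made up.

#include <stdio.h>

/* Approximation of tcp_win_from_space() for tcp_adv_win_scale = 1:
 * half of the allocated space is usable window, the rest is overhead.
 */
static int win_from_space(int space, int adv_win_scale)
{
	return space - (space >> adv_win_scale);
}

int main(void)
{
	int advmss = 1460;		/* sender MSS (invented)            */
	int rcvwin = 10 * 65535;	/* window DRS wants to offer        */
	int rcvmem = 2304;		/* assumed skb truesize per segment */
	int tcp_rmem2 = 6291456;	/* tcp_rmem[2] default              */
	long rcvbuf;

	/* Grow the per-segment budget until one MSS of payload fits. */
	while (win_from_space(rcvmem, 1) < advmss)
		rcvmem += 128;

	rcvbuf = (long)(rcvwin / advmss) * rcvmem;
	if (rcvbuf > tcp_rmem2)
		rcvbuf = tcp_rmem2;

	printf("rcvmem per segment %d -> rcvbuf %ld\n", rcvmem, rcvbuf);
	return 0;
}
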
@@ -781,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
781 tp->srtt_us = max(1U, srtt); 769 tp->srtt_us = max(1U, srtt);
782} 770}
783 771
784/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
785 * Note: TCP stack does not yet implement pacing.
786 * FQ packet scheduler can be used to implement cheap but effective
787 * TCP pacing, to smooth the burst on large writes when packets
788 * in flight is significantly lower than cwnd (or rwin)
789 */
790int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
791int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
792
793static void tcp_update_pacing_rate(struct sock *sk) 772static void tcp_update_pacing_rate(struct sock *sk)
794{ 773{
795 const struct tcp_sock *tp = tcp_sk(sk); 774 const struct tcp_sock *tp = tcp_sk(sk);
@@ -807,9 +786,9 @@ static void tcp_update_pacing_rate(struct sock *sk)
807 * end of slow start and should slow down. 786 * end of slow start and should slow down.
808 */ 787 */
809 if (tp->snd_cwnd < tp->snd_ssthresh / 2) 788 if (tp->snd_cwnd < tp->snd_ssthresh / 2)
810 rate *= sysctl_tcp_pacing_ss_ratio; 789 rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
811 else 790 else
812 rate *= sysctl_tcp_pacing_ca_ratio; 791 rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
813 792
814 rate *= max(tp->snd_cwnd, tp->packets_out); 793 rate *= max(tp->snd_cwnd, tp->packets_out);
815 794
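
The ratio selection above feeds the pacing-rate formula rate = ratio% * mss * cwnd / srtt. A small arithmetic illustration, simplified from the kernel code (the kernel keeps srtt left-shifted by 3 and uses do_div(); the sample values are invented):

#include <stdio.h>
#include <stdint.h>

/* Pick the slow-start ratio while cwnd is still below half of ssthresh
 * and the congestion-avoidance ratio afterwards (defaults 200 and 120,
 * now read from the netns as in the hunk above).
 */
int main(void)
{
	uint64_t mss = 1448;		/* bytes              */
	uint64_t cwnd = 100;		/* packets            */
	uint64_t ssthresh = 400;	/* packets            */
	uint64_t srtt_us = 20000;	/* 20 ms smoothed RTT */
	uint64_t ratio = (cwnd < ssthresh / 2) ? 200 : 120;
	uint64_t rate;

	/* mss * USEC_PER_SEC / 100; the kernel's extra <<3 cancels
	 * against srtt_us being stored <<3.
	 */
	rate = mss * (1000000ULL / 100);
	rate *= ratio;
	rate *= cwnd;
	rate /= srtt_us;

	printf("pacing rate ~ %llu bytes/sec (%.1f MB/s)\n",
	       (unsigned long long)rate, rate / 1e6);
	return 0;
}
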
@@ -863,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
863 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 842 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
864} 843}
865 844
866/*
867 * Packet counting of FACK is based on in-order assumptions, therefore TCP
868 * disables it when reordering is detected
869 */
870void tcp_disable_fack(struct tcp_sock *tp)
871{
872 /* RFC3517 uses different metric in lost marker => reset on change */
873 if (tcp_is_fack(tp))
874 tp->lost_skb_hint = NULL;
875 tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
876}
877
878/* Take a notice that peer is sending D-SACKs */ 845/* Take a notice that peer is sending D-SACKs */
879static void tcp_dsack_seen(struct tcp_sock *tp) 846static void tcp_dsack_seen(struct tcp_sock *tp)
880{ 847{
881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 848 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
849 tp->rack.dsack_seen = 1;
882} 850}
883 851
884static void tcp_update_reordering(struct sock *sk, const int metric, 852/* It's reordering when higher sequence was delivered (i.e. sacked) before
885 const int ts) 853 * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
854 * distance is approximated in full-mss packet distance ("reordering").
855 */
856static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
857 const int ts)
886{ 858{
887 struct tcp_sock *tp = tcp_sk(sk); 859 struct tcp_sock *tp = tcp_sk(sk);
888 int mib_idx; 860 const u32 mss = tp->mss_cache;
861 u32 fack, metric;
889 862
890 if (WARN_ON_ONCE(metric < 0)) 863 fack = tcp_highest_sack_seq(tp);
864 if (!before(low_seq, fack))
891 return; 865 return;
892 866
893 if (metric > tp->reordering) { 867 metric = fack - low_seq;
894 tp->reordering = min(sysctl_tcp_max_reordering, metric); 868 if ((metric > tp->reordering * mss) && mss) {
895
896#if FASTRETRANS_DEBUG > 1 869#if FASTRETRANS_DEBUG > 1
897 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 870 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
898 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 871 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
899 tp->reordering, 872 tp->reordering,
900 tp->fackets_out, 873 0,
901 tp->sacked_out, 874 tp->sacked_out,
902 tp->undo_marker ? tp->undo_retrans : 0); 875 tp->undo_marker ? tp->undo_retrans : 0);
903#endif 876#endif
904 tcp_disable_fack(tp); 877 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
878 sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
905 } 879 }
906 880
907 tp->rack.reord = 1; 881 tp->rack.reord = 1;
908
909 /* This exciting event is worth to be remembered. 8) */ 882 /* This exciting event is worth to be remembered. 8) */
910 if (ts) 883 NET_INC_STATS(sock_net(sk),
911 mib_idx = LINUX_MIB_TCPTSREORDER; 884 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
912 else if (tcp_is_reno(tp))
913 mib_idx = LINUX_MIB_TCPRENOREORDER;
914 else if (tcp_is_fack(tp))
915 mib_idx = LINUX_MIB_TCPFACKREORDER;
916 else
917 mib_idx = LINUX_MIB_TCPSACKREORDER;
918
919 NET_INC_STATS(sock_net(sk), mib_idx);
920} 885}
921 886
922/* This must be called before lost_out is incremented */ 887/* This must be called before lost_out is incremented */
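
tcp_check_sack_reordering() above measures reordering as a sequence-space distance instead of the old fackets_out packet count: if a never-retransmitted sequence low_seq is still outstanding while a higher sequence has already been SACKed, the gap in MSS units bounds the reordering degree. A minimal standalone rendering of that arithmetic (names and sample values are illustrative; before() is the usual wrap-safe compare):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;

/* Wrap-safe TCP sequence comparison, as in include/net/tcp.h. */
static int before(u32 seq1, u32 seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

/* Distance between the lowest never-retransmitted hole and the highest
 * SACKed sequence, rounded up to packets and clamped by
 * sysctl_tcp_max_reordering.
 */
static u32 sack_reordering(u32 low_seq, u32 fack, u32 mss,
			   u32 cur_reordering, u32 max_reordering)
{
	u32 metric;

	if (!before(low_seq, fack) || !mss)
		return cur_reordering;

	metric = fack - low_seq;
	if (metric > cur_reordering * mss) {
		u32 r = (metric + mss - 1) / mss;	/* round up */

		return r < max_reordering ? r : max_reordering;
	}
	return cur_reordering;
}

int main(void)
{
	/* A never-retransmitted segment sits 7 MSS below the highest
	 * SACKed sequence: the estimate grows from 3 to 7.
	 */
	printf("reordering -> %u\n",
	       sack_reordering(1000, 1000 + 7 * 1448, 1448, 3, 300));
	return 0;
}
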
@@ -990,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
990 * 3. Loss detection event of two flavors: 955 * 3. Loss detection event of two flavors:
991 * A. Scoreboard estimator decided the packet is lost. 956 * A. Scoreboard estimator decided the packet is lost.
992 * A'. Reno "three dupacks" marks head of queue lost. 957 * A'. Reno "three dupacks" marks head of queue lost.
993 * A''. Its FACK modification, head until snd.fack is lost.
994 * B. SACK arrives sacking SND.NXT at the moment, when the 958 * B. SACK arrives sacking SND.NXT at the moment, when the
995 * segment was retransmitted. 959 * segment was retransmitted.
996 * 4. D-SACK added new rule: D-SACK changes any tag to S. 960 * 4. D-SACK added new rule: D-SACK changes any tag to S.
@@ -1133,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1133} 1097}
1134 1098
1135struct tcp_sacktag_state { 1099struct tcp_sacktag_state {
1136 int reord; 1100 u32 reord;
1137 int fack_count;
1138 /* Timestamps for earliest and latest never-retransmitted segment 1101 /* Timestamps for earliest and latest never-retransmitted segment
1139 * that was SACKed. RTO needs the earliest RTT to stay conservative, 1102 * that was SACKed. RTO needs the earliest RTT to stay conservative,
1140 * but congestion control should still get an accurate delay signal. 1103 * but congestion control should still get an accurate delay signal.
@@ -1143,6 +1106,7 @@ struct tcp_sacktag_state {
1143 u64 last_sackt; 1106 u64 last_sackt;
1144 struct rate_sample *rate; 1107 struct rate_sample *rate;
1145 int flag; 1108 int flag;
1109 unsigned int mss_now;
1146}; 1110};
1147 1111
1148/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1112/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1192,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1192 if (pkt_len >= skb->len && !in_sack) 1156 if (pkt_len >= skb->len && !in_sack)
1193 return 0; 1157 return 0;
1194 1158
1195 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); 1159 err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
1160 pkt_len, mss, GFP_ATOMIC);
1196 if (err < 0) 1161 if (err < 0)
1197 return err; 1162 return err;
1198 } 1163 }
@@ -1208,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk,
1208 u64 xmit_time) 1173 u64 xmit_time)
1209{ 1174{
1210 struct tcp_sock *tp = tcp_sk(sk); 1175 struct tcp_sock *tp = tcp_sk(sk);
1211 int fack_count = state->fack_count;
1212 1176
1213 /* Account D-SACK for retransmitted packet. */ 1177 /* Account D-SACK for retransmitted packet. */
1214 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1178 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1215 if (tp->undo_marker && tp->undo_retrans > 0 && 1179 if (tp->undo_marker && tp->undo_retrans > 0 &&
1216 after(end_seq, tp->undo_marker)) 1180 after(end_seq, tp->undo_marker))
1217 tp->undo_retrans--; 1181 tp->undo_retrans--;
1218 if (sacked & TCPCB_SACKED_ACKED) 1182 if ((sacked & TCPCB_SACKED_ACKED) &&
1219 state->reord = min(fack_count, state->reord); 1183 before(start_seq, state->reord))
1184 state->reord = start_seq;
1220 } 1185 }
1221 1186
1222 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1187 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
@@ -1242,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
1242 * which was in hole. It is reordering. 1207 * which was in hole. It is reordering.
1243 */ 1208 */
1244 if (before(start_seq, 1209 if (before(start_seq,
1245 tcp_highest_sack_seq(tp))) 1210 tcp_highest_sack_seq(tp)) &&
1246 state->reord = min(fack_count, 1211 before(start_seq, state->reord))
1247 state->reord); 1212 state->reord = start_seq;
1213
1248 if (!after(end_seq, tp->high_seq)) 1214 if (!after(end_seq, tp->high_seq))
1249 state->flag |= FLAG_ORIG_SACK_ACKED; 1215 state->flag |= FLAG_ORIG_SACK_ACKED;
1250 if (state->first_sackt == 0) 1216 if (state->first_sackt == 0)
@@ -1263,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
1263 tp->sacked_out += pcount; 1229 tp->sacked_out += pcount;
1264 tp->delivered += pcount; /* Out-of-order packets delivered */ 1230 tp->delivered += pcount; /* Out-of-order packets delivered */
1265 1231
1266 fack_count += pcount;
1267
1268 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1232 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1269 if (!tcp_is_fack(tp) && tp->lost_skb_hint && 1233 if (tp->lost_skb_hint &&
1270 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1234 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1271 tp->lost_cnt_hint += pcount; 1235 tp->lost_cnt_hint += pcount;
1272
1273 if (fack_count > tp->fackets_out)
1274 tp->fackets_out = fack_count;
1275 } 1236 }
1276 1237
1277 /* D-SACK. We can detect redundant retransmission in S|R and plain R 1238 /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1289,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
1289/* Shift newly-SACKed bytes from this skb to the immediately previous 1250/* Shift newly-SACKed bytes from this skb to the immediately previous
1290 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 1251 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1291 */ 1252 */
1292static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1253static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1254 struct sk_buff *skb,
1293 struct tcp_sacktag_state *state, 1255 struct tcp_sacktag_state *state,
1294 unsigned int pcount, int shifted, int mss, 1256 unsigned int pcount, int shifted, int mss,
1295 bool dup_sack) 1257 bool dup_sack)
1296{ 1258{
1297 struct tcp_sock *tp = tcp_sk(sk); 1259 struct tcp_sock *tp = tcp_sk(sk);
1298 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1299 u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ 1260 u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
1300 u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ 1261 u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
1301 1262
@@ -1364,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1364 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) 1325 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1365 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; 1326 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
1366 1327
1367 tcp_unlink_write_queue(skb, sk); 1328 tcp_rtx_queue_unlink_and_free(skb, sk);
1368 sk_wmem_free_skb(sk, skb);
1369 1329
1370 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); 1330 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1371 1331
@@ -1415,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1415 goto fallback; 1375 goto fallback;
1416 1376
1417 /* Can only happen with delayed DSACK + discard craziness */ 1377 /* Can only happen with delayed DSACK + discard craziness */
1418 if (unlikely(skb == tcp_write_queue_head(sk))) 1378 prev = skb_rb_prev(skb);
1379 if (!prev)
1419 goto fallback; 1380 goto fallback;
1420 prev = tcp_write_queue_prev(sk, skb);
1421 1381
1422 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) 1382 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1423 goto fallback; 1383 goto fallback;
@@ -1496,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1496 1456
1497 if (!skb_shift(prev, skb, len)) 1457 if (!skb_shift(prev, skb, len))
1498 goto fallback; 1458 goto fallback;
1499 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) 1459 if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
1500 goto out; 1460 goto out;
1501 1461
1502 /* Hole filled allows collapsing with the next as well, this is very 1462 /* Hole filled allows collapsing with the next as well, this is very
1503 * useful when hole on every nth skb pattern happens 1463 * useful when hole on every nth skb pattern happens
1504 */ 1464 */
1505 if (prev == tcp_write_queue_tail(sk)) 1465 skb = skb_rb_next(prev);
1466 if (!skb)
1506 goto out; 1467 goto out;
1507 skb = tcp_write_queue_next(sk, prev);
1508 1468
1509 if (!skb_can_shift(skb) || 1469 if (!skb_can_shift(skb) ||
1510 (skb == tcp_send_head(sk)) ||
1511 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || 1470 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1512 (mss != tcp_skb_seglen(skb))) 1471 (mss != tcp_skb_seglen(skb)))
1513 goto out; 1472 goto out;
@@ -1515,11 +1474,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1515 len = skb->len; 1474 len = skb->len;
1516 if (skb_shift(prev, skb, len)) { 1475 if (skb_shift(prev, skb, len)) {
1517 pcount += tcp_skb_pcount(skb); 1476 pcount += tcp_skb_pcount(skb);
1518 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); 1477 tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
1478 len, mss, 0);
1519 } 1479 }
1520 1480
1521out: 1481out:
1522 state->fack_count += pcount;
1523 return prev; 1482 return prev;
1524 1483
1525noop: 1484noop:
@@ -1539,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1539 struct tcp_sock *tp = tcp_sk(sk); 1498 struct tcp_sock *tp = tcp_sk(sk);
1540 struct sk_buff *tmp; 1499 struct sk_buff *tmp;
1541 1500
1542 tcp_for_write_queue_from(skb, sk) { 1501 skb_rbtree_walk_from(skb) {
1543 int in_sack = 0; 1502 int in_sack = 0;
1544 bool dup_sack = dup_sack_in; 1503 bool dup_sack = dup_sack_in;
1545 1504
1546 if (skb == tcp_send_head(sk))
1547 break;
1548
1549 /* queue is in-order => we can short-circuit the walk early */ 1505 /* queue is in-order => we can short-circuit the walk early */
1550 if (!before(TCP_SKB_CB(skb)->seq, end_seq)) 1506 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1551 break; 1507 break;
@@ -1594,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1594 tcp_skb_pcount(skb), 1550 tcp_skb_pcount(skb),
1595 skb->skb_mstamp); 1551 skb->skb_mstamp);
1596 tcp_rate_skb_delivered(sk, skb, state->rate); 1552 tcp_rate_skb_delivered(sk, skb, state->rate);
1553 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1554 list_del_init(&skb->tcp_tsorted_anchor);
1597 1555
1598 if (!before(TCP_SKB_CB(skb)->seq, 1556 if (!before(TCP_SKB_CB(skb)->seq,
1599 tcp_highest_sack_seq(tp))) 1557 tcp_highest_sack_seq(tp)))
1600 tcp_advance_highest_sack(sk, skb); 1558 tcp_advance_highest_sack(sk, skb);
1601 } 1559 }
1602
1603 state->fack_count += tcp_skb_pcount(skb);
1604 } 1560 }
1605 return skb; 1561 return skb;
1606} 1562}
1607 1563
1608/* Avoid all extra work that is being done by sacktag while walking in 1564static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1609 * a normal way 1565 struct tcp_sacktag_state *state,
1610 */ 1566 u32 seq)
1567{
1568 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
1569 struct sk_buff *skb;
1570
1571 while (*p) {
1572 parent = *p;
1573 skb = rb_to_skb(parent);
1574 if (before(seq, TCP_SKB_CB(skb)->seq)) {
1575 p = &parent->rb_left;
1576 continue;
1577 }
1578 if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
1579 p = &parent->rb_right;
1580 continue;
1581 }
1582 return skb;
1583 }
1584 return NULL;
1585}
1586
1611static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, 1587static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1612 struct tcp_sacktag_state *state, 1588 struct tcp_sacktag_state *state,
1613 u32 skip_to_seq) 1589 u32 skip_to_seq)
1614{ 1590{
1615 tcp_for_write_queue_from(skb, sk) { 1591 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
1616 if (skb == tcp_send_head(sk)) 1592 return skb;
1617 break;
1618
1619 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1620 break;
1621 1593
1622 state->fack_count += tcp_skb_pcount(skb); 1594 return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1623 }
1624 return skb;
1625} 1595}
1626 1596
1627static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, 1597static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
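
tcp_sacktag_bsearch() above replaces the linear tcp_for_write_queue_from() scan in tcp_sacktag_skip() with an ordered descent of the rtx rbtree, keyed on skb seq. The same predicate over a sorted array, as a runnable illustration (the kernel descends rb_left/rb_right instead of halving indices; the segments below are invented):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;

static int before(u32 seq1, u32 seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

struct seg {		/* stand-in for an skb on the rtx queue */
	u32 seq;
	u32 end_seq;
};

/* Find the segment whose [seq, end_seq) range contains 'seq': go left
 * when seq is before the node's start, right when it is at or past the
 * node's end, exactly as in the rbtree walk above.
 */
static const struct seg *find_seg(const struct seg *q, int n, u32 seq)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (before(seq, q[mid].seq))
			hi = mid - 1;		/* rb_left  */
		else if (!before(seq, q[mid].end_seq))
			lo = mid + 1;		/* rb_right */
		else
			return &q[mid];
	}
	return NULL;				/* not queued */
}

int main(void)
{
	const struct seg rtx[] = {
		{ 1000, 2448 }, { 2448, 3896 }, { 5344, 6792 },
	};
	const struct seg *s = find_seg(rtx, 3, 2500);

	if (s)
		printf("2500 falls in [%u, %u)\n", s->seq, s->end_seq);
	else
		printf("2500 not in the retransmit queue\n");
	return 0;
}
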
@@ -1666,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1666 int first_sack_index; 1636 int first_sack_index;
1667 1637
1668 state->flag = 0; 1638 state->flag = 0;
1669 state->reord = tp->packets_out; 1639 state->reord = tp->snd_nxt;
1670 1640
1671 if (!tp->sacked_out) { 1641 if (!tp->sacked_out)
1672 if (WARN_ON(tp->fackets_out))
1673 tp->fackets_out = 0;
1674 tcp_highest_sack_reset(sk); 1642 tcp_highest_sack_reset(sk);
1675 }
1676 1643
1677 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, 1644 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1678 num_sacks, prior_snd_una); 1645 num_sacks, prior_snd_una);
@@ -1743,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1743 } 1710 }
1744 } 1711 }
1745 1712
1746 skb = tcp_write_queue_head(sk); 1713 state->mss_now = tcp_current_mss(sk);
1747 state->fack_count = 0; 1714 skb = NULL;
1748 i = 0; 1715 i = 0;
1749 1716
1750 if (!tp->sacked_out) { 1717 if (!tp->sacked_out) {
@@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1801 skb = tcp_highest_sack(sk); 1768 skb = tcp_highest_sack(sk);
1802 if (!skb) 1769 if (!skb)
1803 break; 1770 break;
1804 state->fack_count = tp->fackets_out;
1805 cache++; 1771 cache++;
1806 goto walk; 1772 goto walk;
1807 } 1773 }
@@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1816 skb = tcp_highest_sack(sk); 1782 skb = tcp_highest_sack(sk);
1817 if (!skb) 1783 if (!skb)
1818 break; 1784 break;
1819 state->fack_count = tp->fackets_out;
1820 } 1785 }
1821 skb = tcp_sacktag_skip(skb, sk, state, start_seq); 1786 skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1822 1787
@@ -1836,9 +1801,8 @@ advance_sp:
1836 for (j = 0; j < used_sacks; j++) 1801 for (j = 0; j < used_sacks; j++)
1837 tp->recv_sack_cache[i++] = sp[j]; 1802 tp->recv_sack_cache[i++] = sp[j];
1838 1803
1839 if ((state->reord < tp->fackets_out) && 1804 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
1840 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) 1805 tcp_check_sack_reordering(sk, state->reord, 0);
1841 tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
1842 1806
1843 tcp_verify_left_out(tp); 1807 tcp_verify_left_out(tp);
1844out: 1808out:
@@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1876static void tcp_check_reno_reordering(struct sock *sk, const int addend) 1840static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1877{ 1841{
1878 struct tcp_sock *tp = tcp_sk(sk); 1842 struct tcp_sock *tp = tcp_sk(sk);
1879 if (tcp_limit_reno_sacked(tp)) 1843
1880 tcp_update_reordering(sk, tp->packets_out + addend, 0); 1844 if (!tcp_limit_reno_sacked(tp))
1845 return;
1846
1847 tp->reordering = min_t(u32, tp->packets_out + addend,
1848 sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1849 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
1881} 1850}
1882 1851
1883/* Emulate SACKs for SACKless connection: account for a new dupack. */ 1852/* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp)
1923 tp->lost_out = 0; 1892 tp->lost_out = 0;
1924 tp->undo_marker = 0; 1893 tp->undo_marker = 0;
1925 tp->undo_retrans = -1; 1894 tp->undo_retrans = -1;
1926 tp->fackets_out = 0;
1927 tp->sacked_out = 0; 1895 tp->sacked_out = 0;
1928} 1896}
1929 1897
@@ -1968,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk)
1968 if (tcp_is_reno(tp)) 1936 if (tcp_is_reno(tp))
1969 tcp_reset_reno_sack(tp); 1937 tcp_reset_reno_sack(tp);
1970 1938
1971 skb = tcp_write_queue_head(sk); 1939 skb = tcp_rtx_queue_head(sk);
1972 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); 1940 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1973 if (is_reneg) { 1941 if (is_reneg) {
1974 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); 1942 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1975 tp->sacked_out = 0; 1943 tp->sacked_out = 0;
1976 tp->fackets_out = 0;
1977 } 1944 }
1978 tcp_clear_all_retrans_hints(tp); 1945 tcp_clear_all_retrans_hints(tp);
1979 1946
1980 tcp_for_write_queue(skb, sk) { 1947 skb_rbtree_walk_from(skb) {
1981 if (skb == tcp_send_head(sk))
1982 break;
1983
1984 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 1948 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
1985 is_reneg); 1949 is_reneg);
1986 if (mark_lost) 1950 if (mark_lost)
@@ -2014,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk)
2014 * falsely raise the receive window, which results in repeated 1978 * falsely raise the receive window, which results in repeated
2015 * timeouts and stop-and-go behavior. 1979 * timeouts and stop-and-go behavior.
2016 */ 1980 */
2017 tp->frto = sysctl_tcp_frto && 1981 tp->frto = net->ipv4.sysctl_tcp_frto &&
2018 (new_recovery || icsk->icsk_retransmits) && 1982 (new_recovery || icsk->icsk_retransmits) &&
2019 !inet_csk(sk)->icsk_mtup.probe_size; 1983 !inet_csk(sk)->icsk_mtup.probe_size;
2020} 1984}
@@ -2043,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2043 return false; 2007 return false;
2044} 2008}
2045 2009
2046static inline int tcp_fackets_out(const struct tcp_sock *tp)
2047{
2048 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
2049}
2050
2051/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs 2010/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
2052 * counter when SACK is enabled (without SACK, sacked_out is used for 2011 * counter when SACK is enabled (without SACK, sacked_out is used for
2053 * that purpose). 2012 * that purpose).
2054 * 2013 *
2055 * Instead, with FACK TCP uses fackets_out that includes both SACKed
2056 * segments up to the highest received SACK block so far and holes in
2057 * between them.
2058 *
2059 * With reordering, holes may still be in flight, so RFC3517 recovery 2014 * With reordering, holes may still be in flight, so RFC3517 recovery
2060 * uses pure sacked_out (total number of SACKed segments) even though 2015 * uses pure sacked_out (total number of SACKed segments) even though
2061 * it violates the RFC that uses duplicate ACKs, often these are equal 2016 * it violates the RFC that uses duplicate ACKs, often these are equal
@@ -2065,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
2065 */ 2020 */
2066static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 2021static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2067{ 2022{
2068 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2023 return tp->sacked_out + 1;
2069} 2024}
2070 2025
2071/* Linux NewReno/SACK/FACK/ECN state machine. 2026/* Linux NewReno/SACK/ECN state machine.
2072 * -------------------------------------- 2027 * --------------------------------------
2073 * 2028 *
2074 * "Open" Normal state, no dubious events, fast path. 2029 * "Open" Normal state, no dubious events, fast path.
@@ -2133,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2133 * dynamically measured and adjusted. This is implemented in 2088 * dynamically measured and adjusted. This is implemented in
2134 * tcp_rack_mark_lost. 2089 * tcp_rack_mark_lost.
2135 * 2090 *
2136 * FACK (Disabled by default. Subsumbed by RACK):
2137 * It is the simplest heuristics. As soon as we decided
2138 * that something is lost, we decide that _all_ not SACKed
2139 * packets until the most forward SACK are lost. I.e.
2140 * lost_out = fackets_out - sacked_out and left_out = fackets_out.
2141 * It is absolutely correct estimate, if network does not reorder
2142 * packets. And it loses any connection to reality when reordering
2143 * takes place. We use FACK by default until reordering
2144 * is suspected on the path to this destination.
2145 *
2146 * If the receiver does not support SACK: 2091 * If the receiver does not support SACK:
2147 * 2092 *
2148 * NewReno (RFC6582): in Recovery we assume that one segment 2093 * NewReno (RFC6582): in Recovery we assume that one segment
@@ -2191,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2191} 2136}
2192 2137
2193/* Detect loss in event "A" above by marking head of queue up as lost. 2138/* Detect loss in event "A" above by marking head of queue up as lost.
2194 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments 2139 * For non-SACK(Reno) senders, the first "packets" number of segments
2195 * are considered lost. For RFC3517 SACK, a segment is considered lost if it 2140 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2196 * has at least tp->reordering SACKed seqments above it; "packets" refers to 2141 * has at least tp->reordering SACKed seqments above it; "packets" refers to
2197 * the maximum SACKed segments to pass before reaching this limit. 2142 * the maximum SACKed segments to pass before reaching this limit.
@@ -2206,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2206 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; 2151 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2207 2152
2208 WARN_ON(packets > tp->packets_out); 2153 WARN_ON(packets > tp->packets_out);
2209 if (tp->lost_skb_hint) { 2154 skb = tp->lost_skb_hint;
2210 skb = tp->lost_skb_hint; 2155 if (skb) {
2211 cnt = tp->lost_cnt_hint;
2212 /* Head already handled? */ 2156 /* Head already handled? */
2213 if (mark_head && skb != tcp_write_queue_head(sk)) 2157 if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
2214 return; 2158 return;
2159 cnt = tp->lost_cnt_hint;
2215 } else { 2160 } else {
2216 skb = tcp_write_queue_head(sk); 2161 skb = tcp_rtx_queue_head(sk);
2217 cnt = 0; 2162 cnt = 0;
2218 } 2163 }
2219 2164
2220 tcp_for_write_queue_from(skb, sk) { 2165 skb_rbtree_walk_from(skb) {
2221 if (skb == tcp_send_head(sk))
2222 break;
2223 /* TODO: do this better */ 2166 /* TODO: do this better */
2224 /* this is not the most efficient way to do this... */ 2167 /* this is not the most efficient way to do this... */
2225 tp->lost_skb_hint = skb; 2168 tp->lost_skb_hint = skb;
@@ -2229,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2229 break; 2172 break;
2230 2173
2231 oldcnt = cnt; 2174 oldcnt = cnt;
2232 if (tcp_is_fack(tp) || tcp_is_reno(tp) || 2175 if (tcp_is_reno(tp) ||
2233 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2176 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2234 cnt += tcp_skb_pcount(skb); 2177 cnt += tcp_skb_pcount(skb);
2235 2178
2236 if (cnt > packets) { 2179 if (cnt > packets) {
2237 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2180 if (tcp_is_sack(tp) ||
2238 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2181 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2239 (oldcnt >= packets)) 2182 (oldcnt >= packets))
2240 break; 2183 break;
@@ -2243,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2243 /* If needed, chop off the prefix to mark as lost. */ 2186 /* If needed, chop off the prefix to mark as lost. */
2244 lost = (packets - oldcnt) * mss; 2187 lost = (packets - oldcnt) * mss;
2245 if (lost < skb->len && 2188 if (lost < skb->len &&
2246 tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) 2189 tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2190 lost, mss, GFP_ATOMIC) < 0)
2247 break; 2191 break;
2248 cnt = packets; 2192 cnt = packets;
2249 } 2193 }
@@ -2264,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2264 2208
2265 if (tcp_is_reno(tp)) { 2209 if (tcp_is_reno(tp)) {
2266 tcp_mark_head_lost(sk, 1, 1); 2210 tcp_mark_head_lost(sk, 1, 1);
2267 } else if (tcp_is_fack(tp)) {
2268 int lost = tp->fackets_out - tp->reordering;
2269 if (lost <= 0)
2270 lost = 1;
2271 tcp_mark_head_lost(sk, lost, 0);
2272 } else { 2211 } else {
2273 int sacked_upto = tp->sacked_out - tp->reordering; 2212 int sacked_upto = tp->sacked_out - tp->reordering;
2274 if (sacked_upto >= 0) 2213 if (sacked_upto >= 0)
@@ -2327,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk)
2327 if (tp->retrans_out) 2266 if (tp->retrans_out)
2328 return true; 2267 return true;
2329 2268
2330 skb = tcp_write_queue_head(sk); 2269 skb = tcp_rtx_queue_head(sk);
2331 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 2270 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2332 return true; 2271 return true;
2333 2272
2334 return false; 2273 return false;
2335} 2274}
2336 2275
2337#if FASTRETRANS_DEBUG > 1
2338static void DBGUNDO(struct sock *sk, const char *msg) 2276static void DBGUNDO(struct sock *sk, const char *msg)
2339{ 2277{
2278#if FASTRETRANS_DEBUG > 1
2340 struct tcp_sock *tp = tcp_sk(sk); 2279 struct tcp_sock *tp = tcp_sk(sk);
2341 struct inet_sock *inet = inet_sk(sk); 2280 struct inet_sock *inet = inet_sk(sk);
2342 2281
@@ -2358,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2358 tp->packets_out); 2297 tp->packets_out);
2359 } 2298 }
2360#endif 2299#endif
2361}
2362#else
2363#define DBGUNDO(x...) do { } while (0)
2364#endif 2300#endif
2301}
2365 2302
2366static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) 2303static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2367{ 2304{
@@ -2370,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2370 if (unmark_loss) { 2307 if (unmark_loss) {
2371 struct sk_buff *skb; 2308 struct sk_buff *skb;
2372 2309
2373 tcp_for_write_queue(skb, sk) { 2310 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2374 if (skb == tcp_send_head(sk))
2375 break;
2376 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 2311 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2377 } 2312 }
2378 tp->lost_out = 0; 2313 tp->lost_out = 0;
@@ -2417,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
2417 mib_idx = LINUX_MIB_TCPFULLUNDO; 2352 mib_idx = LINUX_MIB_TCPFULLUNDO;
2418 2353
2419 NET_INC_STATS(sock_net(sk), mib_idx); 2354 NET_INC_STATS(sock_net(sk), mib_idx);
2355 } else if (tp->rack.reo_wnd_persist) {
2356 tp->rack.reo_wnd_persist--;
2420 } 2357 }
2421 if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { 2358 if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2422 /* Hold old state until something *above* high_seq 2359 /* Hold old state until something *above* high_seq
@@ -2436,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
2436 struct tcp_sock *tp = tcp_sk(sk); 2373 struct tcp_sock *tp = tcp_sk(sk);
2437 2374
2438 if (tp->undo_marker && !tp->undo_retrans) { 2375 if (tp->undo_marker && !tp->undo_retrans) {
2376 tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
2377 tp->rack.reo_wnd_persist + 1);
2439 DBGUNDO(sk, "D-SACK"); 2378 DBGUNDO(sk, "D-SACK");
2440 tcp_undo_cwnd_reduction(sk, false); 2379 tcp_undo_cwnd_reduction(sk, false);
2441 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2380 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
@@ -2616,9 +2555,7 @@ void tcp_simple_retransmit(struct sock *sk)
2616 struct sk_buff *skb; 2555 struct sk_buff *skb;
2617 unsigned int mss = tcp_current_mss(sk); 2556 unsigned int mss = tcp_current_mss(sk);
2618 2557
2619 tcp_for_write_queue(skb, sk) { 2558 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2620 if (skb == tcp_send_head(sk))
2621 break;
2622 if (tcp_skb_seglen(skb) > mss && 2559 if (tcp_skb_seglen(skb) > mss &&
2623 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 2560 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2624 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2561 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2712,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2712 * is updated in tcp_ack()). Otherwise fall back to 2649 * is updated in tcp_ack()). Otherwise fall back to
2713 * the conventional recovery. 2650 * the conventional recovery.
2714 */ 2651 */
2715 if (tcp_send_head(sk) && 2652 if (!tcp_write_queue_empty(sk) &&
2716 after(tcp_wnd_end(tp), tp->snd_nxt)) { 2653 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2717 *rexmit = REXMIT_NEW; 2654 *rexmit = REXMIT_NEW;
2718 return; 2655 return;
@@ -2739,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2739} 2676}
2740 2677
2741/* Undo during fast recovery after partial ACK. */ 2678/* Undo during fast recovery after partial ACK. */
2742static bool tcp_try_undo_partial(struct sock *sk, const int acked) 2679static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2743{ 2680{
2744 struct tcp_sock *tp = tcp_sk(sk); 2681 struct tcp_sock *tp = tcp_sk(sk);
2745 2682
2746 if (tp->undo_marker && tcp_packet_delayed(tp)) { 2683 if (tp->undo_marker && tcp_packet_delayed(tp)) {
2747 /* Plain luck! Hole if filled with delayed 2684 /* Plain luck! Hole if filled with delayed
2748 * packet, rather than with a retransmit. 2685 * packet, rather than with a retransmit. Check reordering.
2749 */ 2686 */
2750 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2687 tcp_check_sack_reordering(sk, prior_snd_una, 1);
2751 2688
2752 /* We are getting evidence that the reordering degree is higher 2689 /* We are getting evidence that the reordering degree is higher
2753 * than we realized. If there are no retransmits out then we 2690 * than we realized. If there are no retransmits out then we
@@ -2774,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
2774 struct tcp_sock *tp = tcp_sk(sk); 2711 struct tcp_sock *tp = tcp_sk(sk);
2775 2712
2776 /* Use RACK to detect loss */ 2713 /* Use RACK to detect loss */
2777 if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { 2714 if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
2778 u32 prior_retrans = tp->retrans_out; 2715 u32 prior_retrans = tp->retrans_out;
2779 2716
2780 tcp_rack_mark_lost(sk); 2717 tcp_rack_mark_lost(sk);
@@ -2783,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
2783 } 2720 }
2784} 2721}
2785 2722
2723static bool tcp_force_fast_retransmit(struct sock *sk)
2724{
2725 struct tcp_sock *tp = tcp_sk(sk);
2726
2727 return after(tcp_highest_sack_seq(tp),
2728 tp->snd_una + tp->reordering * tp->mss_cache);
2729}
2730
2786/* Process an event, which can update packets-in-flight not trivially. 2731/* Process an event, which can update packets-in-flight not trivially.
2787 * Main goal of this function is to calculate new estimate for left_out, 2732 * Main goal of this function is to calculate new estimate for left_out,
2788 * taking into account both packets sitting in receiver's buffer and 2733 * taking into account both packets sitting in receiver's buffer and
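
tcp_force_fast_retransmit() above restates the old fackets_out > reordering dupack threshold in sequence space: fast retransmit is forced once the highest SACKed sequence runs more than reordering * mss bytes past snd_una. A tiny standalone check (sample numbers invented):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32;

static int after(u32 seq1, u32 seq2)
{
	return (int32_t)(seq2 - seq1) < 0;
}

/* Sequence-space version of the dupack threshold. */
static int force_fast_retransmit(u32 highest_sack_seq, u32 snd_una,
				 u32 reordering, u32 mss)
{
	return after(highest_sack_seq, snd_una + reordering * mss);
}

int main(void)
{
	/* With reordering = 3 and mss = 1448, a SACK 4 segments ahead of
	 * snd_una crosses the threshold; 3 segments do not.
	 */
	printf("%d %d\n",
	       force_fast_retransmit(1 + 4 * 1448, 1, 3, 1448),
	       force_fast_retransmit(1 + 3 * 1448, 1, 3, 1448));
	return 0;
}
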
@@ -2795,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
2795 * It does _not_ decide what to send, it is made in function 2740 * It does _not_ decide what to send, it is made in function
2796 * tcp_xmit_retransmit_queue(). 2741 * tcp_xmit_retransmit_queue().
2797 */ 2742 */
2798static void tcp_fastretrans_alert(struct sock *sk, const int acked, 2743static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2799 bool is_dupack, int *ack_flag, int *rexmit) 2744 bool is_dupack, int *ack_flag, int *rexmit)
2800{ 2745{
2801 struct inet_connection_sock *icsk = inet_csk(sk); 2746 struct inet_connection_sock *icsk = inet_csk(sk);
2802 struct tcp_sock *tp = tcp_sk(sk); 2747 struct tcp_sock *tp = tcp_sk(sk);
2803 int fast_rexmit = 0, flag = *ack_flag; 2748 int fast_rexmit = 0, flag = *ack_flag;
2804 bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 2749 bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2805 (tcp_fackets_out(tp) > tp->reordering)); 2750 tcp_force_fast_retransmit(sk));
2806 2751
2807 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 2752 if (!tp->packets_out && tp->sacked_out)
2808 tp->sacked_out = 0; 2753 tp->sacked_out = 0;
2809 if (WARN_ON(!tp->sacked_out && tp->fackets_out))
2810 tp->fackets_out = 0;
2811 2754
2812 /* Now state machine starts. 2755 /* Now state machine starts.
2813 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ 2756 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
@@ -2854,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2854 if (tcp_is_reno(tp) && is_dupack) 2797 if (tcp_is_reno(tp) && is_dupack)
2855 tcp_add_reno_sack(sk); 2798 tcp_add_reno_sack(sk);
2856 } else { 2799 } else {
2857 if (tcp_try_undo_partial(sk, acked)) 2800 if (tcp_try_undo_partial(sk, prior_snd_una))
2858 return; 2801 return;
2859 /* Partial ACK arrived. Force fast retransmit. */ 2802 /* Partial ACK arrived. Force fast retransmit. */
2860 do_lost = tcp_is_reno(tp) || 2803 do_lost = tcp_is_reno(tp) ||
2861 tcp_fackets_out(tp) > tp->reordering; 2804 tcp_force_fast_retransmit(sk);
2862 } 2805 }
2863 if (tcp_try_undo_dsack(sk)) { 2806 if (tcp_try_undo_dsack(sk)) {
2864 tcp_try_keep_open(sk); 2807 tcp_try_keep_open(sk);
@@ -2873,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2873 (*ack_flag & FLAG_LOST_RETRANS))) 2816 (*ack_flag & FLAG_LOST_RETRANS)))
2874 return; 2817 return;
2875 /* Change state if cwnd is undone or retransmits are lost */ 2818 /* Change state if cwnd is undone or retransmits are lost */
2819 /* fall through */
2876 default: 2820 default:
2877 if (tcp_is_reno(tp)) { 2821 if (tcp_is_reno(tp)) {
2878 if (flag & FLAG_SND_UNA_ADVANCED) 2822 if (flag & FLAG_SND_UNA_ADVANCED)
@@ -2913,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2913 2857
2914static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) 2858static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2915{ 2859{
2860 u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
2916 struct tcp_sock *tp = tcp_sk(sk); 2861 struct tcp_sock *tp = tcp_sk(sk);
2917 u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
2918 2862
2919 minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, 2863 minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
2920 rtt_us ? : jiffies_to_usecs(1)); 2864 rtt_us ? : jiffies_to_usecs(1));
@@ -3056,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3056 3000
3057 shinfo = skb_shinfo(skb); 3001 shinfo = skb_shinfo(skb);
3058 if (!before(shinfo->tskey, prior_snd_una) && 3002 if (!before(shinfo->tskey, prior_snd_una) &&
3059 before(shinfo->tskey, tcp_sk(sk)->snd_una)) 3003 before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
3060 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); 3004 tcp_skb_tsorted_save(skb) {
3005 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3006 } tcp_skb_tsorted_restore(skb);
3007 }
3061} 3008}
3062 3009
3063/* Remove acknowledged frames from the retransmission queue. If our packet 3010/* Remove acknowledged frames from the retransmission queue. If our packet
3064 * is before the ack sequence we can discard it as it's confirmed to have 3011 * is before the ack sequence we can discard it as it's confirmed to have
3065 * arrived at the other end. 3012 * arrived at the other end.
3066 */ 3013 */
3067static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3014static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
3068 u32 prior_snd_una, int *acked, 3015 u32 prior_snd_una,
3069 struct tcp_sacktag_state *sack) 3016 struct tcp_sacktag_state *sack)
3070{ 3017{
3071 const struct inet_connection_sock *icsk = inet_csk(sk); 3018 const struct inet_connection_sock *icsk = inet_csk(sk);
3072 u64 first_ackt, last_ackt; 3019 u64 first_ackt, last_ackt;
3073 struct tcp_sock *tp = tcp_sk(sk); 3020 struct tcp_sock *tp = tcp_sk(sk);
3074 u32 prior_sacked = tp->sacked_out; 3021 u32 prior_sacked = tp->sacked_out;
3075 u32 reord = tp->packets_out; 3022 u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
3023 struct sk_buff *skb, *next;
3076 bool fully_acked = true; 3024 bool fully_acked = true;
3077 long sack_rtt_us = -1L; 3025 long sack_rtt_us = -1L;
3078 long seq_rtt_us = -1L; 3026 long seq_rtt_us = -1L;
3079 long ca_rtt_us = -1L; 3027 long ca_rtt_us = -1L;
3080 struct sk_buff *skb;
3081 u32 pkts_acked = 0; 3028 u32 pkts_acked = 0;
3082 u32 last_in_flight = 0; 3029 u32 last_in_flight = 0;
3083 bool rtt_update; 3030 bool rtt_update;
@@ -3085,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3085 3032
3086 first_ackt = 0; 3033 first_ackt = 0;
3087 3034
3088 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3035 for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
3089 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3036 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3037 const u32 start_seq = scb->seq;
3090 u8 sacked = scb->sacked; 3038 u8 sacked = scb->sacked;
3091 u32 acked_pcount; 3039 u32 acked_pcount;
3092 3040
@@ -3103,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3103 break; 3051 break;
3104 fully_acked = false; 3052 fully_acked = false;
3105 } else { 3053 } else {
3106 /* Speedup tcp_unlink_write_queue() and next loop */
3107 prefetchw(skb->next);
3108 acked_pcount = tcp_skb_pcount(skb); 3054 acked_pcount = tcp_skb_pcount(skb);
3109 } 3055 }
3110 3056
@@ -3119,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3119 first_ackt = last_ackt; 3065 first_ackt = last_ackt;
3120 3066
3121 last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; 3067 last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
3122 reord = min(pkts_acked, reord); 3068 if (before(start_seq, reord))
3069 reord = start_seq;
3123 if (!after(scb->end_seq, tp->high_seq)) 3070 if (!after(scb->end_seq, tp->high_seq))
3124 flag |= FLAG_ORIG_SACK_ACKED; 3071 flag |= FLAG_ORIG_SACK_ACKED;
3125 } 3072 }
@@ -3156,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3156 if (!fully_acked) 3103 if (!fully_acked)
3157 break; 3104 break;
3158 3105
3159 tcp_unlink_write_queue(skb, sk); 3106 next = skb_rb_next(skb);
3160 sk_wmem_free_skb(sk, skb);
3161 if (unlikely(skb == tp->retransmit_skb_hint)) 3107 if (unlikely(skb == tp->retransmit_skb_hint))
3162 tp->retransmit_skb_hint = NULL; 3108 tp->retransmit_skb_hint = NULL;
3163 if (unlikely(skb == tp->lost_skb_hint)) 3109 if (unlikely(skb == tp->lost_skb_hint))
3164 tp->lost_skb_hint = NULL; 3110 tp->lost_skb_hint = NULL;
3111 tcp_rtx_queue_unlink_and_free(skb, sk);
3165 } 3112 }
3166 3113
3167 if (!skb) 3114 if (!skb)
@@ -3197,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3197 int delta; 3144 int delta;
3198 3145
3199 /* Non-retransmitted hole got filled? That's reordering */ 3146 /* Non-retransmitted hole got filled? That's reordering */
3200 if (reord < prior_fackets && reord <= tp->fackets_out) 3147 if (before(reord, prior_fack))
3201 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 3148 tcp_check_sack_reordering(sk, reord, 0);
3202 3149
3203 delta = tcp_is_fack(tp) ? pkts_acked : 3150 delta = prior_sacked - tp->sacked_out;
3204 prior_sacked - tp->sacked_out;
3205 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3151 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3206 } 3152 }
3207
3208 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3209
3210 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3153 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3211 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { 3154 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3212 /* Do not re-arm RTO if the sack RTT is measured from data sent 3155 /* Do not re-arm RTO if the sack RTT is measured from data sent
@@ -3247,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3247 } 3190 }
3248 } 3191 }
3249#endif 3192#endif
3250 *acked = pkts_acked;
3251 return flag; 3193 return flag;
3252} 3194}
3253 3195
3254static void tcp_ack_probe(struct sock *sk) 3196static void tcp_ack_probe(struct sock *sk)
3255{ 3197{
3256 const struct tcp_sock *tp = tcp_sk(sk);
3257 struct inet_connection_sock *icsk = inet_csk(sk); 3198 struct inet_connection_sock *icsk = inet_csk(sk);
3199 struct sk_buff *head = tcp_send_head(sk);
3200 const struct tcp_sock *tp = tcp_sk(sk);
3258 3201
3259 /* Was it a usable window open? */ 3202 /* Was it a usable window open? */
3260 3203 if (!head)
3261 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { 3204 return;
3205 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
3262 icsk->icsk_backoff = 0; 3206 icsk->icsk_backoff = 0;
3263 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); 3207 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3264 /* Socket must be waked up by subsequent tcp_data_snd_check(). 3208 /* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3378,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3378 tp->pred_flags = 0; 3322 tp->pred_flags = 0;
3379 tcp_fast_path_check(sk); 3323 tcp_fast_path_check(sk);
3380 3324
3381 if (tcp_send_head(sk)) 3325 if (!tcp_write_queue_empty(sk))
3382 tcp_slow_start_after_idle_check(sk); 3326 tcp_slow_start_after_idle_check(sk);
3383 3327
3384 if (nwin > tp->max_window) { 3328 if (nwin > tp->max_window) {
@@ -3399,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
3399 if (*last_oow_ack_time) { 3343 if (*last_oow_ack_time) {
3400 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); 3344 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
3401 3345
3402 if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { 3346 if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
3403 NET_INC_STATS(net, mib_idx); 3347 NET_INC_STATS(net, mib_idx);
3404 return true; /* rate-limited: don't send yet! */ 3348 return true; /* rate-limited: don't send yet! */
3405 } 3349 }
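
The (s32)(tcp_jiffies32 - *last_oow_ack_time) cast above, like the before(reord, prior_fack) test in the earlier tcp_clean_rtx_queue hunk, relies on wraparound-safe comparison: subtract in unsigned 32-bit arithmetic, then interpret the difference as signed. A minimal standalone illustration in userspace C, mirroring the before()/after() helpers from include/net/tcp.h:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Wraparound-safe "seq1 is earlier than seq2", as TCP's before() does. */
	static bool seq_before(uint32_t seq1, uint32_t seq2)
	{
		return (int32_t)(seq1 - seq2) < 0;
	}

	int main(void)
	{
		/* Still correct across the 2^32 wrap: 0xfffffff0 precedes 0x10. */
		assert(seq_before(0xfffffff0u, 0x00000010u));
		assert(!seq_before(0x00000010u, 0xfffffff0u));
		/* The same trick bounds an elapsed-jiffies check to ~2^31 ticks. */
		return 0;
	}
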
@@ -3435,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3435 static u32 challenge_timestamp; 3379 static u32 challenge_timestamp;
3436 static unsigned int challenge_count; 3380 static unsigned int challenge_count;
3437 struct tcp_sock *tp = tcp_sk(sk); 3381 struct tcp_sock *tp = tcp_sk(sk);
3382 struct net *net = sock_net(sk);
3438 u32 count, now; 3383 u32 count, now;
3439 3384
3440 /* First check our per-socket dupack rate limit. */ 3385 /* First check our per-socket dupack rate limit. */
3441 if (__tcp_oow_rate_limited(sock_net(sk), 3386 if (__tcp_oow_rate_limited(net,
3442 LINUX_MIB_TCPACKSKIPPEDCHALLENGE, 3387 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3443 &tp->last_oow_ack_time)) 3388 &tp->last_oow_ack_time))
3444 return; 3389 return;
@@ -3446,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3446 /* Then check host-wide RFC 5961 rate limit. */ 3391 /* Then check host-wide RFC 5961 rate limit. */
3447 now = jiffies / HZ; 3392 now = jiffies / HZ;
3448 if (now != challenge_timestamp) { 3393 if (now != challenge_timestamp) {
3449 u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; 3394 u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
3395 u32 half = (ack_limit + 1) >> 1;
3450 3396
3451 challenge_timestamp = now; 3397 challenge_timestamp = now;
3452 WRITE_ONCE(challenge_count, half + 3398 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
3453 prandom_u32_max(sysctl_tcp_challenge_ack_limit));
3454 } 3399 }
3455 count = READ_ONCE(challenge_count); 3400 count = READ_ONCE(challenge_count);
3456 if (count > 0) { 3401 if (count > 0) {
3457 WRITE_ONCE(challenge_count, count - 1); 3402 WRITE_ONCE(challenge_count, count - 1);
3458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); 3403 NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
3459 tcp_send_ack(sk); 3404 tcp_send_ack(sk);
3460 } 3405 }
3461} 3406}
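
Both rate limits in tcp_send_challenge_ack() now read their knobs from the ACK'ing socket's network namespace (net->ipv4.sysctl_tcp_invalid_ratelimit and net->ipv4.sysctl_tcp_challenge_ack_limit) rather than from global sysctls, and the host-wide RFC 5961 budget is refilled once per second to half the limit plus prandom_u32_max(ack_limit), i.e. a random value in [0, ack_limit). A compilable userspace model of that budget, for illustration only; the kernel version above uses jiffies and prandom, and tolerates the racy unlocked counter, hence its READ_ONCE/WRITE_ONCE annotations:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <time.h>

	static uint32_t challenge_timestamp;	/* last refill, in seconds */
	static uint32_t challenge_count;	/* remaining challenge ACKs */

	/* Return true if a challenge ACK may still be sent this second. */
	static bool challenge_ack_allowed(uint32_t limit)
	{
		uint32_t now = (uint32_t)time(NULL);

		if (now != challenge_timestamp) {
			uint32_t half = (limit + 1) / 2;

			challenge_timestamp = now;
			challenge_count = half + (uint32_t)rand() % limit;
		}
		if (challenge_count > 0) {
			challenge_count--;
			return true;
		}
		return false;
	}

	int main(void)
	{
		srand((unsigned)time(NULL));
		return challenge_ack_allowed(1000) ? 0 : 1;
	}
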
@@ -3553,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3553 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3498 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3554 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3499 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3555 bool is_dupack = false; 3500 bool is_dupack = false;
3556 u32 prior_fackets;
3557 int prior_packets = tp->packets_out; 3501 int prior_packets = tp->packets_out;
3558 u32 delivered = tp->delivered; 3502 u32 delivered = tp->delivered;
3559 u32 lost = tp->lost; 3503 u32 lost = tp->lost;
3560 int acked = 0; /* Number of packets newly acked */
3561 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3504 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
3505 u32 prior_fack;
3562 3506
3563 sack_state.first_sackt = 0; 3507 sack_state.first_sackt = 0;
3564 sack_state.rate = &rs; 3508 sack_state.rate = &rs;
3565 3509
3566 /* We very likely will need to access write queue head. */ 3510 /* We very likely will need to access rtx queue. */
3567 prefetchw(sk->sk_write_queue.next); 3511 prefetch(sk->tcp_rtx_queue.rb_node);
3568 3512
3569 /* If the ack is older than previous acks 3513 /* If the ack is older than previous acks
3570 * then we can probably ignore it. 3514 * then we can probably ignore it.
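
The prefetch target changes because ACK processing now starts from the root of the per-socket rtx rbtree instead of the head of sk_write_queue: after this series, sk_write_queue is meant to hold only not-yet-sent segments, while sent-but-unacked segments live in sk->tcp_rtx_queue keyed by sequence number. The helpers the later hunks lean on, tcp_rtx_queue_head() at the Fast Open retransmit site and tcp_write_queue_empty() in tcp_ack_update_window(), are assumed to be defined along these lines in include/net/tcp.h (a sketch, not quoted from this diff; tcp_write_queue_empty() is presumably just skb_queue_empty() on sk_write_queue):

	static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
	{
		return skb_rb_first(&sk->tcp_rtx_queue);
	}

	static inline bool tcp_rtx_queue_empty(const struct sock *sk)
	{
		return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
	}
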
@@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3590 icsk->icsk_retransmits = 0; 3534 icsk->icsk_retransmits = 0;
3591 } 3535 }
3592 3536
3593 prior_fackets = tp->fackets_out; 3537 prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
3594 rs.prior_in_flight = tcp_packets_in_flight(tp); 3538 rs.prior_in_flight = tcp_packets_in_flight(tp);
3595 3539
3596 /* ts_recent update must be made after we are sure that the packet 3540 /* ts_recent update must be made after we are sure that the packet
@@ -3646,8 +3590,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3646 goto no_queue; 3590 goto no_queue;
3647 3591
3648 /* See if we can take anything off of the retransmit queue. */ 3592 /* See if we can take anything off of the retransmit queue. */
3649 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, 3593 flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3650 &sack_state); 3594
3595 tcp_rack_update_reo_wnd(sk, &rs);
3651 3596
3652 if (tp->tlp_high_seq) 3597 if (tp->tlp_high_seq)
3653 tcp_process_tlp_ack(sk, ack, flag); 3598 tcp_process_tlp_ack(sk, ack, flag);
@@ -3657,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3657 3602
3658 if (tcp_ack_is_dubious(sk, flag)) { 3603 if (tcp_ack_is_dubious(sk, flag)) {
3659 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3604 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3660 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3605 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3606 &rexmit);
3661 } 3607 }
3662 3608
3663 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3609 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
@@ -3673,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3673no_queue: 3619no_queue:
3674 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3620 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3675 if (flag & FLAG_DSACKING_ACK) 3621 if (flag & FLAG_DSACKING_ACK)
3676 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3622 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3623 &rexmit);
3677 /* If this ack opens up a zero window, clear backoff. It was 3624 /* If this ack opens up a zero window, clear backoff. It was
3678 * being used to time the probes, and is probably far higher than 3625 * being used to time the probes, and is probably far higher than
3679 * it needs to be for normal retransmission. 3626 * it needs to be for normal retransmission.
3680 */ 3627 */
3681 if (tcp_send_head(sk)) 3628 tcp_ack_probe(sk);
3682 tcp_ack_probe(sk);
3683 3629
3684 if (tp->tlp_high_seq) 3630 if (tp->tlp_high_seq)
3685 tcp_process_tlp_ack(sk, ack, flag); 3631 tcp_process_tlp_ack(sk, ack, flag);
@@ -3696,7 +3642,8 @@ old_ack:
3696 if (TCP_SKB_CB(skb)->sacked) { 3642 if (TCP_SKB_CB(skb)->sacked) {
3697 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3643 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3698 &sack_state); 3644 &sack_state);
3699 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); 3645 tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3646 &rexmit);
3700 tcp_xmit_recovery(sk, rexmit); 3647 tcp_xmit_recovery(sk, rexmit);
3701 } 3648 }
3702 3649
@@ -3721,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3721 foc->exp = exp_opt; 3668 foc->exp = exp_opt;
3722} 3669}
3723 3670
3671static void smc_parse_options(const struct tcphdr *th,
3672 struct tcp_options_received *opt_rx,
3673 const unsigned char *ptr,
3674 int opsize)
3675{
3676#if IS_ENABLED(CONFIG_SMC)
3677 if (static_branch_unlikely(&tcp_have_smc)) {
3678 if (th->syn && !(opsize & 1) &&
3679 opsize >= TCPOLEN_EXP_SMC_BASE &&
3680 get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3681 opt_rx->smc_ok = 1;
3682 }
3683#endif
3684}
3685
3724/* Look for tcp options. Normally only called on SYN and SYNACK packets. 3686/* Look for tcp options. Normally only called on SYN and SYNACK packets.
3725 * But, this can also be called on packets in the established flow when 3687 * But, this can also be called on packets in the established flow when
3726 * the fast version below fails. 3688 * the fast version below fails.
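
smc_parse_options() accepts the SMC capability only on a SYN, only when the experimental option's length is even and at least TCPOLEN_EXP_SMC_BASE, and only when the 4-byte experiment ID read with get_unaligned_be32() matches TCPOPT_SMC_MAGIC; the tcp_have_smc static key keeps the whole check free when SMC is not in use. For context, the sending side elsewhere in this series presumably emits the option roughly as below (a reconstruction, not quoted from this diff; the smc_requested flag is a stand-in for the real per-connection option bit):

	static void smc_options_write(__be32 *ptr, bool smc_requested)
	{
	#if IS_ENABLED(CONFIG_SMC)
		if (static_branch_unlikely(&tcp_have_smc) && smc_requested) {
			/* Two NOPs keep the option list 32-bit aligned, then
			 * kind = TCPOPT_EXP, len = TCPOLEN_EXP_SMC_BASE,
			 * followed by the 4-byte SMC experiment ID.
			 */
			*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				       (TCPOPT_EXP << 8)  | TCPOLEN_EXP_SMC_BASE);
			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
		}
	#endif
	}
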
@@ -3828,6 +3790,9 @@ void tcp_parse_options(const struct net *net,
3828 tcp_parse_fastopen_option(opsize - 3790 tcp_parse_fastopen_option(opsize -
3829 TCPOLEN_EXP_FASTOPEN_BASE, 3791 TCPOLEN_EXP_FASTOPEN_BASE,
3830 ptr + 2, th->syn, foc, true); 3792 ptr + 2, th->syn, foc, true);
3793 else
3794 smc_parse_options(th, opt_rx, ptr,
3795 opsize);
3831 break; 3796 break;
3832 3797
3833 } 3798 }
@@ -3995,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
3995/* When we get a reset we do this. */ 3960/* When we get a reset we do this. */
3996void tcp_reset(struct sock *sk) 3961void tcp_reset(struct sock *sk)
3997{ 3962{
3963 trace_tcp_receive_reset(sk);
3964
3998 /* We want the right error as BSD sees it (and indeed as we do). */ 3965 /* We want the right error as BSD sees it (and indeed as we do). */
3999 switch (sk->sk_state) { 3966 switch (sk->sk_state) {
4000 case TCP_SYN_SENT: 3967 case TCP_SYN_SENT:
@@ -4117,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4117{ 4084{
4118 struct tcp_sock *tp = tcp_sk(sk); 4085 struct tcp_sock *tp = tcp_sk(sk);
4119 4086
4120 if (tcp_is_sack(tp) && sysctl_tcp_dsack) { 4087 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4121 int mib_idx; 4088 int mib_idx;
4122 4089
4123 if (before(seq, tp->rcv_nxt)) 4090 if (before(seq, tp->rcv_nxt))
@@ -4152,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4152 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4119 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4153 tcp_enter_quickack_mode(sk); 4120 tcp_enter_quickack_mode(sk);
4154 4121
4155 if (tcp_is_sack(tp) && sysctl_tcp_dsack) { 4122 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4156 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 4123 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4157 4124
4158 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) 4125 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
@@ -4268,11 +4235,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4268 tp->rx_opt.num_sacks = num_sacks; 4235 tp->rx_opt.num_sacks = num_sacks;
4269} 4236}
4270 4237
4271enum tcp_queue {
4272 OOO_QUEUE,
4273 RCV_QUEUE,
4274};
4275
4276/** 4238/**
4277 * tcp_try_coalesce - try to merge skb to prior one 4239 * tcp_try_coalesce - try to merge skb to prior one
4278 * @sk: socket 4240 * @sk: socket
@@ -4288,7 +4250,6 @@ enum tcp_queue {
4288 * Returns true if caller should free @from instead of queueing it 4250 * Returns true if caller should free @from instead of queueing it
4289 */ 4251 */
4290static bool tcp_try_coalesce(struct sock *sk, 4252static bool tcp_try_coalesce(struct sock *sk,
4291 enum tcp_queue dest,
4292 struct sk_buff *to, 4253 struct sk_buff *to,
4293 struct sk_buff *from, 4254 struct sk_buff *from,
4294 bool *fragstolen) 4255 bool *fragstolen)
@@ -4313,10 +4274,7 @@ static bool tcp_try_coalesce(struct sock *sk,
4313 4274
4314 if (TCP_SKB_CB(from)->has_rxtstamp) { 4275 if (TCP_SKB_CB(from)->has_rxtstamp) {
4315 TCP_SKB_CB(to)->has_rxtstamp = true; 4276 TCP_SKB_CB(to)->has_rxtstamp = true;
4316 if (dest == OOO_QUEUE) 4277 to->tstamp = from->tstamp;
4317 TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp;
4318 else
4319 to->tstamp = from->tstamp;
4320 } 4278 }
4321 4279
4322 return true; 4280 return true;
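
Dropping the OOO_QUEUE/RCV_QUEUE distinction here, and the swtstamp stash/restore in the next hunks, appears to be possible because sk_buff was reorganized elsewhere in this series so that the rbnode used by the out-of-order queue overlays the next/prev/dev pointers rather than the timestamp; skb->tstamp therefore survives while the skb sits in the rbtree and can be copied directly. An approximate sketch of the relevant union after that reorg (field layout abridged and assumed from the companion skbuff change, not from this diff):

	union {
		struct {
			/* These two members must be first. */
			struct sk_buff		*next;
			struct sk_buff		*prev;
			union {
				struct net_device	*dev;
				unsigned long		dev_scratch;
			};
		};
		struct rb_node	rbnode;	/* used by netem and the TCP stack */
	};
	/* ... */
	ktime_t			tstamp;	/* no longer clobbered by rbnode */
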
@@ -4341,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk)
4341 4299
4342 p = rb_first(&tp->out_of_order_queue); 4300 p = rb_first(&tp->out_of_order_queue);
4343 while (p) { 4301 while (p) {
4344 skb = rb_entry(p, struct sk_buff, rbnode); 4302 skb = rb_to_skb(p);
4345 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4303 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4346 break; 4304 break;
4347 4305
@@ -4353,9 +4311,6 @@ static void tcp_ofo_queue(struct sock *sk)
4353 } 4311 }
4354 p = rb_next(p); 4312 p = rb_next(p);
4355 rb_erase(&skb->rbnode, &tp->out_of_order_queue); 4313 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4356 /* Replace tstamp which was stomped by rbnode */
4357 if (TCP_SKB_CB(skb)->has_rxtstamp)
4358 skb->tstamp = TCP_SKB_CB(skb)->swtstamp;
4359 4314
4360 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { 4315 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4361 SOCK_DEBUG(sk, "ofo packet was already received\n"); 4316 SOCK_DEBUG(sk, "ofo packet was already received\n");
@@ -4367,8 +4322,7 @@ static void tcp_ofo_queue(struct sock *sk)
4367 TCP_SKB_CB(skb)->end_seq); 4322 TCP_SKB_CB(skb)->end_seq);
4368 4323
4369 tail = skb_peek_tail(&sk->sk_receive_queue); 4324 tail = skb_peek_tail(&sk->sk_receive_queue);
4370 eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, 4325 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4371 tail, skb, &fragstolen);
4372 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); 4326 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4373 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 4327 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4374 if (!eaten) 4328 if (!eaten)
@@ -4409,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4409static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4363static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4410{ 4364{
4411 struct tcp_sock *tp = tcp_sk(sk); 4365 struct tcp_sock *tp = tcp_sk(sk);
4412 struct rb_node **p, *q, *parent; 4366 struct rb_node **p, *parent;
4413 struct sk_buff *skb1; 4367 struct sk_buff *skb1;
4414 u32 seq, end_seq; 4368 u32 seq, end_seq;
4415 bool fragstolen; 4369 bool fragstolen;
@@ -4422,10 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4422 return; 4376 return;
4423 } 4377 }
4424 4378
4425 /* Stash tstamp to avoid being stomped on by rbnode */
4426 if (TCP_SKB_CB(skb)->has_rxtstamp)
4427 TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
4428
4429 /* Disable header prediction. */ 4379 /* Disable header prediction. */
4430 tp->pred_flags = 0; 4380 tp->pred_flags = 0;
4431 inet_csk_schedule_ack(sk); 4381 inet_csk_schedule_ack(sk);
@@ -4453,7 +4403,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4453 /* In the typical case, we are adding an skb to the end of the list. 4403 /* In the typical case, we are adding an skb to the end of the list.
4454 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 4404 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4455 */ 4405 */
4456 if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, 4406 if (tcp_try_coalesce(sk, tp->ooo_last_skb,
4457 skb, &fragstolen)) { 4407 skb, &fragstolen)) {
4458coalesce_done: 4408coalesce_done:
4459 tcp_grow_window(sk, skb); 4409 tcp_grow_window(sk, skb);
@@ -4472,7 +4422,7 @@ coalesce_done:
4472 parent = NULL; 4422 parent = NULL;
4473 while (*p) { 4423 while (*p) {
4474 parent = *p; 4424 parent = *p;
4475 skb1 = rb_entry(parent, struct sk_buff, rbnode); 4425 skb1 = rb_to_skb(parent);
4476 if (before(seq, TCP_SKB_CB(skb1)->seq)) { 4426 if (before(seq, TCP_SKB_CB(skb1)->seq)) {
4477 p = &parent->rb_left; 4427 p = &parent->rb_left;
4478 continue; 4428 continue;
@@ -4504,7 +4454,7 @@ coalesce_done:
4504 __kfree_skb(skb1); 4454 __kfree_skb(skb1);
4505 goto merge_right; 4455 goto merge_right;
4506 } 4456 }
4507 } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, 4457 } else if (tcp_try_coalesce(sk, skb1,
4508 skb, &fragstolen)) { 4458 skb, &fragstolen)) {
4509 goto coalesce_done; 4459 goto coalesce_done;
4510 } 4460 }
@@ -4517,9 +4467,7 @@ insert:
4517 4467
4518merge_right: 4468merge_right:
4519 /* Remove other segments covered by skb. */ 4469 /* Remove other segments covered by skb. */
4520 while ((q = rb_next(&skb->rbnode)) != NULL) { 4470 while ((skb1 = skb_rb_next(skb)) != NULL) {
4521 skb1 = rb_entry(q, struct sk_buff, rbnode);
4522
4523 if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) 4471 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4524 break; 4472 break;
4525 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4473 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4534,7 +4482,7 @@ merge_right:
4534 tcp_drop(sk, skb1); 4482 tcp_drop(sk, skb1);
4535 } 4483 }
4536 /* If there is no skb after us, we are the last_skb ! */ 4484 /* If there is no skb after us, we are the last_skb ! */
4537 if (!q) 4485 if (!skb1)
4538 tp->ooo_last_skb = skb; 4486 tp->ooo_last_skb = skb;
4539 4487
4540add_sack: 4488add_sack:
@@ -4556,7 +4504,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4556 4504
4557 __skb_pull(skb, hdrlen); 4505 __skb_pull(skb, hdrlen);
4558 eaten = (tail && 4506 eaten = (tail &&
4559 tcp_try_coalesce(sk, RCV_QUEUE, tail, 4507 tcp_try_coalesce(sk, tail,
4560 skb, fragstolen)) ? 1 : 0; 4508 skb, fragstolen)) ? 1 : 0;
4561 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); 4509 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4562 if (!eaten) { 4510 if (!eaten) {
@@ -4720,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
4720 if (list) 4668 if (list)
4721 return !skb_queue_is_last(list, skb) ? skb->next : NULL; 4669 return !skb_queue_is_last(list, skb) ? skb->next : NULL;
4722 4670
4723 return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); 4671 return skb_rb_next(skb);
4724} 4672}
4725 4673
4726static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 4674static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4741,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4741} 4689}
4742 4690
4743/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ 4691/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
4744static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) 4692void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4745{ 4693{
4746 struct rb_node **p = &root->rb_node; 4694 struct rb_node **p = &root->rb_node;
4747 struct rb_node *parent = NULL; 4695 struct rb_node *parent = NULL;
@@ -4749,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4749 4697
4750 while (*p) { 4698 while (*p) {
4751 parent = *p; 4699 parent = *p;
4752 skb1 = rb_entry(parent, struct sk_buff, rbnode); 4700 skb1 = rb_to_skb(parent);
4753 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) 4701 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
4754 p = &parent->rb_left; 4702 p = &parent->rb_left;
4755 else 4703 else
@@ -4796,7 +4744,7 @@ restart:
4796 * overlaps to the next one. 4744 * overlaps to the next one.
4797 */ 4745 */
4798 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && 4746 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4799 (tcp_win_from_space(skb->truesize) > skb->len || 4747 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
4800 before(TCP_SKB_CB(skb)->seq, start))) { 4748 before(TCP_SKB_CB(skb)->seq, start))) {
4801 end_of_skbs = false; 4749 end_of_skbs = false;
4802 break; 4750 break;
@@ -4868,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4868{ 4816{
4869 struct tcp_sock *tp = tcp_sk(sk); 4817 struct tcp_sock *tp = tcp_sk(sk);
4870 struct sk_buff *skb, *head; 4818 struct sk_buff *skb, *head;
4871 struct rb_node *p;
4872 u32 start, end; 4819 u32 start, end;
4873 4820
4874 p = rb_first(&tp->out_of_order_queue); 4821 skb = skb_rb_first(&tp->out_of_order_queue);
4875 skb = rb_entry_safe(p, struct sk_buff, rbnode);
4876new_range: 4822new_range:
4877 if (!skb) { 4823 if (!skb) {
4878 p = rb_last(&tp->out_of_order_queue); 4824 tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
4879 /* Note: This is possible p is NULL here. We do not
4880 * use rb_entry_safe(), as ooo_last_skb is valid only
4881 * if rbtree is not empty.
4882 */
4883 tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
4884 return; 4825 return;
4885 } 4826 }
4886 start = TCP_SKB_CB(skb)->seq; 4827 start = TCP_SKB_CB(skb)->seq;
4887 end = TCP_SKB_CB(skb)->end_seq; 4828 end = TCP_SKB_CB(skb)->end_seq;
4888 4829
4889 for (head = skb;;) { 4830 for (head = skb;;) {
4890 skb = tcp_skb_next(skb, NULL); 4831 skb = skb_rb_next(skb);
4891 4832
4892 /* Range is terminated when we see a gap or when 4833 /* Range is terminated when we see a gap or when
4893 * we are at the queue end. 4834 * we are at the queue end.
@@ -4930,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
4930 do { 4871 do {
4931 prev = rb_prev(node); 4872 prev = rb_prev(node);
4932 rb_erase(node, &tp->out_of_order_queue); 4873 rb_erase(node, &tp->out_of_order_queue);
4933 tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); 4874 tcp_drop(sk, rb_to_skb(node));
4934 sk_mem_reclaim(sk); 4875 sk_mem_reclaim(sk);
4935 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && 4876 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4936 !tcp_under_memory_pressure(sk)) 4877 !tcp_under_memory_pressure(sk))
4937 break; 4878 break;
4938 node = prev; 4879 node = prev;
4939 } while (node); 4880 } while (node);
4940 tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); 4881 tp->ooo_last_skb = rb_to_skb(prev);
4941 4882
4942 /* Reset SACK state. A conforming SACK implementation will 4883 /* Reset SACK state. A conforming SACK implementation will
4943 * do the same at a timeout based retransmit. When a connection 4884 * do the same at a timeout based retransmit. When a connection
@@ -5112,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5112 struct tcp_sock *tp = tcp_sk(sk); 5053 struct tcp_sock *tp = tcp_sk(sk);
5113 u32 ptr = ntohs(th->urg_ptr); 5054 u32 ptr = ntohs(th->urg_ptr);
5114 5055
5115 if (ptr && !sysctl_tcp_stdurg) 5056 if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5116 ptr--; 5057 ptr--;
5117 ptr += ntohl(th->seq); 5058 ptr += ntohl(th->seq);
5118 5059
@@ -5532,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5532 security_inet_conn_established(sk, skb); 5473 security_inet_conn_established(sk, skb);
5533 } 5474 }
5534 5475
5535 /* Make sure socket is routed, for correct metrics. */ 5476 tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5536 icsk->icsk_af_ops->rebuild_header(sk);
5537
5538 tcp_init_metrics(sk);
5539 tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5540 tcp_init_congestion_control(sk);
5541 5477
5542 /* Prevent spurious tcp_cwnd_restart() on first data 5478 /* Prevent spurious tcp_cwnd_restart() on first data
5543 * packet. 5479 * packet.
5544 */ 5480 */
5545 tp->lsndtime = tcp_jiffies32; 5481 tp->lsndtime = tcp_jiffies32;
5546 5482
5547 tcp_init_buffer_space(sk);
5548
5549 if (sock_flag(sk, SOCK_KEEPOPEN)) 5483 if (sock_flag(sk, SOCK_KEEPOPEN))
5550 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); 5484 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5551 5485
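
tcp_finish_connect() here, and the passive path in tcp_rcv_state_process() in a later hunk, now call a single tcp_init_transfer() helper instead of open-coding rebuild_header(), tcp_init_metrics(), the BPF sock-ops callback, congestion-control init and buffer-space sizing. Pieced together from the calls removed in this file, the helper is assumed to look roughly like this (it is added elsewhere in the series, most likely in tcp.c):

	void tcp_init_transfer(struct sock *sk, int bpf_op)
	{
		struct inet_connection_sock *icsk = inet_csk(sk);

		tcp_mtup_init(sk);
		icsk->icsk_af_ops->rebuild_header(sk);
		tcp_init_metrics(sk);
		tcp_call_bpf(sk, bpf_op);
		tcp_init_congestion_control(sk);
		tcp_init_buffer_space(sk);
	}
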
@@ -5559,7 +5493,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5559 struct tcp_fastopen_cookie *cookie) 5493 struct tcp_fastopen_cookie *cookie)
5560{ 5494{
5561 struct tcp_sock *tp = tcp_sk(sk); 5495 struct tcp_sock *tp = tcp_sk(sk);
5562 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; 5496 struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
5563 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; 5497 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5564 bool syn_drop = false; 5498 bool syn_drop = false;
5565 5499
@@ -5594,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5594 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); 5528 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5595 5529
5596 if (data) { /* Retransmit unacked data in SYN */ 5530 if (data) { /* Retransmit unacked data in SYN */
5597 tcp_for_write_queue_from(data, sk) { 5531 skb_rbtree_walk_from(data) {
5598 if (data == tcp_send_head(sk) || 5532 if (__tcp_retransmit_skb(sk, data, 1))
5599 __tcp_retransmit_skb(sk, data, 1))
5600 break; 5533 break;
5601 } 5534 }
5602 tcp_rearm_rto(sk); 5535 tcp_rearm_rto(sk);
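
Retransmitting the unacked SYN data now walks the rtx rbtree from the first queued skb; the old guard against running into tcp_send_head() is no longer needed because the rbtree holds only segments that have already been sent. skb_rbtree_walk_from() is assumed to be the simple iterator added with the other skb/rbtree helpers:

	#define skb_rbtree_walk_from(skb) \
			for (; skb != NULL; skb = skb_rb_next(skb))
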
@@ -5614,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5614 return false; 5547 return false;
5615} 5548}
5616 5549
5550static void smc_check_reset_syn(struct tcp_sock *tp)
5551{
5552#if IS_ENABLED(CONFIG_SMC)
5553 if (static_branch_unlikely(&tcp_have_smc)) {
5554 if (tp->syn_smc && !tp->rx_opt.smc_ok)
5555 tp->syn_smc = 0;
5556 }
5557#endif
5558}
5559
5617static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5560static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5618 const struct tcphdr *th) 5561 const struct tcphdr *th)
5619{ 5562{
@@ -5709,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5709 tp->tcp_header_len = sizeof(struct tcphdr); 5652 tp->tcp_header_len = sizeof(struct tcphdr);
5710 } 5653 }
5711 5654
5712 if (tcp_is_sack(tp) && sysctl_tcp_fack)
5713 tcp_enable_fack(tp);
5714
5715 tcp_mtup_init(sk);
5716 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5655 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5717 tcp_initialize_rcv_mss(sk); 5656 tcp_initialize_rcv_mss(sk);
5718 5657
@@ -5721,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5721 * is initialized. */ 5660 * is initialized. */
5722 tp->copied_seq = tp->rcv_nxt; 5661 tp->copied_seq = tp->rcv_nxt;
5723 5662
5663 smc_check_reset_syn(tp);
5664
5724 smp_mb(); 5665 smp_mb();
5725 5666
5726 tcp_finish_connect(sk, skb); 5667 tcp_finish_connect(sk, skb);
@@ -5938,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5938 if (req) { 5879 if (req) {
5939 inet_csk(sk)->icsk_retransmits = 0; 5880 inet_csk(sk)->icsk_retransmits = 0;
5940 reqsk_fastopen_remove(sk, req, false); 5881 reqsk_fastopen_remove(sk, req, false);
5882 /* Re-arm the timer because data may have been sent out.
5883 * This is similar to the regular data transmission case
5884 * when new data has just been ack'ed.
5885 *
5886 * (TFO) - we could try to be more aggressive and
5887 * retransmitting any data sooner based on when they
5888 * are sent out.
5889 */
5890 tcp_rearm_rto(sk);
5941 } else { 5891 } else {
5942 /* Make sure socket is routed, for correct metrics. */ 5892 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
5943 icsk->icsk_af_ops->rebuild_header(sk);
5944 tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
5945 tcp_init_congestion_control(sk);
5946
5947 tcp_mtup_init(sk);
5948 tp->copied_seq = tp->rcv_nxt; 5893 tp->copied_seq = tp->rcv_nxt;
5949 tcp_init_buffer_space(sk);
5950 } 5894 }
5951 smp_mb(); 5895 smp_mb();
5952 tcp_set_state(sk, TCP_ESTABLISHED); 5896 tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5966,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5966 if (tp->rx_opt.tstamp_ok) 5910 if (tp->rx_opt.tstamp_ok)
5967 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5911 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5968 5912
5969 if (req) {
5970 /* Re-arm the timer because data may have been sent out.
5971 * This is similar to the regular data transmission case
5972 * when new data has just been ack'ed.
5973 *
5974 * (TFO) - we could try to be more aggressive and
5975 * retransmitting any data sooner based on when they
5976 * are sent out.
5977 */
5978 tcp_rearm_rto(sk);
5979 } else
5980 tcp_init_metrics(sk);
5981
5982 if (!inet_csk(sk)->icsk_ca_ops->cong_control) 5913 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
5983 tcp_update_pacing_rate(sk); 5914 tcp_update_pacing_rate(sk);
5984 5915
@@ -6075,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6075 case TCP_LAST_ACK: 6006 case TCP_LAST_ACK:
6076 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 6007 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6077 break; 6008 break;
6009 /* fall through */
6078 case TCP_FIN_WAIT1: 6010 case TCP_FIN_WAIT1:
6079 case TCP_FIN_WAIT2: 6011 case TCP_FIN_WAIT2:
6080 /* RFC 793 says to queue data in these states, 6012 /* RFC 793 says to queue data in these states,
@@ -6183,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req,
6183 ireq->ir_rmt_port = tcp_hdr(skb)->source; 6115 ireq->ir_rmt_port = tcp_hdr(skb)->source;
6184 ireq->ir_num = ntohs(tcp_hdr(skb)->dest); 6116 ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
6185 ireq->ir_mark = inet_request_mark(sk, skb); 6117 ireq->ir_mark = inet_request_mark(sk, skb);
6118#if IS_ENABLED(CONFIG_SMC)
6119 ireq->smc_ok = rx_opt->smc_ok;
6120#endif
6186} 6121}
6187 6122
6188struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, 6123struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
@@ -6358,7 +6293,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6358 tcp_openreq_init_rwin(req, sk, dst); 6293 tcp_openreq_init_rwin(req, sk, dst);
6359 if (!want_cookie) { 6294 if (!want_cookie) {
6360 tcp_reqsk_record_syn(sk, req, skb); 6295 tcp_reqsk_record_syn(sk, req, skb);
6361 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); 6296 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
6362 } 6297 }
6363 if (fastopen_sk) { 6298 if (fastopen_sk) {
6364 af_ops->send_synack(fastopen_sk, dst, &fl, req, 6299 af_ops->send_synack(fastopen_sk, dst, &fl, req,