author     Pablo Neira Ayuso <pablo@netfilter.org>    2016-09-25 17:23:57 -0400
committer  Pablo Neira Ayuso <pablo@netfilter.org>    2016-09-25 17:34:19 -0400
commit     f20fbc0717f9f007c94b2641134b19228d0ce9ed (patch)
tree       1404248ebbec552a3fb7928b75322b65d74de1bd /net/ipv4/tcp_input.c
parent     8cb2a7d5667ab9a9c2fdd356357b85b63b320901 (diff)
parent     fe0acb5fcb7fe8cb3d68bbdb8459865c972d8f83 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Conflicts:
net/netfilter/core.c
net/netfilter/nf_tables_netdev.c
Resolve two conflicts before pull request for David's net-next tree:
1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant
ip_hdr assignment") from the net tree and commit ddc8b6027ad0
("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()").
2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and
Aaron Conole's patches to replace list_head with single linked list.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--   net/ipv4/tcp_input.c   495
1 file changed, 276 insertions, 219 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f3a9f3c2c8d8..8c6ad2d319d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
 static void tcp_sndbuf_expand(struct sock *sk)
 {
	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	int sndmem, per_mss;
	u32 nr_segs;

@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to POLLOUT)
	 */
-	sndmem = 2 * nr_segs * per_mss;
+	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+	sndmem *= nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }

+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ *    and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ *    and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+	if (!(sacked & TCPCB_LOST) ||
+	    ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+		tp->lost += tcp_skb_pcount(skb);
+}
+
 static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 {
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
+		tcp_sum_lost(tp, skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
 }
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
	tcp_verify_retransmit_hint(tp, skb);

+	tcp_sum_lost(tp, skb);
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
	 */
	struct skb_mstamp first_sackt;
	struct skb_mstamp last_sackt;
+	struct rate_sample *rate;
	int	flag;
 };

@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
			start_seq, end_seq, dup_sack, pcount,
			&skb->skb_mstamp);
+	tcp_rate_skb_delivered(sk, skb, state->rate);

	if (skb == tp->lost_skb_hint)
		tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
		tcp_advance_highest_sack(sk, skb);

	tcp_skb_collapse_tstamp(prev, skb);
+	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+		TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
	tcp_unlink_write_queue(skb, sk);
	sk_wmem_free_skb(sk, skb);

@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
						dup_sack,
						tcp_skb_pcount(skb),
						&skb->skb_mstamp);
+			tcp_rate_skb_delivered(sk, skb, state->rate);

			if (!before(TCP_SKB_CB(skb)->seq,
				    tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,

	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
					 num_sacks, prior_snd_una);
-	if (found_dup_sack)
+	if (found_dup_sack) {
		state->flag |= FLAG_DSACKING_ACK;
+		tp->delivered++; /* A spurious retransmission is delivered */
+	}

	/* Eliminate too old ACKs, but take into
	 * account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
	struct sk_buff *skb;
	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
	bool is_reneg;			/* is receiver reneging on SACKs? */
+	bool mark_lost;

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
		if (skb == tcp_send_head(sk))
			break;

+		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+			     is_reneg);
+		if (mark_lost)
+			tcp_sum_lost(tp, skb);
		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+		if (mark_lost) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
			tp->lost_out += tcp_skb_pcount(skb);
@@ -2503,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 {
	struct tcp_sock *tp = tcp_sk(sk);

+	if (inet_csk(sk)->icsk_ca_ops->cong_control)
+		return;
+
	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2915,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
		*rexmit = REXMIT_LOST;
 }

-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
 {
-	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
-	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
-	struct rtt_meas rttm = {
-		.rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
-		.ts = now,
-	};
-	u32 elapsed;
-
-	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
-	if (unlikely(rttm.rtt <= m[0].rtt))
-		m[0] = m[1] = m[2] = rttm;
-	else if (rttm.rtt <= m[1].rtt)
-		m[1] = m[2] = rttm;
-	else if (rttm.rtt <= m[2].rtt)
-		m[2] = rttm;
-
-	elapsed = now - m[0].ts;
-	if (unlikely(elapsed > wlen)) {
-		/* Passed entire window without a new min so make 2nd choice
-		 * the new min & 3rd choice the new 2nd. So forth and so on.
-		 */
-		m[0] = m[1];
-		m[1] = m[2];
-		m[2] = rttm;
-		if (now - m[0].ts > wlen) {
-			m[0] = m[1];
-			m[1] = rttm;
-			if (now - m[0].ts > wlen)
-				m[0] = rttm;
-		}
-	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
-		/* Passed a quarter of the window without a new min so
-		 * take 2nd choice from the 2nd quarter of the window.
-		 */
-		m[2] = m[1] = rttm;
-	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
-		/* Passed half the window without a new min so take the 3rd
-		 * choice from the last half of the window.
-		 */
-		m[2] = rttm;
-	}
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+	minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+			   rtt_us ? : jiffies_to_usecs(1));
 }

 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@ -3102,10 +3084,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack)
+			       struct tcp_sacktag_state *sack,
+			       struct skb_mstamp *now)
 {
	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct skb_mstamp first_ackt, last_ackt, now;
+	struct skb_mstamp first_ackt, last_ackt;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_sacked = tp->sacked_out;
	u32 reord = tp->packets_out;
@@ -3137,7 +3120,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
			acked_pcount = tcp_tso_acked(sk, skb);
			if (!acked_pcount)
				break;
-
			fully_acked = false;
		} else {
			/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,

		tp->packets_out -= acked_pcount;
		pkts_acked += acked_pcount;
+		tcp_rate_skb_delivered(sk, skb, sack->rate);

		/* Initial outgoing SYN's get put onto the write_queue
		 * just like anything else we transmit.  It is not
@@ -3205,16 +3188,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
		flag |= FLAG_SACK_RENEGING;

-	skb_mstamp_get(&now);
	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+		seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
	}
	if (sack->first_sackt.v64) {
-		sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+		sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
	}
-
+	sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
					ca_rtt_us);

@@ -3242,7 +3224,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
		tp->fackets_out -= min(pkts_acked, tp->fackets_out);

	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+		   sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
		/* Do not re-arm RTO if the sack RTT is measured from data sent
		 * after when the head was last (re)transmitted. Otherwise the
		 * timeout may continue to extend in loss recovery.
@@ -3333,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 * information. All transmission or retransmission are delayed afterwards.
 */
 static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
-			     int flag)
+			     int flag, const struct rate_sample *rs)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->cong_control) {
+		icsk->icsk_ca_ops->cong_control(sk, rs);
+		return;
+	}
+
	if (tcp_in_cwnd_reduction(sk)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, acked_sacked, flag);
@@ -3579,17 +3568,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_sacktag_state sack_state;
+	struct rate_sample rs = { .prior_delivered = 0 };
	u32 prior_snd_una = tp->snd_una;
	u32 ack_seq = TCP_SKB_CB(skb)->seq;
	u32 ack = TCP_SKB_CB(skb)->ack_seq;
	bool is_dupack = false;
	u32 prior_fackets;
	int prior_packets = tp->packets_out;
-	u32 prior_delivered = tp->delivered;
+	u32 delivered = tp->delivered;
+	u32 lost = tp->lost;
	int acked = 0; /* Number of packets newly acked */
	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+	struct skb_mstamp now;

	sack_state.first_sackt.v64 = 0;
+	sack_state.rate = &rs;

	/* We very likely will need to access write queue head. */
	prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3605,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
	if (after(ack, tp->snd_nxt))
		goto invalid_ack;

+	skb_mstamp_get(&now);
+
	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);
@@ -3622,6 +3617,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
	}

	prior_fackets = tp->fackets_out;
+	rs.prior_in_flight = tcp_packets_in_flight(tp);

	/* ts_recent update must be made after we are sure that the packet
	 * is in window.
@@ -3677,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)

	/* See if we can take anything off of the retransmit queue. */
	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state);
+				    &sack_state, &now);

	if (tcp_ack_is_dubious(sk, flag)) {
		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3690,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)

	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
		tcp_schedule_loss_probe(sk);
-	tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+	lost = tp->lost - lost;			/* freshly marked lost */
+	tcp_rate_gen(sk, delivered, lost, &now, &rs);
+	tcp_cong_control(sk, ack, delivered, flag, &rs);
	tcp_xmit_recovery(sk, rexmit);
	return 1;

@@ -4108,7 +4107,7 @@ void tcp_fin(struct sock *sk)
	/* It _is_ possible, that we have something out-of-order _after_ FIN.
	 * Probably, we should reset in this case. For now drop them.
	 */
-	__skb_queue_purge(&tp->out_of_order_queue);
+	skb_rbtree_purge(&tp->out_of_order_queue);
	if (tcp_is_sack(tp))
		tcp_sack_reset(&tp->rx_opt);
	sk_mem_reclaim(sk);
@@ -4268,7 +4267,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
	int this_sack;

	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-	if (skb_queue_empty(&tp->out_of_order_queue)) {
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tp->rx_opt.num_sacks = 0;
		return;
	}
@@ -4344,10 +4343,13 @@ static void tcp_ofo_queue(struct sock *sk)
 {
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 dsack_high = tp->rcv_nxt;
+	bool fin, fragstolen, eaten;
	struct sk_buff *skb, *tail;
-	bool fragstolen, eaten;
+	struct rb_node *p;

-	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+	p = rb_first(&tp->out_of_order_queue);
+	while (p) {
+		skb = rb_entry(p, struct sk_buff, rbnode);
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;

@@ -4357,9 +4359,10 @@ static void tcp_ofo_queue(struct sock *sk)
			dsack_high = TCP_SKB_CB(skb)->end_seq;
			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
		}
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &tp->out_of_order_queue);

-		__skb_unlink(skb, &tp->out_of_order_queue);
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
			SOCK_DEBUG(sk, "ofo packet was already received\n");
			tcp_drop(sk, skb);
			continue;
@@ -4371,12 +4374,19 @@ static void tcp_ofo_queue(struct sock *sk)
		tail = skb_peek_tail(&sk->sk_receive_queue);
		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (!eaten)
			__skb_queue_tail(&sk->sk_receive_queue, skb);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-			tcp_fin(sk);
-		if (eaten)
+		else
			kfree_skb_partial(skb, fragstolen);
+
+		if (unlikely(fin)) {
+			tcp_fin(sk);
+			/* tcp_fin() purges tp->out_of_order_queue,
+			 * so we must end this loop right now.
+			 */
+			break;
+		}
	}
 }

@@ -4403,8 +4413,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node **p, *q, *parent;
	struct sk_buff *skb1;
	u32 seq, end_seq;
+	bool fragstolen;

	tcp_ecn_check_ce(tp, skb);

@@ -4419,88 +4431,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
	inet_csk_schedule_ack(sk);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+		   tp->rcv_nxt, seq, end_seq);

-	skb1 = skb_peek_tail(&tp->out_of_order_queue);
-	if (!skb1) {
+	p = &tp->out_of_order_queue.rb_node;
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		/* Initial out of order segment, build 1 SACK. */
		if (tcp_is_sack(tp)) {
			tp->rx_opt.num_sacks = 1;
-			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-			tp->selective_acks[0].end_seq =
-						TCP_SKB_CB(skb)->end_seq;
+			tp->selective_acks[0].start_seq = seq;
+			tp->selective_acks[0].end_seq = end_seq;
		}
-		__skb_queue_head(&tp->out_of_order_queue, skb);
+		rb_link_node(&skb->rbnode, NULL, p);
+		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+		tp->ooo_last_skb = skb;
		goto end;
	}

-	seq = TCP_SKB_CB(skb)->seq;
-	end_seq = TCP_SKB_CB(skb)->end_seq;
-
-	if (seq == TCP_SKB_CB(skb1)->end_seq) {
-		bool fragstolen;
-
-		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-		} else {
-			tcp_grow_window(sk, skb);
-			kfree_skb_partial(skb, fragstolen);
-			skb = NULL;
-		}
-
-		if (!tp->rx_opt.num_sacks ||
-		    tp->selective_acks[0].end_seq != seq)
-			goto add_sack;
-
-		/* Common case: data arrive in order after hole. */
-		tp->selective_acks[0].end_seq = end_seq;
-		goto end;
-	}
-
-	/* Find place to insert this segment. */
-	while (1) {
-		if (!after(TCP_SKB_CB(skb1)->seq, seq))
-			break;
-		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-			skb1 = NULL;
-			break;
-		}
-		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
-	}
-
-	/* Do skb overlap to previous one? */
-	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-			/* All the bits are present. Drop. */
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-			tcp_drop(sk, skb);
-			skb = NULL;
-			tcp_dsack_set(sk, seq, end_seq);
-			goto add_sack;
+	/* In the typical case, we are adding an skb to the end of the list.
+	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+	 */
+	if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+		tcp_grow_window(sk, skb);
+		kfree_skb_partial(skb, fragstolen);
+		skb = NULL;
+		goto add_sack;
+	}
+	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+		parent = &tp->ooo_last_skb->rbnode;
+		p = &parent->rb_right;
+		goto insert;
+	}
+
+	/* Find place to insert this segment. Handle overlaps on the way. */
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+			p = &parent->rb_left;
+			continue;
		}
-		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-			/* Partial overlap. */
-			tcp_dsack_set(sk, seq,
-				      TCP_SKB_CB(skb1)->end_seq);
-		} else {
-			if (skb_queue_is_first(&tp->out_of_order_queue,
-					       skb1))
-				skb1 = NULL;
-			else
-				skb1 = skb_queue_prev(
-					&tp->out_of_order_queue,
-					skb1);
+		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* All the bits are present. Drop. */
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb);
+				skb = NULL;
+				tcp_dsack_set(sk, seq, end_seq);
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap. */
+				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				/* skb's seq == skb1's seq and skb covers skb1.
+				 * Replace skb1 with skb.
+				 */
+				rb_replace_node(&skb1->rbnode, &skb->rbnode,
+						&tp->out_of_order_queue);
+				tcp_dsack_extend(sk,
+						 TCP_SKB_CB(skb1)->seq,
+						 TCP_SKB_CB(skb1)->end_seq);
+				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb1);
+				goto merge_right;
+			}
+		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+			goto coalesce_done;
		}
+		p = &parent->rb_right;
	}
-	if (!skb1)
-		__skb_queue_head(&tp->out_of_order_queue, skb);
-	else
-		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+insert:
+	/* Insert segment into RB tree. */
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

-	/* And clean segments covered by new one as whole. */
-	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+merge_right:
+	/* Remove other segments covered by skb. */
+	while ((q = rb_next(&skb->rbnode)) != NULL) {
+		skb1 = rb_entry(q, struct sk_buff, rbnode);

		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
			break;
@@ -4509,12 +4525,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
					 end_seq);
			break;
		}
-		__skb_unlink(skb1, &tp->out_of_order_queue);
+		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				 TCP_SKB_CB(skb1)->end_seq);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
		tcp_drop(sk, skb1);
	}
+	/* If there is no skb after us, we are the last_skb ! */
+	if (!q)
+		tp->ooo_last_skb = skb;

 add_sack:
	if (tcp_is_sack(tp))
@@ -4651,13 +4670,13 @@ queue_and_out:
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			tcp_fin(sk);

-	if (!skb_queue_empty(&tp->out_of_order_queue)) {
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
		tcp_ofo_queue(sk);

		/* RFC2581. 4.2. SHOULD send immediate ACK, when
		 * gap in queue is filled.
		 */
-		if (skb_queue_empty(&tp->out_of_order_queue))
+		if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
			inet_csk(sk)->icsk_ack.pingpong = 0;
	}

@@ -4711,48 +4730,76 @@ drop:
		tcp_data_queue_ofo(sk, skb);
 }

+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	if (list)
+		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
-					struct sk_buff_head *list)
+					struct sk_buff_head *list,
+					struct rb_root *root)
 {
-	struct sk_buff *next = NULL;
+	struct sk_buff *next = tcp_skb_next(skb, list);

-	if (!skb_queue_is_last(list, skb))
-		next = skb_queue_next(list, skb);
+	if (list)
+		__skb_unlink(skb, list);
+	else
+		rb_erase(&skb->rbnode, root);

-	__skb_unlink(skb, list);
	__kfree_skb(skb);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

	return next;
 }

+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sk_buff *skb1;
+
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, root);
+}
+
 /* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
-	     struct sk_buff *head, struct sk_buff *tail,
-	     u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
 {
-	struct sk_buff *skb, *n;
+	struct sk_buff *skb = head, *n;
+	struct sk_buff_head tmp;
	bool end_of_skbs;

	/* First, check that queue is collapsible and find
-	 * the point where collapsing can be useful. */
-	skb = head;
+	 * the point where collapsing can be useful.
+	 */
 restart:
-	end_of_skbs = true;
-	skb_queue_walk_from_safe(list, skb, n) {
-		if (skb == tail)
-			break;
+	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+		n = tcp_skb_next(skb, list);
+
		/* No new bits? It is possible on ofo queue. */
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			skb = tcp_collapse_one(sk, skb, list);
+			skb = tcp_collapse_one(sk, skb, list, root);
			if (!skb)
				break;
			goto restart;
@@ -4770,13 +4817,10 @@ restart:
			break;
		}

-		if (!skb_queue_is_last(list, skb)) {
-			struct sk_buff *next = skb_queue_next(list, skb);
-			if (next != tail &&
-			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
-				end_of_skbs = false;
-				break;
-			}
+		if (n && n != tail &&
+		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+			end_of_skbs = false;
+			break;
		}

		/* Decided to skip this, advance start seq. */
@@ -4786,17 +4830,22 @@ restart:
	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
		return;

+	__skb_queue_head_init(&tmp);
+
	while (before(start, end)) {
		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
		struct sk_buff *nskb;

		nskb = alloc_skb(copy, GFP_ATOMIC);
		if (!nskb)
-			return;
+			break;

		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_queue_before(list, skb, nskb);
+		if (list)
+			__skb_queue_before(list, skb, nskb);
+		else
+			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
		skb_set_owner_r(nskb, sk);

		/* Copy data, releasing collapsed skbs. */
@@ -4814,14 +4863,17 @@ restart:
			start += size;
		}
		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			skb = tcp_collapse_one(sk, skb, list);
+			skb = tcp_collapse_one(sk, skb, list, root);
			if (!skb ||
			    skb == tail ||
			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
-				return;
+				goto end;
		}
	}
+end:
+	skb_queue_walk_safe(&tmp, skb, n)
+		tcp_rbtree_insert(root, skb);
 }

 /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4830,43 +4882,43 @@ restart:
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
-	struct sk_buff *head;
+	struct sk_buff *skb, *head;
+	struct rb_node *p;
	u32 start, end;

-	if (!skb)
+	p = rb_first(&tp->out_of_order_queue);
+	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+	if (!skb) {
+		p = rb_last(&tp->out_of_order_queue);
+		/* Note: This is possible p is NULL here. We do not
+		 * use rb_entry_safe(), as ooo_last_skb is valid only
+		 * if rbtree is not empty.
+		 */
+		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
		return;
-
+	}
	start = TCP_SKB_CB(skb)->seq;
	end = TCP_SKB_CB(skb)->end_seq;
-	head = skb;
-
-	for (;;) {
-		struct sk_buff *next = NULL;

-		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
-			next = skb_queue_next(&tp->out_of_order_queue, skb);
-		skb = next;
+	for (head = skb;;) {
+		skb = tcp_skb_next(skb, NULL);

-		/* Segment is terminated when we see gap or when
-		 * we are at the end of all the queue. */
+		/* Range is terminated when we see a gap or when
+		 * we are at the queue end.
+		 */
		if (!skb ||
		    after(TCP_SKB_CB(skb)->seq, end) ||
		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, &tp->out_of_order_queue,
+			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
				     head, skb, start, end);
-			head = skb;
-			if (!skb)
-				break;
-			/* Start new segment */
+			goto new_range;
+		}
+
+		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
			start = TCP_SKB_CB(skb)->seq;
+		if (after(TCP_SKB_CB(skb)->end_seq, end))
			end = TCP_SKB_CB(skb)->end_seq;
-		} else {
-			if (before(TCP_SKB_CB(skb)->seq, start))
-				start = TCP_SKB_CB(skb)->seq;
-			if (after(TCP_SKB_CB(skb)->end_seq, end))
-				end = TCP_SKB_CB(skb)->end_seq;
-		}
	}
 }

@@ -4883,20 +4935,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct rb_node *node, *prev;

-	if (skb_queue_empty(&tp->out_of_order_queue))
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
		return false;

	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
-	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
-		tcp_drop(sk, skb);
+	node = &tp->ooo_last_skb->rbnode;
+	do {
+		prev = rb_prev(node);
+		rb_erase(node, &tp->out_of_order_queue);
+		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
		sk_mem_reclaim(sk);
		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
		    !tcp_under_memory_pressure(sk))
			break;
-	}
+		node = prev;
+	} while (node);
+	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);

	/* Reset SACK state. A conforming SACK implementation will
	 * do the same at a timeout based retransmit. When a connection
@@ -4930,7 +4986,7 @@ static int tcp_prune_queue(struct sock *sk)

	tcp_collapse_ofo_queue(sk);
	if (!skb_queue_empty(&sk->sk_receive_queue))
-		tcp_collapse(sk, &sk->sk_receive_queue,
+		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
			     skb_peek(&sk->sk_receive_queue),
			     NULL,
			     tp->copied_seq, tp->rcv_nxt);
@@ -5035,7 +5091,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
	    /* We ACK each frame or... */
	    tcp_in_quickack_mode(sk) ||
	    /* We have out of order data. */
-	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+	    (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
		/* Then ack it now */
		tcp_send_ack(sk);
	} else {
@@ -5894,7 +5950,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
		 * so release it.
		 */
		if (req) {
-			tp->total_retrans = req->num_retrans;
+			inet_csk(sk)->icsk_retransmits = 0;
			reqsk_fastopen_remove(sk, req, false);
		} else {
			/* Make sure socket is routed, for correct metrics. */
@@ -5936,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
		} else
			tcp_init_metrics(sk);

-		tcp_update_pacing_rate(sk);
+		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+			tcp_update_pacing_rate(sk);

		/* Prevent spurious tcp_cwnd_restart() on first data packet */
		tp->lsndtime = tcp_time_stamp;
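One of the changes folded into the hunks above replaces the open-coded windowed min filter in tcp_update_rtt_min() (the "Kathleen Nichols" logic whose explanatory comment is removed) with a call to a shared minmax helper. For readers who want to see that filter in isolation, the following is a minimal, user-space restatement of the same three-candidate scheme as a sketch. The names (struct win_min, win_min_update) and the demo values are hypothetical and are not kernel API; only the update rules are taken from the removed code.

/* Sketch: track min(value) over the last 'wlen' time units using three
 * timestamped candidates (best, 2nd best, 3rd best), O(1) space and time
 * per sample. Compile with: cc -o winmin winmin.c
 */
#include <stdint.h>
#include <stdio.h>

struct sample {
	uint32_t ts;	/* time the measurement was taken */
	uint32_t val;	/* measured value, e.g. an RTT in usec */
};

struct win_min {
	struct sample m[3];	/* best, 2nd best, 3rd best candidates */
};

static void win_min_reset(struct win_min *w, uint32_t now, uint32_t val)
{
	struct sample s = { .ts = now, .val = val };

	w->m[0] = w->m[1] = w->m[2] = s;
}

static uint32_t win_min_update(struct win_min *w, uint32_t wlen,
			       uint32_t now, uint32_t val)
{
	struct sample s = { .ts = now, .val = val };
	struct sample *m = w->m;
	uint32_t elapsed;

	/* Does the new measurement update the 1st, 2nd or 3rd choice? */
	if (s.val <= m[0].val)
		m[0] = m[1] = m[2] = s;	/* new min forgets everything else */
	else if (s.val <= m[1].val)
		m[1] = m[2] = s;
	else if (s.val <= m[2].val)
		m[2] = s;

	elapsed = now - m[0].ts;
	if (elapsed > wlen) {
		/* Best candidate aged out of the window: promote the rest. */
		m[0] = m[1];
		m[1] = m[2];
		m[2] = s;
		if (now - m[0].ts > wlen) {
			m[0] = m[1];
			m[1] = s;
			if (now - m[0].ts > wlen)
				m[0] = s;
		}
	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
		/* No 2nd choice yet after a quarter of the window. */
		m[2] = m[1] = s;
	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
		/* No 3rd choice yet after half of the window. */
		m[2] = s;
	}

	return m[0].val;	/* current windowed minimum */
}

int main(void)
{
	uint32_t rtts[] = { 400, 350, 500, 380, 600, 900, 880 };
	struct win_min w;
	uint32_t t;

	win_min_reset(&w, 0, rtts[0]);
	for (t = 1; t < 7; t++)
		printf("t=%u rtt=%u windowed_min=%u\n", t, rtts[t],
		       win_min_update(&w, 4 /* window */, t, rtts[t]));
	return 0;
}

In this toy run the reported minimum stays at 350 until that sample ages past the 4-unit window, at which point the 2nd-best candidate (380) is promoted -- the same behaviour the kernel now gets from the shared helper called in the rewritten tcp_update_rtt_min().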