path: root/net/ipv4/tcp_input.c
author	Pablo Neira Ayuso <pablo@netfilter.org>	2016-09-25 17:23:57 -0400
committer	Pablo Neira Ayuso <pablo@netfilter.org>	2016-09-25 17:34:19 -0400
commit	f20fbc0717f9f007c94b2641134b19228d0ce9ed (patch)
tree	1404248ebbec552a3fb7928b75322b65d74de1bd /net/ipv4/tcp_input.c
parent	8cb2a7d5667ab9a9c2fdd356357b85b63b320901 (diff)
parent	fe0acb5fcb7fe8cb3d68bbdb8459865c972d8f83 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Conflicts:
	net/netfilter/core.c
	net/netfilter/nf_tables_netdev.c

Resolve two conflicts before pull request for David's net-next tree:

1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant
   ip_hdr assignment") from the net tree and commit ddc8b6027ad0
   ("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()").

2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and
   Aaron Conole's patches to replace list_head with single linked list.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	495
1 file changed, 276 insertions(+), 219 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f3a9f3c2c8d8..8c6ad2d319d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
 static void tcp_sndbuf_expand(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
 	int sndmem, per_mss;
 	u32 nr_segs;
 
@@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
 	 * Cubic needs 1.7 factor, rounded to 2 to include
 	 * extra cushion (application might react slowly to POLLOUT)
 	 */
-	sndmem = 2 * nr_segs * per_mss;
+	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
+	sndmem *= nr_segs * per_mss;
 
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
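
The hunk above replaces the hard-coded factor of 2 with a new tcp_congestion_ops hook, keeping 2 as the fallback. As a rough illustration of how a module could use it — the "tcp_foo" names and the value 3 are hypothetical, only the .sndbuf_expand hook comes from this series:

	/* Hypothetical module; only the .sndbuf_expand hook is from this series. */
	static u32 tcp_foo_sndbuf_expand(struct sock *sk)
	{
		/* Ask for 3x cwnd worth of send buffer instead of the default 2x. */
		return 3;
	}

	static struct tcp_congestion_ops tcp_foo __read_mostly = {
		.name		= "foo",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
		.sndbuf_expand	= tcp_foo_sndbuf_expand,
	};
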
@@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 }
 
+/* Sum the number of packets on the wire we have marked as lost.
+ * There are two cases we care about here:
+ * a) Packet hasn't been marked lost (nor retransmitted),
+ *    and this is the first loss.
+ * b) Packet has been marked both lost and retransmitted,
+ *    and this means we think it was lost again.
+ */
+static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+	if (!(sacked & TCPCB_LOST) ||
+	    ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
+		tp->lost += tcp_skb_pcount(skb);
+}
+
 static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 		tcp_verify_retransmit_hint(tp, skb);
 
 		tp->lost_out += tcp_skb_pcount(skb);
+		tcp_sum_lost(tp, skb);
 		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 	}
 }
@@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
+	tcp_sum_lost(tp, skb);
 	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 		tp->lost_out += tcp_skb_pcount(skb);
 		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1094,6 +1114,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct rate_sample *rate;
 	int flag;
 };
 
@@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
 			&skb->skb_mstamp);
+	tcp_rate_skb_delivered(sk, skb, state->rate);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	tcp_advance_highest_sack(sk, skb);
 
 	tcp_skb_collapse_tstamp(prev, skb);
+	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+		TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
 	tcp_unlink_write_queue(skb, sk);
 	sk_wmem_free_skb(sk, skb);
 
@@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						dup_sack,
 						tcp_skb_pcount(skb),
 						&skb->skb_mstamp);
+			tcp_rate_skb_delivered(sk, skb, state->rate);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
 					 num_sacks, prior_snd_una);
-	if (found_dup_sack)
+	if (found_dup_sack) {
 		state->flag |= FLAG_DSACKING_ACK;
+		tp->delivered++; /* A spurious retransmission is delivered */
+	}
 
 	/* Eliminate too old ACKs, but take into
 	 * account more or less fresh ones, they can
@@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk)
 	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
+	bool mark_lost;
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk)
 		if (skb == tcp_send_head(sk))
 			break;
 
+		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+			     is_reneg);
+		if (mark_lost)
+			tcp_sum_lost(tp, skb);
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+		if (mark_lost) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
@@ -2503,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	if (inet_csk(sk)->icsk_ca_ops->cong_control)
+		return;
+
 	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
 	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@ -2879,67 +2915,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		*rexmit = REXMIT_LOST;
 }
 
-/* Kathleen Nichols' algorithm for tracking the minimum value of
- * a data stream over some fixed time interval. (E.g., the minimum
- * RTT over the past five minutes.) It uses constant space and constant
- * time per update yet almost always delivers the same minimum as an
- * implementation that has to keep all the data in the window.
- *
- * The algorithm keeps track of the best, 2nd best & 3rd best min
- * values, maintaining an invariant that the measurement time of the
- * n'th best >= n-1'th best. It also makes sure that the three values
- * are widely separated in the time window since that bounds the worse
- * case error when that data is monotonically increasing over the window.
- *
- * Upon getting a new min, we can forget everything earlier because it
- * has no value - the new min is <= everything else in the window by
- * definition and it's the most recent. So we restart fresh on every new min
- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
- * best.
- */
 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
 {
-	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
-	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
-	struct rtt_meas rttm = {
-		.rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
-		.ts = now,
-	};
-	u32 elapsed;
-
-	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
-	if (unlikely(rttm.rtt <= m[0].rtt))
-		m[0] = m[1] = m[2] = rttm;
-	else if (rttm.rtt <= m[1].rtt)
-		m[1] = m[2] = rttm;
-	else if (rttm.rtt <= m[2].rtt)
-		m[2] = rttm;
-
-	elapsed = now - m[0].ts;
-	if (unlikely(elapsed > wlen)) {
-		/* Passed entire window without a new min so make 2nd choice
-		 * the new min & 3rd choice the new 2nd. So forth and so on.
-		 */
-		m[0] = m[1];
-		m[1] = m[2];
-		m[2] = rttm;
-		if (now - m[0].ts > wlen) {
-			m[0] = m[1];
-			m[1] = rttm;
-			if (now - m[0].ts > wlen)
-				m[0] = rttm;
-		}
-	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
-		/* Passed a quarter of the window without a new min so
-		 * take 2nd choice from the 2nd quarter of the window.
-		 */
-		m[2] = m[1] = rttm;
-	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
-		/* Passed half the window without a new min so take the 3rd
-		 * choice from the last half of the window.
-		 */
-		m[2] = rttm;
-	}
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
+
+	minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
+			   rtt_us ? : jiffies_to_usecs(1));
 }
 
 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
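
The block removed above is Kathleen Nichols' windowed min filter, which this series moves into a generic lib/win_minmax helper so it can also track windowed maxima. A minimal usage sketch, assuming the win_minmax API added by the companion patch (the wrapper names here are illustrative):

	#include <linux/win_minmax.h>

	static struct minmax rtt_min;	/* TCP itself keeps this in struct tcp_sock */

	static void rtt_min_init(u32 now, u32 first_rtt_us)
	{
		minmax_reset(&rtt_min, now, first_rtt_us);
	}

	static void rtt_min_update(u32 now, u32 rtt_us)
	{
		/* Track the minimum RTT seen over the last 10 seconds. */
		minmax_running_min(&rtt_min, 10 * HZ, now, rtt_us);
	}

	static u32 rtt_min_get(void)
	{
		return minmax_get(&rtt_min);	/* most recent windowed minimum */
	}
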
@@ -3102,10 +3084,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack)
+			       struct tcp_sacktag_state *sack,
+			       struct skb_mstamp *now)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct skb_mstamp first_ackt, last_ackt, now;
+	struct skb_mstamp first_ackt, last_ackt;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3137,7 +3120,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			acked_pcount = tcp_tso_acked(sk, skb);
 			if (!acked_pcount)
 				break;
-
 			fully_acked = false;
 		} else {
 			/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3173,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tp->packets_out -= acked_pcount;
 		pkts_acked += acked_pcount;
+		tcp_rate_skb_delivered(sk, skb, sack->rate);
 
 		/* Initial outgoing SYN's get put onto the write_queue
 		 * just like anything else we transmit. It is not
@@ -3205,16 +3188,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	skb_mstamp_get(&now);
 	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+		seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
 	}
 	if (sack->first_sackt.v64) {
-		sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+		sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
 	}
-
+	sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
 	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
 					ca_rtt_us);
 
@@ -3242,7 +3224,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+		   sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
@@ -3333,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 * information. All transmission or retransmission are delayed afterwards.
 */
 static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
-			     int flag)
+			     int flag, const struct rate_sample *rs)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->cong_control) {
+		icsk->icsk_ca_ops->cong_control(sk, rs);
+		return;
+	}
+
 	if (tcp_in_cwnd_reduction(sk)) {
 		/* Reduce cwnd if state mandates */
 		tcp_cwnd_reduction(sk, acked_sacked, flag);
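
With this change, a congestion control module that implements cong_control takes over cwnd and pacing decisions entirely and receives a rate sample on every ACK, bypassing the in-stack cwnd-reduction machinery. A hedged sketch of what such a hook might do — the module and its policy are invented, only the hook signature comes from this patch:

	static void tcp_foo_cong_control(struct sock *sk, const struct rate_sample *rs)
	{
		struct tcp_sock *tp = tcp_sk(sk);
		u64 bw;

		if (rs->delivered <= 0 || rs->interval_us <= 0)
			return;		/* no usable sample on this ACK */

		/* Crude delivery-rate estimate: bytes delivered per second. */
		bw = (u64)rs->delivered * tp->mss_cache * USEC_PER_SEC;
		do_div(bw, rs->interval_us);

		sk->sk_pacing_rate = min_t(u64, bw, sk->sk_max_pacing_rate);
	}
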
@@ -3579,17 +3568,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_sacktag_state sack_state;
+	struct rate_sample rs = { .prior_delivered = 0 };
 	u32 prior_snd_una = tp->snd_una;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
-	u32 prior_delivered = tp->delivered;
+	u32 delivered = tp->delivered;
+	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
+	sack_state.rate = &rs;
 
 	/* We very likely will need to access write queue head. */
 	prefetchw(sk->sk_write_queue.next);
@@ -3612,6 +3605,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
+	skb_mstamp_get(&now);
+
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
@@ -3622,6 +3617,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	}
 
 	prior_fackets = tp->fackets_out;
+	rs.prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -3677,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state);
+				    &sack_state, &now);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3694,7 +3690,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
-	tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+	lost = tp->lost - lost;			/* freshly marked lost */
+	tcp_rate_gen(sk, delivered, lost, &now, &rs);
+	tcp_cong_control(sk, ack, delivered, flag, &rs);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
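
For context, the rate sample that tcp_rate_gen() fills in here is defined by the companion tcp_rate.c patch in this series; its layout is approximately the following (reproduced from the series for reference, see include/net/tcp.h in net-next for the authoritative definition):

	struct rate_sample {
		struct skb_mstamp prior_mstamp;	/* starting timestamp for interval */
		u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
		s32  delivered;		/* number of packets delivered over interval */
		long interval_us;	/* time for tp->delivered to incr "delivered" */
		long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
		int  losses;		/* number of packets marked lost upon ACK */
		u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
		u32  prior_in_flight;	/* in flight before this ACK */
		bool is_app_limited;	/* is sample from packet with bubble in pipe? */
		bool is_retrans;	/* is sample from retransmission? */
	};
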
@@ -4108,7 +4107,7 @@ void tcp_fin(struct sock *sk)
 	/* It _is_ possible, that we have something out-of-order _after_ FIN.
 	 * Probably, we should reset in this case. For now drop them.
 	 */
-	__skb_queue_purge(&tp->out_of_order_queue);
+	skb_rbtree_purge(&tp->out_of_order_queue);
 	if (tcp_is_sack(tp))
 		tcp_sack_reset(&tp->rx_opt);
 	sk_mem_reclaim(sk);
@@ -4268,7 +4267,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	int this_sack;
 
 	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-	if (skb_queue_empty(&tp->out_of_order_queue)) {
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		tp->rx_opt.num_sacks = 0;
 		return;
 	}
@@ -4344,10 +4343,13 @@ static void tcp_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 dsack_high = tp->rcv_nxt;
+	bool fin, fragstolen, eaten;
 	struct sk_buff *skb, *tail;
-	bool fragstolen, eaten;
+	struct rb_node *p;
 
-	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+	p = rb_first(&tp->out_of_order_queue);
+	while (p) {
+		skb = rb_entry(p, struct sk_buff, rbnode);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4357,9 +4359,10 @@ static void tcp_ofo_queue(struct sock *sk)
 			dsack_high = TCP_SKB_CB(skb)->end_seq;
 			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
 		}
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
 
-		__skb_unlink(skb, &tp->out_of_order_queue);
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			tcp_drop(sk, skb);
 			continue;
@@ -4371,12 +4374,19 @@ static void tcp_ofo_queue(struct sock *sk)
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (!eaten)
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-			tcp_fin(sk);
-		if (eaten)
+		else
 			kfree_skb_partial(skb, fragstolen);
+
+		if (unlikely(fin)) {
+			tcp_fin(sk);
+			/* tcp_fin() purges tp->out_of_order_queue,
+			 * so we must end this loop right now.
+			 */
+			break;
+		}
 	}
 }
 
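
The rewritten loop above is the standard in-order rbtree traversal that replaces skb_peek() on the old list: rb_first()/rb_next() visit skbs in ascending sequence order, and rb_entry() maps a node back to its containing sk_buff. The same pattern in isolation (function name illustrative):

	static void tcp_ofo_walk(struct rb_root *root)
	{
		struct rb_node *p;

		for (p = rb_first(root); p; p = rb_next(p)) {
			struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

			pr_debug("ofo skb %u:%u\n",
				 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
		}
	}
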
@@ -4403,8 +4413,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node **p, *q, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
+	bool fragstolen;
 
 	tcp_ecn_check_ce(tp, skb);
 
@@ -4419,88 +4431,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	inet_csk_schedule_ack(sk);
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+		   tp->rcv_nxt, seq, end_seq);
 
-	skb1 = skb_peek_tail(&tp->out_of_order_queue);
-	if (!skb1) {
+	p = &tp->out_of_order_queue.rb_node;
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		/* Initial out of order segment, build 1 SACK. */
 		if (tcp_is_sack(tp)) {
 			tp->rx_opt.num_sacks = 1;
-			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-			tp->selective_acks[0].end_seq =
-						TCP_SKB_CB(skb)->end_seq;
+			tp->selective_acks[0].start_seq = seq;
+			tp->selective_acks[0].end_seq = end_seq;
 		}
-		__skb_queue_head(&tp->out_of_order_queue, skb);
+		rb_link_node(&skb->rbnode, NULL, p);
+		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+		tp->ooo_last_skb = skb;
 		goto end;
 	}
 
-	seq = TCP_SKB_CB(skb)->seq;
-	end_seq = TCP_SKB_CB(skb)->end_seq;
-
-	if (seq == TCP_SKB_CB(skb1)->end_seq) {
-		bool fragstolen;
-
-		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-		} else {
-			tcp_grow_window(sk, skb);
-			kfree_skb_partial(skb, fragstolen);
-			skb = NULL;
-		}
-
-		if (!tp->rx_opt.num_sacks ||
-		    tp->selective_acks[0].end_seq != seq)
-			goto add_sack;
-
-		/* Common case: data arrive in order after hole. */
-		tp->selective_acks[0].end_seq = end_seq;
-		goto end;
+	/* In the typical case, we are adding an skb to the end of the list.
+	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+	 */
+	if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+		tcp_grow_window(sk, skb);
+		kfree_skb_partial(skb, fragstolen);
+		skb = NULL;
+		goto add_sack;
 	}
-
-	/* Find place to insert this segment. */
-	while (1) {
-		if (!after(TCP_SKB_CB(skb1)->seq, seq))
-			break;
-		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-			skb1 = NULL;
-			break;
-		}
-		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+		parent = &tp->ooo_last_skb->rbnode;
+		p = &parent->rb_right;
+		goto insert;
 	}
 
-	/* Do skb overlap to previous one? */
-	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-			/* All the bits are present. Drop. */
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-			tcp_drop(sk, skb);
-			skb = NULL;
-			tcp_dsack_set(sk, seq, end_seq);
-			goto add_sack;
-		}
-		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-			/* Partial overlap. */
-			tcp_dsack_set(sk, seq,
-				      TCP_SKB_CB(skb1)->end_seq);
-		} else {
-			if (skb_queue_is_first(&tp->out_of_order_queue,
-					       skb1))
-				skb1 = NULL;
-			else
-				skb1 = skb_queue_prev(
-					&tp->out_of_order_queue,
-					skb1);
+	/* Find place to insert this segment. Handle overlaps on the way. */
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+			p = &parent->rb_left;
+			continue;
 		}
+		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* All the bits are present. Drop. */
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb);
+				skb = NULL;
+				tcp_dsack_set(sk, seq, end_seq);
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap. */
+				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				/* skb's seq == skb1's seq and skb covers skb1.
+				 * Replace skb1 with skb.
+				 */
+				rb_replace_node(&skb1->rbnode, &skb->rbnode,
+						&tp->out_of_order_queue);
+				tcp_dsack_extend(sk,
+						 TCP_SKB_CB(skb1)->seq,
+						 TCP_SKB_CB(skb1)->end_seq);
+				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPOFOMERGE);
+				__kfree_skb(skb1);
+				goto merge_right;
+			}
+		} else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+			goto coalesce_done;
+		}
+		p = &parent->rb_right;
 	}
-	if (!skb1)
-		__skb_queue_head(&tp->out_of_order_queue, skb);
-	else
-		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+insert:
+	/* Insert segment into RB tree. */
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
 
-	/* And clean segments covered by new one as whole. */
-	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+merge_right:
+	/* Remove other segments covered by skb. */
+	while ((q = rb_next(&skb->rbnode)) != NULL) {
+		skb1 = rb_entry(q, struct sk_buff, rbnode);
 
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
@@ -4509,12 +4525,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 				 end_seq);
 			break;
 		}
-		__skb_unlink(skb1, &tp->out_of_order_queue);
+		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
 		tcp_drop(sk, skb1);
 	}
+	/* If there is no skb after us, we are the last_skb ! */
+	if (!q)
+		tp->ooo_last_skb = skb;
 
 add_sack:
 	if (tcp_is_sack(tp))
@@ -4651,13 +4670,13 @@ queue_and_out:
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			tcp_fin(sk);
 
-		if (!skb_queue_empty(&tp->out_of_order_queue)) {
+		if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 			tcp_ofo_queue(sk);
 
 			/* RFC2581. 4.2. SHOULD send immediate ACK, when
 			 * gap in queue is filled.
 			 */
-			if (skb_queue_empty(&tp->out_of_order_queue))
+			if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 				inet_csk(sk)->icsk_ack.pingpong = 0;
 		}
 
@@ -4711,48 +4730,76 @@ drop:
 		tcp_data_queue_ofo(sk, skb);
 }
 
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	if (list)
+		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
-					struct sk_buff_head *list)
+					struct sk_buff_head *list,
+					struct rb_root *root)
 {
-	struct sk_buff *next = NULL;
+	struct sk_buff *next = tcp_skb_next(skb, list);
 
-	if (!skb_queue_is_last(list, skb))
-		next = skb_queue_next(list, skb);
+	if (list)
+		__skb_unlink(skb, list);
+	else
+		rb_erase(&skb->rbnode, root);
 
-	__skb_unlink(skb, list);
 	__kfree_skb(skb);
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
 
 	return next;
 }
 
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sk_buff *skb1;
+
+	while (*p) {
+		parent = *p;
+		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, root);
+}
+
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
  *
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
-	     struct sk_buff *head, struct sk_buff *tail,
-	     u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
 {
-	struct sk_buff *skb, *n;
+	struct sk_buff *skb = head, *n;
+	struct sk_buff_head tmp;
 	bool end_of_skbs;
 
 	/* First, check that queue is collapsible and find
-	 * the point where collapsing can be useful. */
-	skb = head;
+	 * the point where collapsing can be useful.
+	 */
 restart:
-	end_of_skbs = true;
-	skb_queue_walk_from_safe(list, skb, n) {
-		if (skb == tail)
-			break;
+	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+		n = tcp_skb_next(skb, list);
+
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			skb = tcp_collapse_one(sk, skb, list);
+			skb = tcp_collapse_one(sk, skb, list, root);
 			if (!skb)
 				break;
 			goto restart;
@@ -4770,13 +4817,10 @@ restart:
 			break;
 		}
 
-		if (!skb_queue_is_last(list, skb)) {
-			struct sk_buff *next = skb_queue_next(list, skb);
-			if (next != tail &&
-			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
-				end_of_skbs = false;
-				break;
-			}
+		if (n && n != tail &&
+		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+			end_of_skbs = false;
+			break;
 		}
 
 		/* Decided to skip this, advance start seq. */
@@ -4786,17 +4830,22 @@ restart:
 	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 		return;
 
+	__skb_queue_head_init(&tmp);
+
 	while (before(start, end)) {
 		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
 		struct sk_buff *nskb;
 
 		nskb = alloc_skb(copy, GFP_ATOMIC);
 		if (!nskb)
-			return;
+			break;
 
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_queue_before(list, skb, nskb);
+		if (list)
+			__skb_queue_before(list, skb, nskb);
+		else
+			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
 		skb_set_owner_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
@@ -4814,14 +4863,17 @@ restart:
 			start += size;
 		}
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-			skb = tcp_collapse_one(sk, skb, list);
+			skb = tcp_collapse_one(sk, skb, list, root);
 			if (!skb ||
 			    skb == tail ||
 			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
-				return;
+				goto end;
 		}
 	}
+end:
+	skb_queue_walk_safe(&tmp, skb, n)
+		tcp_rbtree_insert(root, skb);
 }
 
 /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4830,43 +4882,43 @@ restart:
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
-	struct sk_buff *head;
+	struct sk_buff *skb, *head;
+	struct rb_node *p;
 	u32 start, end;
 
-	if (!skb)
+	p = rb_first(&tp->out_of_order_queue);
+	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+	if (!skb) {
+		p = rb_last(&tp->out_of_order_queue);
+		/* Note: This is possible p is NULL here. We do not
+		 * use rb_entry_safe(), as ooo_last_skb is valid only
+		 * if rbtree is not empty.
+		 */
+		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
 		return;
-
+	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
-	head = skb;
-
-	for (;;) {
-		struct sk_buff *next = NULL;
 
-		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
-			next = skb_queue_next(&tp->out_of_order_queue, skb);
-		skb = next;
+	for (head = skb;;) {
+		skb = tcp_skb_next(skb, NULL);
 
-		/* Segment is terminated when we see gap or when
-		 * we are at the end of all the queue. */
+		/* Range is terminated when we see a gap or when
+		 * we are at the queue end.
+		 */
 		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, &tp->out_of_order_queue,
+			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
 				     head, skb, start, end);
-			head = skb;
-			if (!skb)
-				break;
-			/* Start new segment */
+			goto new_range;
+		}
+
+		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 			start = TCP_SKB_CB(skb)->seq;
+		if (after(TCP_SKB_CB(skb)->end_seq, end))
 			end = TCP_SKB_CB(skb)->end_seq;
-		} else {
-			if (before(TCP_SKB_CB(skb)->seq, start))
-				start = TCP_SKB_CB(skb)->seq;
-			if (after(TCP_SKB_CB(skb)->end_seq, end))
-				end = TCP_SKB_CB(skb)->end_seq;
-		}
 	}
 }
 
@@ -4883,20 +4935,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct rb_node *node, *prev;
 
-	if (skb_queue_empty(&tp->out_of_order_queue))
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		return false;
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
-	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
-		tcp_drop(sk, skb);
+	node = &tp->ooo_last_skb->rbnode;
+	do {
+		prev = rb_prev(node);
+		rb_erase(node, &tp->out_of_order_queue);
+		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
 		sk_mem_reclaim(sk);
 		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 		    !tcp_under_memory_pressure(sk))
 			break;
-	}
+		node = prev;
+	} while (node);
+	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
 
 	/* Reset SACK state. A conforming SACK implementation will
 	 * do the same at a timeout based retransmit. When a connection
@@ -4930,7 +4986,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		tcp_collapse(sk, &sk->sk_receive_queue,
+		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
 			     skb_peek(&sk->sk_receive_queue),
 			     NULL,
 			     tp->copied_seq, tp->rcv_nxt);
@@ -5035,7 +5091,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
-	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+	    (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
 		/* Then ack it now */
 		tcp_send_ack(sk);
 	} else {
@@ -5894,7 +5950,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		 * so release it.
 		 */
 		if (req) {
-			tp->total_retrans = req->num_retrans;
+			inet_csk(sk)->icsk_retransmits = 0;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
 			/* Make sure socket is routed, for correct metrics. */
@@ -5936,7 +5992,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		} else
 			tcp_init_metrics(sk);
 
-		tcp_update_pacing_rate(sk);
+		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+			tcp_update_pacing_rate(sk);
 
 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
 		tp->lsndtime = tcp_time_stamp;