author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ipv4/tcp_input.c
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 1805
1 file changed, 775 insertions, 1030 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 18f97ca76b0..d73aab3fbfc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,8 +61,6 @@ | |||
61 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 61 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
62 | */ | 62 | */ |
63 | 63 | ||
64 | #define pr_fmt(fmt) "TCP: " fmt | ||
65 | |||
66 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
67 | #include <linux/slab.h> | 65 | #include <linux/slab.h> |
68 | #include <linux/module.h> | 66 | #include <linux/module.h> |
@@ -85,23 +83,20 @@ int sysctl_tcp_ecn __read_mostly = 2; | |||
85 | EXPORT_SYMBOL(sysctl_tcp_ecn); | 83 | EXPORT_SYMBOL(sysctl_tcp_ecn); |
86 | int sysctl_tcp_dsack __read_mostly = 1; | 84 | int sysctl_tcp_dsack __read_mostly = 1; |
87 | int sysctl_tcp_app_win __read_mostly = 31; | 85 | int sysctl_tcp_app_win __read_mostly = 31; |
88 | int sysctl_tcp_adv_win_scale __read_mostly = 1; | 86 | int sysctl_tcp_adv_win_scale __read_mostly = 2; |
89 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); | 87 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); |
90 | 88 | ||
91 | /* rfc5961 challenge ack rate limiting */ | ||
92 | int sysctl_tcp_challenge_ack_limit = 100; | ||
93 | |||
94 | int sysctl_tcp_stdurg __read_mostly; | 89 | int sysctl_tcp_stdurg __read_mostly; |
95 | int sysctl_tcp_rfc1337 __read_mostly; | 90 | int sysctl_tcp_rfc1337 __read_mostly; |
96 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 91 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
97 | int sysctl_tcp_frto __read_mostly = 2; | 92 | int sysctl_tcp_frto __read_mostly = 2; |
98 | int sysctl_tcp_frto_response __read_mostly; | 93 | int sysctl_tcp_frto_response __read_mostly; |
94 | int sysctl_tcp_nometrics_save __read_mostly; | ||
99 | 95 | ||
100 | int sysctl_tcp_thin_dupack __read_mostly; | 96 | int sysctl_tcp_thin_dupack __read_mostly; |
101 | 97 | ||
102 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 98 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
103 | int sysctl_tcp_abc __read_mostly; | 99 | int sysctl_tcp_abc __read_mostly; |
104 | int sysctl_tcp_early_retrans __read_mostly = 2; | ||
105 | 100 | ||
106 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 101 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
107 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 102 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -110,6 +105,7 @@ int sysctl_tcp_early_retrans __read_mostly = 2; | |||
110 | #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ | 105 | #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ |
111 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ | 106 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ |
112 | #define FLAG_ECE 0x40 /* ECE in this ACK */ | 107 | #define FLAG_ECE 0x40 /* ECE in this ACK */ |
108 | #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ | ||
113 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ | 109 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ |
114 | #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ | 110 | #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ |
115 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ | 111 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ |
@@ -178,7 +174,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) | |||
178 | static void tcp_incr_quickack(struct sock *sk) | 174 | static void tcp_incr_quickack(struct sock *sk) |
179 | { | 175 | { |
180 | struct inet_connection_sock *icsk = inet_csk(sk); | 176 | struct inet_connection_sock *icsk = inet_csk(sk); |
181 | unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); | 177 | unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); |
182 | 178 | ||
183 | if (quickacks == 0) | 179 | if (quickacks == 0) |
184 | quickacks = 2; | 180 | quickacks = 2; |
@@ -198,10 +194,9 @@ static void tcp_enter_quickack_mode(struct sock *sk) | |||
198 | * and the session is not interactive. | 194 | * and the session is not interactive. |
199 | */ | 195 | */ |
200 | 196 | ||
201 | static inline bool tcp_in_quickack_mode(const struct sock *sk) | 197 | static inline int tcp_in_quickack_mode(const struct sock *sk) |
202 | { | 198 | { |
203 | const struct inet_connection_sock *icsk = inet_csk(sk); | 199 | const struct inet_connection_sock *icsk = inet_csk(sk); |
204 | |||
205 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | 200 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; |
206 | } | 201 | } |
207 | 202 | ||
@@ -211,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) | |||
211 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; | 206 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; |
212 | } | 207 | } |
213 | 208 | ||
214 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) | 209 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) |
215 | { | 210 | { |
216 | if (tcp_hdr(skb)->cwr) | 211 | if (tcp_hdr(skb)->cwr) |
217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 212 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
@@ -222,49 +217,36 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) | |||
222 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
223 | } | 218 | } |
224 | 219 | ||
225 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) | 220 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) |
226 | { | 221 | { |
227 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 222 | if (tp->ecn_flags & TCP_ECN_OK) { |
228 | return; | 223 | if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) |
229 | 224 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | |
230 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { | ||
231 | case INET_ECN_NOT_ECT: | ||
232 | /* Funny extension: if ECT is not set on a segment, | 225 | /* Funny extension: if ECT is not set on a segment, |
233 | * and we already seen ECT on a previous segment, | 226 | * it is surely retransmit. It is not in ECN RFC, |
234 | * it is probably a retransmit. | 227 | * but Linux follows this rule. */ |
235 | */ | 228 | else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) |
236 | if (tp->ecn_flags & TCP_ECN_SEEN) | ||
237 | tcp_enter_quickack_mode((struct sock *)tp); | ||
238 | break; | ||
239 | case INET_ECN_CE: | ||
240 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | ||
241 | /* Better not delay acks, sender can have a very low cwnd */ | ||
242 | tcp_enter_quickack_mode((struct sock *)tp); | 229 | tcp_enter_quickack_mode((struct sock *)tp); |
243 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
244 | } | ||
245 | /* fallinto */ | ||
246 | default: | ||
247 | tp->ecn_flags |= TCP_ECN_SEEN; | ||
248 | } | 230 | } |
249 | } | 231 | } |
250 | 232 | ||
251 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | 233 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) |
252 | { | 234 | { |
253 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) | 235 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) |
254 | tp->ecn_flags &= ~TCP_ECN_OK; | 236 | tp->ecn_flags &= ~TCP_ECN_OK; |
255 | } | 237 | } |
256 | 238 | ||
257 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) | 239 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) |
258 | { | 240 | { |
259 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) | 241 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) |
260 | tp->ecn_flags &= ~TCP_ECN_OK; | 242 | tp->ecn_flags &= ~TCP_ECN_OK; |
261 | } | 243 | } |
262 | 244 | ||
263 | static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) | 245 | static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) |
264 | { | 246 | { |
265 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) | 247 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) |
266 | return true; | 248 | return 1; |
267 | return false; | 249 | return 0; |
268 | } | 250 | } |
269 | 251 | ||
270 | /* Buffer size and advertised window tuning. | 252 | /* Buffer size and advertised window tuning. |
@@ -274,11 +256,14 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr | |||
274 | 256 | ||
275 | static void tcp_fixup_sndbuf(struct sock *sk) | 257 | static void tcp_fixup_sndbuf(struct sock *sk) |
276 | { | 258 | { |
277 | int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); | 259 | int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + |
260 | sizeof(struct sk_buff); | ||
278 | 261 | ||
279 | sndmem *= TCP_INIT_CWND; | 262 | if (sk->sk_sndbuf < 3 * sndmem) { |
280 | if (sk->sk_sndbuf < sndmem) | 263 | sk->sk_sndbuf = 3 * sndmem; |
281 | sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); | 264 | if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) |
265 | sk->sk_sndbuf = sysctl_tcp_wmem[2]; | ||
266 | } | ||
282 | } | 267 | } |
283 | 268 | ||
284 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) | 269 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
@@ -324,14 +309,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) | |||
324 | return 0; | 309 | return 0; |
325 | } | 310 | } |
326 | 311 | ||
327 | static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | 312 | static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) |
328 | { | 313 | { |
329 | struct tcp_sock *tp = tcp_sk(sk); | 314 | struct tcp_sock *tp = tcp_sk(sk); |
330 | 315 | ||
331 | /* Check #1 */ | 316 | /* Check #1 */ |
332 | if (tp->rcv_ssthresh < tp->window_clamp && | 317 | if (tp->rcv_ssthresh < tp->window_clamp && |
333 | (int)tp->rcv_ssthresh < tcp_space(sk) && | 318 | (int)tp->rcv_ssthresh < tcp_space(sk) && |
334 | !sk_under_memory_pressure(sk)) { | 319 | !tcp_memory_pressure) { |
335 | int incr; | 320 | int incr; |
336 | 321 | ||
337 | /* Check #2. Increase window, if skb with such overhead | 322 | /* Check #2. Increase window, if skb with such overhead |
@@ -343,7 +328,6 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | |||
343 | incr = __tcp_grow_window(sk, skb); | 328 | incr = __tcp_grow_window(sk, skb); |
344 | 329 | ||
345 | if (incr) { | 330 | if (incr) { |
346 | incr = max_t(int, incr, 2 * skb->len); | ||
347 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, | 331 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, |
348 | tp->window_clamp); | 332 | tp->window_clamp); |
349 | inet_csk(sk)->icsk_ack.quick |= 1; | 333 | inet_csk(sk)->icsk_ack.quick |= 1; |
@@ -355,30 +339,23 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | |||
355 | 339 | ||
356 | static void tcp_fixup_rcvbuf(struct sock *sk) | 340 | static void tcp_fixup_rcvbuf(struct sock *sk) |
357 | { | 341 | { |
358 | u32 mss = tcp_sk(sk)->advmss; | 342 | struct tcp_sock *tp = tcp_sk(sk); |
359 | u32 icwnd = TCP_DEFAULT_INIT_RCVWND; | 343 | int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
360 | int rcvmem; | ||
361 | 344 | ||
362 | /* Limit to 10 segments if mss <= 1460, | 345 | /* Try to select rcvbuf so that 4 mss-sized segments |
363 | * or 14600/mss segments, with a minimum of two segments. | 346 | * will fit to window and corresponding skbs will fit to our rcvbuf. |
347 | * (was 3; 4 is minimum to allow fast retransmit to work.) | ||
364 | */ | 348 | */ |
365 | if (mss > 1460) | 349 | while (tcp_win_from_space(rcvmem) < tp->advmss) |
366 | icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); | ||
367 | |||
368 | rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); | ||
369 | while (tcp_win_from_space(rcvmem) < mss) | ||
370 | rcvmem += 128; | 350 | rcvmem += 128; |
371 | 351 | if (sk->sk_rcvbuf < 4 * rcvmem) | |
372 | rcvmem *= icwnd; | 352 | sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); |
373 | |||
374 | if (sk->sk_rcvbuf < rcvmem) | ||
375 | sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); | ||
376 | } | 353 | } |
377 | 354 | ||
378 | /* 4. Try to fixup all. It is made immediately after connection enters | 355 | /* 4. Try to fixup all. It is made immediately after connection enters |
379 | * established state. | 356 | * established state. |
380 | */ | 357 | */ |
381 | void tcp_init_buffer_space(struct sock *sk) | 358 | static void tcp_init_buffer_space(struct sock *sk) |
382 | { | 359 | { |
383 | struct tcp_sock *tp = tcp_sk(sk); | 360 | struct tcp_sock *tp = tcp_sk(sk); |
384 | int maxwin; | 361 | int maxwin; |
@@ -421,8 +398,8 @@ static void tcp_clamp_window(struct sock *sk) | |||
421 | 398 | ||
422 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && | 399 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && |
423 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && | 400 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
424 | !sk_under_memory_pressure(sk) && | 401 | !tcp_memory_pressure && |
425 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { | 402 | atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { |
426 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), | 403 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
427 | sysctl_tcp_rmem[2]); | 404 | sysctl_tcp_rmem[2]); |
428 | } | 405 | } |
@@ -439,7 +416,7 @@ static void tcp_clamp_window(struct sock *sk) | |||
439 | */ | 416 | */ |
440 | void tcp_initialize_rcv_mss(struct sock *sk) | 417 | void tcp_initialize_rcv_mss(struct sock *sk) |
441 | { | 418 | { |
442 | const struct tcp_sock *tp = tcp_sk(sk); | 419 | struct tcp_sock *tp = tcp_sk(sk); |
443 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); | 420 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); |
444 | 421 | ||
445 | hint = min(hint, tp->rcv_wnd / 2); | 422 | hint = min(hint, tp->rcv_wnd / 2); |
@@ -483,11 +460,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | |||
483 | if (!win_dep) { | 460 | if (!win_dep) { |
484 | m -= (new_sample >> 3); | 461 | m -= (new_sample >> 3); |
485 | new_sample += m; | 462 | new_sample += m; |
486 | } else { | 463 | } else if (m < new_sample) |
487 | m <<= 3; | 464 | new_sample = m << 3; |
488 | if (m < new_sample) | ||
489 | new_sample = m; | ||
490 | } | ||
491 | } else { | 465 | } else { |
492 | /* No previous measure. */ | 466 | /* No previous measure. */ |
493 | new_sample = m << 3; | 467 | new_sample = m << 3; |
@@ -503,7 +477,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) | |||
503 | goto new_measure; | 477 | goto new_measure; |
504 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) | 478 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) |
505 | return; | 479 | return; |
506 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); | 480 | tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); |
507 | 481 | ||
508 | new_measure: | 482 | new_measure: |
509 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; | 483 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; |
@@ -557,7 +531,8 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
557 | space /= tp->advmss; | 531 | space /= tp->advmss; |
558 | if (!space) | 532 | if (!space) |
559 | space = 1; | 533 | space = 1; |
560 | rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); | 534 | rcvmem = (tp->advmss + MAX_TCP_HEADER + |
535 | 16 + sizeof(struct sk_buff)); | ||
561 | while (tcp_win_from_space(rcvmem) < tp->advmss) | 536 | while (tcp_win_from_space(rcvmem) < tp->advmss) |
562 | rcvmem += 128; | 537 | rcvmem += 128; |
563 | space *= rcvmem; | 538 | space *= rcvmem; |
@@ -707,7 +682,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
707 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 682 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
708 | * routine referred to above. | 683 | * routine referred to above. |
709 | */ | 684 | */ |
710 | void tcp_set_rto(struct sock *sk) | 685 | static inline void tcp_set_rto(struct sock *sk) |
711 | { | 686 | { |
712 | const struct tcp_sock *tp = tcp_sk(sk); | 687 | const struct tcp_sock *tp = tcp_sk(sk); |
713 | /* Old crap is replaced with new one. 8) | 688 | /* Old crap is replaced with new one. 8) |
@@ -734,7 +709,110 @@ void tcp_set_rto(struct sock *sk) | |||
734 | tcp_bound_rto(sk); | 709 | tcp_bound_rto(sk); |
735 | } | 710 | } |
736 | 711 | ||
737 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | 712 | /* Save metrics learned by this TCP session. |
713 | This function is called only, when TCP finishes successfully | ||
714 | i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. | ||
715 | */ | ||
716 | void tcp_update_metrics(struct sock *sk) | ||
717 | { | ||
718 | struct tcp_sock *tp = tcp_sk(sk); | ||
719 | struct dst_entry *dst = __sk_dst_get(sk); | ||
720 | |||
721 | if (sysctl_tcp_nometrics_save) | ||
722 | return; | ||
723 | |||
724 | dst_confirm(dst); | ||
725 | |||
726 | if (dst && (dst->flags & DST_HOST)) { | ||
727 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
728 | int m; | ||
729 | unsigned long rtt; | ||
730 | |||
731 | if (icsk->icsk_backoff || !tp->srtt) { | ||
732 | /* This session failed to estimate rtt. Why? | ||
733 | * Probably, no packets returned in time. | ||
734 | * Reset our results. | ||
735 | */ | ||
736 | if (!(dst_metric_locked(dst, RTAX_RTT))) | ||
737 | dst_metric_set(dst, RTAX_RTT, 0); | ||
738 | return; | ||
739 | } | ||
740 | |||
741 | rtt = dst_metric_rtt(dst, RTAX_RTT); | ||
742 | m = rtt - tp->srtt; | ||
743 | |||
744 | /* If newly calculated rtt larger than stored one, | ||
745 | * store new one. Otherwise, use EWMA. Remember, | ||
746 | * rtt overestimation is always better than underestimation. | ||
747 | */ | ||
748 | if (!(dst_metric_locked(dst, RTAX_RTT))) { | ||
749 | if (m <= 0) | ||
750 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); | ||
751 | else | ||
752 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); | ||
753 | } | ||
754 | |||
755 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { | ||
756 | unsigned long var; | ||
757 | if (m < 0) | ||
758 | m = -m; | ||
759 | |||
760 | /* Scale deviation to rttvar fixed point */ | ||
761 | m >>= 1; | ||
762 | if (m < tp->mdev) | ||
763 | m = tp->mdev; | ||
764 | |||
765 | var = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
766 | if (m >= var) | ||
767 | var = m; | ||
768 | else | ||
769 | var -= (var - m) >> 2; | ||
770 | |||
771 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); | ||
772 | } | ||
773 | |||
774 | if (tcp_in_initial_slowstart(tp)) { | ||
775 | /* Slow start still did not finish. */ | ||
776 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
777 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
778 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | ||
779 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); | ||
780 | if (!dst_metric_locked(dst, RTAX_CWND) && | ||
781 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | ||
782 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); | ||
783 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
784 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
785 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
786 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
787 | dst_metric_set(dst, RTAX_SSTHRESH, | ||
788 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
789 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
790 | dst_metric_set(dst, RTAX_CWND, | ||
791 | (dst_metric(dst, RTAX_CWND) + | ||
792 | tp->snd_cwnd) >> 1); | ||
793 | } else { | ||
794 | /* Else slow start did not finish, cwnd is non-sense, | ||
795 | ssthresh may be also invalid. | ||
796 | */ | ||
797 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
798 | dst_metric_set(dst, RTAX_CWND, | ||
799 | (dst_metric(dst, RTAX_CWND) + | ||
800 | tp->snd_ssthresh) >> 1); | ||
801 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
802 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
803 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | ||
804 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); | ||
805 | } | ||
806 | |||
807 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | ||
808 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | ||
809 | tp->reordering != sysctl_tcp_reordering) | ||
810 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); | ||
811 | } | ||
812 | } | ||
813 | } | ||
814 | |||
815 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | ||
738 | { | 816 | { |
739 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 817 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
740 | 818 | ||
@@ -743,22 +821,124 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | |||
743 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 821 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
744 | } | 822 | } |
745 | 823 | ||
824 | /* Set slow start threshold and cwnd not falling to slow start */ | ||
825 | void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | ||
826 | { | ||
827 | struct tcp_sock *tp = tcp_sk(sk); | ||
828 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
829 | |||
830 | tp->prior_ssthresh = 0; | ||
831 | tp->bytes_acked = 0; | ||
832 | if (icsk->icsk_ca_state < TCP_CA_CWR) { | ||
833 | tp->undo_marker = 0; | ||
834 | if (set_ssthresh) | ||
835 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
836 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
837 | tcp_packets_in_flight(tp) + 1U); | ||
838 | tp->snd_cwnd_cnt = 0; | ||
839 | tp->high_seq = tp->snd_nxt; | ||
840 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
841 | TCP_ECN_queue_cwr(tp); | ||
842 | |||
843 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
844 | } | ||
845 | } | ||
846 | |||
746 | /* | 847 | /* |
747 | * Packet counting of FACK is based on in-order assumptions, therefore TCP | 848 | * Packet counting of FACK is based on in-order assumptions, therefore TCP |
748 | * disables it when reordering is detected | 849 | * disables it when reordering is detected |
749 | */ | 850 | */ |
750 | void tcp_disable_fack(struct tcp_sock *tp) | 851 | static void tcp_disable_fack(struct tcp_sock *tp) |
751 | { | 852 | { |
752 | /* RFC3517 uses different metric in lost marker => reset on change */ | 853 | /* RFC3517 uses different metric in lost marker => reset on change */ |
753 | if (tcp_is_fack(tp)) | 854 | if (tcp_is_fack(tp)) |
754 | tp->lost_skb_hint = NULL; | 855 | tp->lost_skb_hint = NULL; |
755 | tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; | 856 | tp->rx_opt.sack_ok &= ~2; |
756 | } | 857 | } |
757 | 858 | ||
758 | /* Take a notice that peer is sending D-SACKs */ | 859 | /* Take a notice that peer is sending D-SACKs */ |
759 | static void tcp_dsack_seen(struct tcp_sock *tp) | 860 | static void tcp_dsack_seen(struct tcp_sock *tp) |
760 | { | 861 | { |
761 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; | 862 | tp->rx_opt.sack_ok |= 4; |
863 | } | ||
864 | |||
865 | /* Initialize metrics on socket. */ | ||
866 | |||
867 | static void tcp_init_metrics(struct sock *sk) | ||
868 | { | ||
869 | struct tcp_sock *tp = tcp_sk(sk); | ||
870 | struct dst_entry *dst = __sk_dst_get(sk); | ||
871 | |||
872 | if (dst == NULL) | ||
873 | goto reset; | ||
874 | |||
875 | dst_confirm(dst); | ||
876 | |||
877 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
878 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); | ||
879 | if (dst_metric(dst, RTAX_SSTHRESH)) { | ||
880 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); | ||
881 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
882 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
883 | } else { | ||
884 | /* ssthresh may have been reduced unnecessarily during. | ||
885 | * 3WHS. Restore it back to its initial default. | ||
886 | */ | ||
887 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
888 | } | ||
889 | if (dst_metric(dst, RTAX_REORDERING) && | ||
890 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | ||
891 | tcp_disable_fack(tp); | ||
892 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | ||
893 | } | ||
894 | |||
895 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) | ||
896 | goto reset; | ||
897 | |||
898 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
899 | * The segment is small and rtt may appear much | ||
900 | * less than real one. Use per-dst memory | ||
901 | * to make it more realistic. | ||
902 | * | ||
903 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
904 | * is sent until it is ACKed. In normal circumstances sending small | ||
905 | * packets force peer to delay ACKs and calculation is correct too. | ||
906 | * The algorithm is adaptive and, provided we follow specs, it | ||
907 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
908 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
909 | * to low value, and then abruptly stops to do it and starts to delay | ||
910 | * ACKs, wait for troubles. | ||
911 | */ | ||
912 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { | ||
913 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); | ||
914 | tp->rtt_seq = tp->snd_nxt; | ||
915 | } | ||
916 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { | ||
917 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
918 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
919 | } | ||
920 | tcp_set_rto(sk); | ||
921 | reset: | ||
922 | if (tp->srtt == 0) { | ||
923 | /* RFC2988bis: We've failed to get a valid RTT sample from | ||
924 | * 3WHS. This is most likely due to retransmission, | ||
925 | * including spurious one. Reset the RTO back to 3secs | ||
926 | * from the more aggressive 1sec to avoid more spurious | ||
927 | * retransmission. | ||
928 | */ | ||
929 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
930 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
931 | } | ||
932 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
933 | * retransmitted. In light of RFC2988bis' more aggressive 1sec | ||
934 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
935 | * retransmission has occurred. | ||
936 | */ | ||
937 | if (tp->total_retrans > 1) | ||
938 | tp->snd_cwnd = 1; | ||
939 | else | ||
940 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
941 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
762 | } | 942 | } |
763 | 943 | ||
764 | static void tcp_update_reordering(struct sock *sk, const int metric, | 944 | static void tcp_update_reordering(struct sock *sk, const int metric, |
@@ -782,18 +962,15 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |||
782 | 962 | ||
783 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 963 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
784 | #if FASTRETRANS_DEBUG > 1 | 964 | #if FASTRETRANS_DEBUG > 1 |
785 | pr_debug("Disorder%d %d %u f%u s%u rr%d\n", | 965 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", |
786 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, | 966 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, |
787 | tp->reordering, | 967 | tp->reordering, |
788 | tp->fackets_out, | 968 | tp->fackets_out, |
789 | tp->sacked_out, | 969 | tp->sacked_out, |
790 | tp->undo_marker ? tp->undo_retrans : 0); | 970 | tp->undo_marker ? tp->undo_retrans : 0); |
791 | #endif | 971 | #endif |
792 | tcp_disable_fack(tp); | 972 | tcp_disable_fack(tp); |
793 | } | 973 | } |
794 | |||
795 | if (metric > 0) | ||
796 | tcp_disable_early_retrans(tp); | ||
797 | } | 974 | } |
798 | 975 | ||
799 | /* This must be called before lost_out is incremented */ | 976 | /* This must be called before lost_out is incremented */ |
@@ -851,11 +1028,13 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, | |||
851 | * These 6 states form finite state machine, controlled by the following events: | 1028 | * These 6 states form finite state machine, controlled by the following events: |
852 | * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) | 1029 | * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) |
853 | * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) | 1030 | * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) |
854 | * 3. Loss detection event of two flavors: | 1031 | * 3. Loss detection event of one of three flavors: |
855 | * A. Scoreboard estimator decided the packet is lost. | 1032 | * A. Scoreboard estimator decided the packet is lost. |
856 | * A'. Reno "three dupacks" marks head of queue lost. | 1033 | * A'. Reno "three dupacks" marks head of queue lost. |
857 | * A''. Its FACK modification, head until snd.fack is lost. | 1034 | * A''. Its FACK modfication, head until snd.fack is lost. |
858 | * B. SACK arrives sacking SND.NXT at the moment, when the | 1035 | * B. SACK arrives sacking data transmitted after never retransmitted |
1036 | * hole was sent out. | ||
1037 | * C. SACK arrives sacking SND.NXT at the moment, when the | ||
859 | * segment was retransmitted. | 1038 | * segment was retransmitted. |
860 | * 4. D-SACK added new rule: D-SACK changes any tag to S. | 1039 | * 4. D-SACK added new rule: D-SACK changes any tag to S. |
861 | * | 1040 | * |
@@ -924,36 +1103,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, | |||
924 | * the exact amount is rather hard to quantify. However, tp->max_window can | 1103 | * the exact amount is rather hard to quantify. However, tp->max_window can |
925 | * be used as an exaggerated estimate. | 1104 | * be used as an exaggerated estimate. |
926 | */ | 1105 | */ |
927 | static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, | 1106 | static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, |
928 | u32 start_seq, u32 end_seq) | 1107 | u32 start_seq, u32 end_seq) |
929 | { | 1108 | { |
930 | /* Too far in future, or reversed (interpretation is ambiguous) */ | 1109 | /* Too far in future, or reversed (interpretation is ambiguous) */ |
931 | if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) | 1110 | if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) |
932 | return false; | 1111 | return 0; |
933 | 1112 | ||
934 | /* Nasty start_seq wrap-around check (see comments above) */ | 1113 | /* Nasty start_seq wrap-around check (see comments above) */ |
935 | if (!before(start_seq, tp->snd_nxt)) | 1114 | if (!before(start_seq, tp->snd_nxt)) |
936 | return false; | 1115 | return 0; |
937 | 1116 | ||
938 | /* In outstanding window? ...This is valid exit for D-SACKs too. | 1117 | /* In outstanding window? ...This is valid exit for D-SACKs too. |
939 | * start_seq == snd_una is non-sensical (see comments above) | 1118 | * start_seq == snd_una is non-sensical (see comments above) |
940 | */ | 1119 | */ |
941 | if (after(start_seq, tp->snd_una)) | 1120 | if (after(start_seq, tp->snd_una)) |
942 | return true; | 1121 | return 1; |
943 | 1122 | ||
944 | if (!is_dsack || !tp->undo_marker) | 1123 | if (!is_dsack || !tp->undo_marker) |
945 | return false; | 1124 | return 0; |
946 | 1125 | ||
947 | /* ...Then it's D-SACK, and must reside below snd_una completely */ | 1126 | /* ...Then it's D-SACK, and must reside below snd_una completely */ |
948 | if (after(end_seq, tp->snd_una)) | 1127 | if (after(end_seq, tp->snd_una)) |
949 | return false; | 1128 | return 0; |
950 | 1129 | ||
951 | if (!before(start_seq, tp->undo_marker)) | 1130 | if (!before(start_seq, tp->undo_marker)) |
952 | return true; | 1131 | return 1; |
953 | 1132 | ||
954 | /* Too old */ | 1133 | /* Too old */ |
955 | if (!after(end_seq, tp->undo_marker)) | 1134 | if (!after(end_seq, tp->undo_marker)) |
956 | return false; | 1135 | return 0; |
957 | 1136 | ||
958 | /* Undo_marker boundary crossing (overestimates a lot). Known already: | 1137 | /* Undo_marker boundary crossing (overestimates a lot). Known already: |
959 | * start_seq < undo_marker and end_seq >= undo_marker. | 1138 | * start_seq < undo_marker and end_seq >= undo_marker. |
@@ -962,7 +1141,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, | |||
962 | } | 1141 | } |
963 | 1142 | ||
964 | /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". | 1143 | /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". |
965 | * Event "B". Later note: FACK people cheated me again 8), we have to account | 1144 | * Event "C". Later note: FACK people cheated me again 8), we have to account |
966 | * for reordering! Ugly, but should help. | 1145 | * for reordering! Ugly, but should help. |
967 | * | 1146 | * |
968 | * Search retransmitted skbs from write_queue that were sent when snd_nxt was | 1147 | * Search retransmitted skbs from write_queue that were sent when snd_nxt was |
@@ -1025,17 +1204,17 @@ static void tcp_mark_lost_retrans(struct sock *sk) | |||
1025 | tp->lost_retrans_low = new_low_seq; | 1204 | tp->lost_retrans_low = new_low_seq; |
1026 | } | 1205 | } |
1027 | 1206 | ||
1028 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | 1207 | static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, |
1029 | struct tcp_sack_block_wire *sp, int num_sacks, | 1208 | struct tcp_sack_block_wire *sp, int num_sacks, |
1030 | u32 prior_snd_una) | 1209 | u32 prior_snd_una) |
1031 | { | 1210 | { |
1032 | struct tcp_sock *tp = tcp_sk(sk); | 1211 | struct tcp_sock *tp = tcp_sk(sk); |
1033 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); | 1212 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
1034 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); | 1213 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
1035 | bool dup_sack = false; | 1214 | int dup_sack = 0; |
1036 | 1215 | ||
1037 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { | 1216 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
1038 | dup_sack = true; | 1217 | dup_sack = 1; |
1039 | tcp_dsack_seen(tp); | 1218 | tcp_dsack_seen(tp); |
1040 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); | 1219 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
1041 | } else if (num_sacks > 1) { | 1220 | } else if (num_sacks > 1) { |
@@ -1044,7 +1223,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | |||
1044 | 1223 | ||
1045 | if (!after(end_seq_0, end_seq_1) && | 1224 | if (!after(end_seq_0, end_seq_1) && |
1046 | !before(start_seq_0, start_seq_1)) { | 1225 | !before(start_seq_0, start_seq_1)) { |
1047 | dup_sack = true; | 1226 | dup_sack = 1; |
1048 | tcp_dsack_seen(tp); | 1227 | tcp_dsack_seen(tp); |
1049 | NET_INC_STATS_BH(sock_net(sk), | 1228 | NET_INC_STATS_BH(sock_net(sk), |
1050 | LINUX_MIB_TCPDSACKOFORECV); | 1229 | LINUX_MIB_TCPDSACKOFORECV); |
@@ -1075,10 +1254,9 @@ struct tcp_sacktag_state { | |||
1075 | * FIXME: this could be merged to shift decision code | 1254 | * FIXME: this could be merged to shift decision code |
1076 | */ | 1255 | */ |
1077 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | 1256 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, |
1078 | u32 start_seq, u32 end_seq) | 1257 | u32 start_seq, u32 end_seq) |
1079 | { | 1258 | { |
1080 | int err; | 1259 | int in_sack, err; |
1081 | bool in_sack; | ||
1082 | unsigned int pkt_len; | 1260 | unsigned int pkt_len; |
1083 | unsigned int mss; | 1261 | unsigned int mss; |
1084 | 1262 | ||
@@ -1120,26 +1298,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
1120 | return in_sack; | 1298 | return in_sack; |
1121 | } | 1299 | } |
1122 | 1300 | ||
1123 | /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ | 1301 | static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, |
1124 | static u8 tcp_sacktag_one(struct sock *sk, | 1302 | struct tcp_sacktag_state *state, |
1125 | struct tcp_sacktag_state *state, u8 sacked, | 1303 | int dup_sack, int pcount) |
1126 | u32 start_seq, u32 end_seq, | ||
1127 | bool dup_sack, int pcount) | ||
1128 | { | 1304 | { |
1129 | struct tcp_sock *tp = tcp_sk(sk); | 1305 | struct tcp_sock *tp = tcp_sk(sk); |
1306 | u8 sacked = TCP_SKB_CB(skb)->sacked; | ||
1130 | int fack_count = state->fack_count; | 1307 | int fack_count = state->fack_count; |
1131 | 1308 | ||
1132 | /* Account D-SACK for retransmitted packet. */ | 1309 | /* Account D-SACK for retransmitted packet. */ |
1133 | if (dup_sack && (sacked & TCPCB_RETRANS)) { | 1310 | if (dup_sack && (sacked & TCPCB_RETRANS)) { |
1134 | if (tp->undo_marker && tp->undo_retrans && | 1311 | if (tp->undo_marker && tp->undo_retrans && |
1135 | after(end_seq, tp->undo_marker)) | 1312 | after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) |
1136 | tp->undo_retrans--; | 1313 | tp->undo_retrans--; |
1137 | if (sacked & TCPCB_SACKED_ACKED) | 1314 | if (sacked & TCPCB_SACKED_ACKED) |
1138 | state->reord = min(fack_count, state->reord); | 1315 | state->reord = min(fack_count, state->reord); |
1139 | } | 1316 | } |
1140 | 1317 | ||
1141 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ | 1318 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ |
1142 | if (!after(end_seq, tp->snd_una)) | 1319 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) |
1143 | return sacked; | 1320 | return sacked; |
1144 | 1321 | ||
1145 | if (!(sacked & TCPCB_SACKED_ACKED)) { | 1322 | if (!(sacked & TCPCB_SACKED_ACKED)) { |
@@ -1158,13 +1335,13 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1158 | /* New sack for not retransmitted frame, | 1335 | /* New sack for not retransmitted frame, |
1159 | * which was in hole. It is reordering. | 1336 | * which was in hole. It is reordering. |
1160 | */ | 1337 | */ |
1161 | if (before(start_seq, | 1338 | if (before(TCP_SKB_CB(skb)->seq, |
1162 | tcp_highest_sack_seq(tp))) | 1339 | tcp_highest_sack_seq(tp))) |
1163 | state->reord = min(fack_count, | 1340 | state->reord = min(fack_count, |
1164 | state->reord); | 1341 | state->reord); |
1165 | 1342 | ||
1166 | /* SACK enhanced F-RTO (RFC4138; Appendix B) */ | 1343 | /* SACK enhanced F-RTO (RFC4138; Appendix B) */ |
1167 | if (!after(end_seq, tp->frto_highmark)) | 1344 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) |
1168 | state->flag |= FLAG_ONLY_ORIG_SACKED; | 1345 | state->flag |= FLAG_ONLY_ORIG_SACKED; |
1169 | } | 1346 | } |
1170 | 1347 | ||
@@ -1182,7 +1359,8 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1182 | 1359 | ||
1183 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ | 1360 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
1184 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && | 1361 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && |
1185 | before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) | 1362 | before(TCP_SKB_CB(skb)->seq, |
1363 | TCP_SKB_CB(tp->lost_skb_hint)->seq)) | ||
1186 | tp->lost_cnt_hint += pcount; | 1364 | tp->lost_cnt_hint += pcount; |
1187 | 1365 | ||
1188 | if (fack_count > tp->fackets_out) | 1366 | if (fack_count > tp->fackets_out) |
@@ -1201,30 +1379,16 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
1201 | return sacked; | 1379 | return sacked; |
1202 | } | 1380 | } |
1203 | 1381 | ||
1204 | /* Shift newly-SACKed bytes from this skb to the immediately previous | 1382 | static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, |
1205 | * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. | 1383 | struct tcp_sacktag_state *state, |
1206 | */ | 1384 | unsigned int pcount, int shifted, int mss, |
1207 | static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | 1385 | int dup_sack) |
1208 | struct tcp_sacktag_state *state, | ||
1209 | unsigned int pcount, int shifted, int mss, | ||
1210 | bool dup_sack) | ||
1211 | { | 1386 | { |
1212 | struct tcp_sock *tp = tcp_sk(sk); | 1387 | struct tcp_sock *tp = tcp_sk(sk); |
1213 | struct sk_buff *prev = tcp_write_queue_prev(sk, skb); | 1388 | struct sk_buff *prev = tcp_write_queue_prev(sk, skb); |
1214 | u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ | ||
1215 | u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ | ||
1216 | 1389 | ||
1217 | BUG_ON(!pcount); | 1390 | BUG_ON(!pcount); |
1218 | 1391 | ||
1219 | /* Adjust counters and hints for the newly sacked sequence | ||
1220 | * range but discard the return value since prev is already | ||
1221 | * marked. We must tag the range first because the seq | ||
1222 | * advancement below implicitly advances | ||
1223 | * tcp_highest_sack_seq() when skb is highest_sack. | ||
1224 | */ | ||
1225 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, | ||
1226 | start_seq, end_seq, dup_sack, pcount); | ||
1227 | |||
1228 | if (skb == tp->lost_skb_hint) | 1392 | if (skb == tp->lost_skb_hint) |
1229 | tp->lost_cnt_hint += pcount; | 1393 | tp->lost_cnt_hint += pcount; |
1230 | 1394 | ||
@@ -1251,13 +1415,16 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1251 | skb_shinfo(skb)->gso_type = 0; | 1415 | skb_shinfo(skb)->gso_type = 0; |
1252 | } | 1416 | } |
1253 | 1417 | ||
1418 | /* We discard results */ | ||
1419 | tcp_sacktag_one(skb, sk, state, dup_sack, pcount); | ||
1420 | |||
1254 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ | 1421 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ |
1255 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); | 1422 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); |
1256 | 1423 | ||
1257 | if (skb->len > 0) { | 1424 | if (skb->len > 0) { |
1258 | BUG_ON(!tcp_skb_pcount(skb)); | 1425 | BUG_ON(!tcp_skb_pcount(skb)); |
1259 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); | 1426 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); |
1260 | return false; | 1427 | return 0; |
1261 | } | 1428 | } |
1262 | 1429 | ||
1263 | /* Whole SKB was eaten :-) */ | 1430 | /* Whole SKB was eaten :-) */ |
@@ -1271,7 +1438,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1271 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); | 1438 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); |
1272 | } | 1439 | } |
1273 | 1440 | ||
1274 | TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; | 1441 | TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; |
1275 | if (skb == tcp_highest_sack(sk)) | 1442 | if (skb == tcp_highest_sack(sk)) |
1276 | tcp_advance_highest_sack(sk, skb); | 1443 | tcp_advance_highest_sack(sk, skb); |
1277 | 1444 | ||
@@ -1280,19 +1447,19 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
1280 | 1447 | ||
1281 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); | 1448 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); |
1282 | 1449 | ||
1283 | return true; | 1450 | return 1; |
1284 | } | 1451 | } |
1285 | 1452 | ||
1286 | /* I wish gso_size would have a bit more sane initialization than | 1453 | /* I wish gso_size would have a bit more sane initialization than |
1287 | * something-or-zero which complicates things | 1454 | * something-or-zero which complicates things |
1288 | */ | 1455 | */ |
1289 | static int tcp_skb_seglen(const struct sk_buff *skb) | 1456 | static int tcp_skb_seglen(struct sk_buff *skb) |
1290 | { | 1457 | { |
1291 | return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); | 1458 | return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); |
1292 | } | 1459 | } |
1293 | 1460 | ||
1294 | /* Shifting pages past head area doesn't work */ | 1461 | /* Shifting pages past head area doesn't work */ |
1295 | static int skb_can_shift(const struct sk_buff *skb) | 1462 | static int skb_can_shift(struct sk_buff *skb) |
1296 | { | 1463 | { |
1297 | return !skb_headlen(skb) && skb_is_nonlinear(skb); | 1464 | return !skb_headlen(skb) && skb_is_nonlinear(skb); |
1298 | } | 1465 | } |
@@ -1303,7 +1470,7 @@ static int skb_can_shift(const struct sk_buff *skb) | |||
1303 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | 1470 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, |
1304 | struct tcp_sacktag_state *state, | 1471 | struct tcp_sacktag_state *state, |
1305 | u32 start_seq, u32 end_seq, | 1472 | u32 start_seq, u32 end_seq, |
1306 | bool dup_sack) | 1473 | int dup_sack) |
1307 | { | 1474 | { |
1308 | struct tcp_sock *tp = tcp_sk(sk); | 1475 | struct tcp_sock *tp = tcp_sk(sk); |
1309 | struct sk_buff *prev; | 1476 | struct sk_buff *prev; |
@@ -1398,10 +1565,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | |||
1398 | } | 1565 | } |
1399 | } | 1566 | } |
1400 | 1567 | ||
1401 | /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ | ||
1402 | if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) | ||
1403 | goto fallback; | ||
1404 | |||
1405 | if (!skb_shift(prev, skb, len)) | 1568 | if (!skb_shift(prev, skb, len)) |
1406 | goto fallback; | 1569 | goto fallback; |
1407 | if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) | 1570 | if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) |
@@ -1442,14 +1605,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1442 | struct tcp_sack_block *next_dup, | 1605 | struct tcp_sack_block *next_dup, |
1443 | struct tcp_sacktag_state *state, | 1606 | struct tcp_sacktag_state *state, |
1444 | u32 start_seq, u32 end_seq, | 1607 | u32 start_seq, u32 end_seq, |
1445 | bool dup_sack_in) | 1608 | int dup_sack_in) |
1446 | { | 1609 | { |
1447 | struct tcp_sock *tp = tcp_sk(sk); | 1610 | struct tcp_sock *tp = tcp_sk(sk); |
1448 | struct sk_buff *tmp; | 1611 | struct sk_buff *tmp; |
1449 | 1612 | ||
1450 | tcp_for_write_queue_from(skb, sk) { | 1613 | tcp_for_write_queue_from(skb, sk) { |
1451 | int in_sack = 0; | 1614 | int in_sack = 0; |
1452 | bool dup_sack = dup_sack_in; | 1615 | int dup_sack = dup_sack_in; |
1453 | 1616 | ||
1454 | if (skb == tcp_send_head(sk)) | 1617 | if (skb == tcp_send_head(sk)) |
1455 | break; | 1618 | break; |
@@ -1464,7 +1627,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1464 | next_dup->start_seq, | 1627 | next_dup->start_seq, |
1465 | next_dup->end_seq); | 1628 | next_dup->end_seq); |
1466 | if (in_sack > 0) | 1629 | if (in_sack > 0) |
1467 | dup_sack = true; | 1630 | dup_sack = 1; |
1468 | } | 1631 | } |
1469 | 1632 | ||
1470 | /* skb reference here is a bit tricky to get right, since | 1633 | /* skb reference here is a bit tricky to get right, since |
@@ -1492,14 +1655,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
1492 | break; | 1655 | break; |
1493 | 1656 | ||
1494 | if (in_sack) { | 1657 | if (in_sack) { |
1495 | TCP_SKB_CB(skb)->sacked = | 1658 | TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, |
1496 | tcp_sacktag_one(sk, | 1659 | state, |
1497 | state, | 1660 | dup_sack, |
1498 | TCP_SKB_CB(skb)->sacked, | 1661 | tcp_skb_pcount(skb)); |
1499 | TCP_SKB_CB(skb)->seq, | ||
1500 | TCP_SKB_CB(skb)->end_seq, | ||
1501 | dup_sack, | ||
1502 | tcp_skb_pcount(skb)); | ||
1503 | 1662 | ||
1504 | if (!before(TCP_SKB_CB(skb)->seq, | 1663 | if (!before(TCP_SKB_CB(skb)->seq, |
1505 | tcp_highest_sack_seq(tp))) | 1664 | tcp_highest_sack_seq(tp))) |
@@ -1549,19 +1708,19 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, | |||
1549 | return skb; | 1708 | return skb; |
1550 | } | 1709 | } |
1551 | 1710 | ||
1552 | static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) | 1711 | static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) |
1553 | { | 1712 | { |
1554 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); | 1713 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); |
1555 | } | 1714 | } |
1556 | 1715 | ||
1557 | static int | 1716 | static int |
1558 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1717 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, |
1559 | u32 prior_snd_una) | 1718 | u32 prior_snd_una) |
1560 | { | 1719 | { |
1561 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1720 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1562 | struct tcp_sock *tp = tcp_sk(sk); | 1721 | struct tcp_sock *tp = tcp_sk(sk); |
1563 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1722 | unsigned char *ptr = (skb_transport_header(ack_skb) + |
1564 | TCP_SKB_CB(ack_skb)->sacked); | 1723 | TCP_SKB_CB(ack_skb)->sacked); |
1565 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); | 1724 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); |
1566 | struct tcp_sack_block sp[TCP_NUM_SACKS]; | 1725 | struct tcp_sack_block sp[TCP_NUM_SACKS]; |
1567 | struct tcp_sack_block *cache; | 1726 | struct tcp_sack_block *cache; |
@@ -1569,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1569 | struct sk_buff *skb; | 1728 | struct sk_buff *skb; |
1570 | int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); | 1729 | int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); |
1571 | int used_sacks; | 1730 | int used_sacks; |
1572 | bool found_dup_sack = false; | 1731 | int found_dup_sack = 0; |
1573 | int i, j; | 1732 | int i, j; |
1574 | int first_sack_index; | 1733 | int first_sack_index; |
1575 | 1734 | ||
@@ -1600,7 +1759,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1600 | used_sacks = 0; | 1759 | used_sacks = 0; |
1601 | first_sack_index = 0; | 1760 | first_sack_index = 0; |
1602 | for (i = 0; i < num_sacks; i++) { | 1761 | for (i = 0; i < num_sacks; i++) { |
1603 | bool dup_sack = !i && found_dup_sack; | 1762 | int dup_sack = !i && found_dup_sack; |
1604 | 1763 | ||
1605 | sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); | 1764 | sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); |
1606 | sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); | 1765 | sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); |
@@ -1667,12 +1826,16 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
1667 | while (i < used_sacks) { | 1826 | while (i < used_sacks) { |
1668 | u32 start_seq = sp[i].start_seq; | 1827 | u32 start_seq = sp[i].start_seq; |
1669 | u32 end_seq = sp[i].end_seq; | 1828 | u32 end_seq = sp[i].end_seq; |
1670 | bool dup_sack = (found_dup_sack && (i == first_sack_index)); | 1829 | int dup_sack = (found_dup_sack && (i == first_sack_index)); |
1671 | struct tcp_sack_block *next_dup = NULL; | 1830 | struct tcp_sack_block *next_dup = NULL; |
1672 | 1831 | ||
1673 | if (found_dup_sack && ((i + 1) == first_sack_index)) | 1832 | if (found_dup_sack && ((i + 1) == first_sack_index)) |
1674 | next_dup = &sp[i + 1]; | 1833 | next_dup = &sp[i + 1]; |
1675 | 1834 | ||
1835 | /* Event "B" in the comment above. */ | ||
1836 | if (after(end_seq, tp->high_seq)) | ||
1837 | state.flag |= FLAG_DATA_LOST; | ||
1838 | |||
1676 | /* Skip too early cached blocks */ | 1839 | /* Skip too early cached blocks */ |
1677 | while (tcp_sack_cache_ok(tp, cache) && | 1840 | while (tcp_sack_cache_ok(tp, cache) && |
1678 | !before(start_seq, cache->end_seq)) | 1841 | !before(start_seq, cache->end_seq)) |
@@ -1769,9 +1932,9 @@ out: | |||
1769 | } | 1932 | } |
1770 | 1933 | ||
1771 | /* Limits sacked_out so that sum with lost_out isn't ever larger than | 1934 | /* Limits sacked_out so that sum with lost_out isn't ever larger than |
1772 | * packets_out. Returns false if sacked_out adjustement wasn't necessary. | 1935 | * packets_out. Returns zero if sacked_out adjustement wasn't necessary. |
1773 | */ | 1936 | */ |
1774 | static bool tcp_limit_reno_sacked(struct tcp_sock *tp) | 1937 | static int tcp_limit_reno_sacked(struct tcp_sock *tp) |
1775 | { | 1938 | { |
1776 | u32 holes; | 1939 | u32 holes; |
1777 | 1940 | ||
@@ -1780,9 +1943,9 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) | |||
1780 | 1943 | ||
1781 | if ((tp->sacked_out + holes) > tp->packets_out) { | 1944 | if ((tp->sacked_out + holes) > tp->packets_out) { |
1782 | tp->sacked_out = tp->packets_out - holes; | 1945 | tp->sacked_out = tp->packets_out - holes; |
1783 | return true; | 1946 | return 1; |
1784 | } | 1947 | } |
1785 | return false; | 1948 | return 0; |
1786 | } | 1949 | } |
1787 | 1950 | ||
1788 | /* If we receive more dupacks than we expected counting segments | 1951 | /* If we receive more dupacks than we expected counting segments |
@@ -1836,40 +1999,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp) | |||
1836 | /* F-RTO can only be used if TCP has never retransmitted anything other than | 1999 | /* F-RTO can only be used if TCP has never retransmitted anything other than |
1837 | * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) | 2000 | * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) |
1838 | */ | 2001 | */ |
1839 | bool tcp_use_frto(struct sock *sk) | 2002 | int tcp_use_frto(struct sock *sk) |
1840 | { | 2003 | { |
1841 | const struct tcp_sock *tp = tcp_sk(sk); | 2004 | const struct tcp_sock *tp = tcp_sk(sk); |
1842 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2005 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1843 | struct sk_buff *skb; | 2006 | struct sk_buff *skb; |
1844 | 2007 | ||
1845 | if (!sysctl_tcp_frto) | 2008 | if (!sysctl_tcp_frto) |
1846 | return false; | 2009 | return 0; |
1847 | 2010 | ||
1848 | /* MTU probe and F-RTO won't really play nicely along currently */ | 2011 | /* MTU probe and F-RTO won't really play nicely along currently */ |
1849 | if (icsk->icsk_mtup.probe_size) | 2012 | if (icsk->icsk_mtup.probe_size) |
1850 | return false; | 2013 | return 0; |
1851 | 2014 | ||
1852 | if (tcp_is_sackfrto(tp)) | 2015 | if (tcp_is_sackfrto(tp)) |
1853 | return true; | 2016 | return 1; |
1854 | 2017 | ||
1855 | /* Avoid expensive walking of rexmit queue if possible */ | 2018 | /* Avoid expensive walking of rexmit queue if possible */ |
1856 | if (tp->retrans_out > 1) | 2019 | if (tp->retrans_out > 1) |
1857 | return false; | 2020 | return 0; |
1858 | 2021 | ||
1859 | skb = tcp_write_queue_head(sk); | 2022 | skb = tcp_write_queue_head(sk); |
1860 | if (tcp_skb_is_last(sk, skb)) | 2023 | if (tcp_skb_is_last(sk, skb)) |
1861 | return true; | 2024 | return 1; |
1862 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ | 2025 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ |
1863 | tcp_for_write_queue_from(skb, sk) { | 2026 | tcp_for_write_queue_from(skb, sk) { |
1864 | if (skb == tcp_send_head(sk)) | 2027 | if (skb == tcp_send_head(sk)) |
1865 | break; | 2028 | break; |
1866 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) | 2029 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) |
1867 | return false; | 2030 | return 0; |
1868 | /* Short-circuit when first non-SACKed skb has been checked */ | 2031 | /* Short-circuit when first non-SACKed skb has been checked */ |
1869 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 2032 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
1870 | break; | 2033 | break; |
1871 | } | 2034 | } |
1872 | return true; | 2035 | return 1; |
1873 | } | 2036 | } |
1874 | 2037 | ||
1875 | /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO | 2038 | /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO |
@@ -2105,7 +2268,7 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
2105 | * | 2268 | * |
2106 | * Do processing similar to RTO timeout. | 2269 | * Do processing similar to RTO timeout. |
2107 | */ | 2270 | */ |
2108 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) | 2271 | static int tcp_check_sack_reneging(struct sock *sk, int flag) |
2109 | { | 2272 | { |
2110 | if (flag & FLAG_SACK_RENEGING) { | 2273 | if (flag & FLAG_SACK_RENEGING) { |
2111 | struct inet_connection_sock *icsk = inet_csk(sk); | 2274 | struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -2116,12 +2279,12 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) | |||
2116 | tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); | 2279 | tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); |
2117 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2280 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
2118 | icsk->icsk_rto, TCP_RTO_MAX); | 2281 | icsk->icsk_rto, TCP_RTO_MAX); |
2119 | return true; | 2282 | return 1; |
2120 | } | 2283 | } |
2121 | return false; | 2284 | return 0; |
2122 | } | 2285 | } |
2123 | 2286 | ||
2124 | static inline int tcp_fackets_out(const struct tcp_sock *tp) | 2287 | static inline int tcp_fackets_out(struct tcp_sock *tp) |
2125 | { | 2288 | { |
2126 | return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; | 2289 | return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; |
2127 | } | 2290 | } |
@@ -2141,41 +2304,19 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) | |||
2141 | * they differ. Since neither occurs due to loss, TCP should really | 2304 | * they differ. Since neither occurs due to loss, TCP should really |
2142 | * ignore them. | 2305 | * ignore them. |
2143 | */ | 2306 | */ |
2144 | static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) | 2307 | static inline int tcp_dupack_heuristics(struct tcp_sock *tp) |
2145 | { | 2308 | { |
2146 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | 2309 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; |
2147 | } | 2310 | } |
2148 | 2311 | ||
2149 | static bool tcp_pause_early_retransmit(struct sock *sk, int flag) | 2312 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
2150 | { | ||
2151 | struct tcp_sock *tp = tcp_sk(sk); | ||
2152 | unsigned long delay; | ||
2153 | |||
2154 | /* Delay early retransmit and entering fast recovery for | ||
2155 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples | ||
2156 | * available, or RTO is scheduled to fire first. | ||
2157 | */ | ||
2158 | if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) | ||
2159 | return false; | ||
2160 | |||
2161 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | ||
2162 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) | ||
2163 | return false; | ||
2164 | |||
2165 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); | ||
2166 | tp->early_retrans_delayed = 1; | ||
2167 | return true; | ||
2168 | } | ||
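The helper removed above defers early retransmit by max(RTT/4, 2 msec); since tp->srtt in this code holds the smoothed RTT scaled by 8 (in jiffies), srtt >> 5 works out to RTT/4. A minimal user-space sketch of just that arithmetic, assuming HZ == 1000 so jiffies and milliseconds coincide (an illustrative assumption, not taken from this patch):

/* Hedged sketch: reproduces the max(RTT/4, 2ms) delay computation used by
 * the removed tcp_pause_early_retransmit().  srtt8 is the smoothed RTT
 * scaled by 8, mirroring the tp->srtt convention; HZ == 1000 is assumed.
 */
#include <stdio.h>

static unsigned long er_delay_ms(unsigned long srtt8)
{
    unsigned long quarter_rtt = srtt8 >> 5;  /* (8 * RTT) / 32 == RTT / 4 */
    const unsigned long floor_ms = 2;        /* never wait less than 2 msec */

    return quarter_rtt > floor_ms ? quarter_rtt : floor_ms;
}

int main(void)
{
    /* Smoothed RTT of 40 ms -> srtt8 == 320 -> delay 10 ms;
     * smoothed RTT of 4 ms  -> srtt8 == 32  -> delay clamped to 2 ms.
     */
    printf("%lu %lu\n", er_delay_ms(320), er_delay_ms(32));
    return 0;
}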
2169 | |||
2170 | static inline int tcp_skb_timedout(const struct sock *sk, | ||
2171 | const struct sk_buff *skb) | ||
2172 | { | 2313 | { |
2173 | return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; | 2314 | return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; |
2174 | } | 2315 | } |
2175 | 2316 | ||
2176 | static inline int tcp_head_timedout(const struct sock *sk) | 2317 | static inline int tcp_head_timedout(struct sock *sk) |
2177 | { | 2318 | { |
2178 | const struct tcp_sock *tp = tcp_sk(sk); | 2319 | struct tcp_sock *tp = tcp_sk(sk); |
2179 | 2320 | ||
2180 | return tp->packets_out && | 2321 | return tp->packets_out && |
2181 | tcp_skb_timedout(sk, tcp_write_queue_head(sk)); | 2322 | tcp_skb_timedout(sk, tcp_write_queue_head(sk)); |
@@ -2274,28 +2415,28 @@ static inline int tcp_head_timedout(const struct sock *sk) | |||
2274 | * Main question: may we further continue forward transmission | 2415 | * Main question: may we further continue forward transmission |
2275 | * with the same cwnd? | 2416 | * with the same cwnd? |
2276 | */ | 2417 | */ |
2277 | static bool tcp_time_to_recover(struct sock *sk, int flag) | 2418 | static int tcp_time_to_recover(struct sock *sk) |
2278 | { | 2419 | { |
2279 | struct tcp_sock *tp = tcp_sk(sk); | 2420 | struct tcp_sock *tp = tcp_sk(sk); |
2280 | __u32 packets_out; | 2421 | __u32 packets_out; |
2281 | 2422 | ||
2282 | /* Do not perform any recovery during F-RTO algorithm */ | 2423 | /* Do not perform any recovery during F-RTO algorithm */ |
2283 | if (tp->frto_counter) | 2424 | if (tp->frto_counter) |
2284 | return false; | 2425 | return 0; |
2285 | 2426 | ||
2286 | /* Trick#1: The loss is proven. */ | 2427 | /* Trick#1: The loss is proven. */ |
2287 | if (tp->lost_out) | 2428 | if (tp->lost_out) |
2288 | return true; | 2429 | return 1; |
2289 | 2430 | ||
2290 | /* Not-A-Trick#2 : Classic rule... */ | 2431 | /* Not-A-Trick#2 : Classic rule... */ |
2291 | if (tcp_dupack_heuristics(tp) > tp->reordering) | 2432 | if (tcp_dupack_heuristics(tp) > tp->reordering) |
2292 | return true; | 2433 | return 1; |
2293 | 2434 | ||
2294 | /* Trick#3 : when we use RFC2988 timer restart, fast | 2435 | /* Trick#3 : when we use RFC2988 timer restart, fast |
2295 | * retransmit can be triggered by timeout of queue head. | 2436 | * retransmit can be triggered by timeout of queue head. |
2296 | */ | 2437 | */ |
2297 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) | 2438 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) |
2298 | return true; | 2439 | return 1; |
2299 | 2440 | ||
2300 | /* Trick#4: It is still not OK... But will it be useful to delay | 2441 | /* Trick#4: It is still not OK... But will it be useful to delay |
2301 | * recovery more? | 2442 | * recovery more? |
@@ -2307,7 +2448,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2307 | /* We have nothing to send. This connection is limited | 2448 | /* We have nothing to send. This connection is limited |
2308 | * either by receiver window or by application. | 2449 | * either by receiver window or by application. |
2309 | */ | 2450 | */ |
2310 | return true; | 2451 | return 1; |
2311 | } | 2452 | } |
2312 | 2453 | ||
2313 | /* If a thin stream is detected, retransmit after first | 2454 | /* If a thin stream is detected, retransmit after first |
@@ -2318,19 +2459,9 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2318 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && | 2459 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && |
2319 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && | 2460 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && |
2320 | tcp_is_sack(tp) && !tcp_send_head(sk)) | 2461 | tcp_is_sack(tp) && !tcp_send_head(sk)) |
2321 | return true; | 2462 | return 1; |
2322 | 2463 | ||
2323 | /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious | 2464 | return 0; |
2324 | * retransmissions due to small network reorderings, we implement | ||
2325 | * Mitigation A.3 in the RFC and delay the retransmission for a short | ||
2326 | * interval if appropriate. | ||
2327 | */ | ||
2328 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | ||
2329 | (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | ||
2330 | !tcp_may_send_now(sk)) | ||
2331 | return !tcp_pause_early_retransmit(sk, flag); | ||
2332 | |||
2333 | return false; | ||
2334 | } | 2465 | } |
2335 | 2466 | ||
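The recovery-entry checks above reduce to a handful of ordered predicates. The sketch below models only the first two, the "loss is proven" rule and the classic dupack-versus-reordering rule, in user space; the reordering value of 3 is just the conventional default used for illustration:

/* Hedged sketch of the first two tcp_time_to_recover() rules: enter
 * recovery when a loss is already marked, or when the duplicate-ACK style
 * heuristic exceeds the current reordering estimate.  Values are made up;
 * the kernel tracks them per socket.
 */
#include <stdbool.h>
#include <stdio.h>

static bool time_to_recover(unsigned int lost_out,
                            unsigned int dupack_heuristics,
                            unsigned int reordering)
{
    if (lost_out)                            /* Trick#1: loss is proven */
        return true;
    return dupack_heuristics > reordering;   /* classic dupack threshold */
}

int main(void)
{
    /* With a reordering metric of 3, only the fourth duplicate signal
     * tips the sender into recovery.
     */
    printf("%d %d\n", time_to_recover(0, 3, 3), time_to_recover(0, 4, 3));
    return 0;
}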
2336 | /* New heuristics: it is possible only after we switched to restart timer | 2467 | /* New heuristics: it is possible only after we switched to restart timer |
@@ -2371,11 +2502,8 @@ static void tcp_timeout_skbs(struct sock *sk) | |||
2371 | tcp_verify_left_out(tp); | 2502 | tcp_verify_left_out(tp); |
2372 | } | 2503 | } |
2373 | 2504 | ||
2374 | /* Detect loss in event "A" above by marking head of queue up as lost. | 2505 | /* Mark head of queue up as lost. With RFC3517 SACK, "packets" is counted |
2375 | * For FACK or non-SACK(Reno) senders, the first "packets" number of segments | 2506 | * against sacked "cnt", otherwise it's against facked "cnt" |
2376 | * are considered lost. For RFC3517 SACK, a segment is considered lost if it | ||
2377 | * has at least tp->reordering SACKed segments above it; "packets" refers to | ||
2378 | * the maximum SACKed segments to pass before reaching this limit. | ||
2379 | */ | 2507 | */ |
2380 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | 2508 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) |
2381 | { | 2509 | { |
@@ -2384,8 +2512,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2384 | int cnt, oldcnt; | 2512 | int cnt, oldcnt; |
2385 | int err; | 2513 | int err; |
2386 | unsigned int mss; | 2514 | unsigned int mss; |
2387 | /* Use SACK to deduce losses of new sequences sent during recovery */ | ||
2388 | const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; | ||
2389 | 2515 | ||
2390 | WARN_ON(packets > tp->packets_out); | 2516 | WARN_ON(packets > tp->packets_out); |
2391 | if (tp->lost_skb_hint) { | 2517 | if (tp->lost_skb_hint) { |
@@ -2407,7 +2533,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2407 | tp->lost_skb_hint = skb; | 2533 | tp->lost_skb_hint = skb; |
2408 | tp->lost_cnt_hint = cnt; | 2534 | tp->lost_cnt_hint = cnt; |
2409 | 2535 | ||
2410 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) | 2536 | if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) |
2411 | break; | 2537 | break; |
2412 | 2538 | ||
2413 | oldcnt = cnt; | 2539 | oldcnt = cnt; |
@@ -2417,7 +2543,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
2417 | 2543 | ||
2418 | if (cnt > packets) { | 2544 | if (cnt > packets) { |
2419 | if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || | 2545 | if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || |
2420 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || | ||
2421 | (oldcnt >= packets)) | 2546 | (oldcnt >= packets)) |
2422 | break; | 2547 | break; |
2423 | 2548 | ||
@@ -2470,10 +2595,39 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
2470 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2595 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2471 | } | 2596 | } |
2472 | 2597 | ||
2598 | /* Lower bound on congestion window is slow start threshold | ||
2599 | * unless congestion avoidance choice decides to override it. | ||
2600 | */ | ||
2601 | static inline u32 tcp_cwnd_min(const struct sock *sk) | ||
2602 | { | ||
2603 | const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; | ||
2604 | |||
2605 | return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; | ||
2606 | } | ||
2607 | |||
2608 | /* Decrease cwnd each second ack. */ | ||
2609 | static void tcp_cwnd_down(struct sock *sk, int flag) | ||
2610 | { | ||
2611 | struct tcp_sock *tp = tcp_sk(sk); | ||
2612 | int decr = tp->snd_cwnd_cnt + 1; | ||
2613 | |||
2614 | if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || | ||
2615 | (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { | ||
2616 | tp->snd_cwnd_cnt = decr & 1; | ||
2617 | decr >>= 1; | ||
2618 | |||
2619 | if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) | ||
2620 | tp->snd_cwnd -= decr; | ||
2621 | |||
2622 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); | ||
2623 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
2624 | } | ||
2625 | } | ||
2626 | |||
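tcp_cwnd_down(), restored by this hunk, implements rate halving: roughly every second qualifying ACK shaves one segment off cwnd until the floor is reached. A user-space sketch of that "decrement every other ACK" pattern; the in-flight clamp and the ca_ops->min_cwnd hook are deliberately left out, and the floor is passed in directly:

/* Hedged sketch of rate halving as in tcp_cwnd_down(): snd_cwnd_cnt
 * accumulates qualifying ACKs, and every time it reaches 2 one segment is
 * removed from cwnd, never going below the floor.
 */
#include <stdio.h>

struct mini_tp {
    unsigned int snd_cwnd;
    unsigned int snd_cwnd_cnt;
};

static void cwnd_down_on_ack(struct mini_tp *tp, unsigned int floor)
{
    unsigned int decr = tp->snd_cwnd_cnt + 1;

    tp->snd_cwnd_cnt = decr & 1;   /* keep the leftover half-step */
    decr >>= 1;                    /* 0 on odd ACKs, 1 on even ACKs */

    if (decr && tp->snd_cwnd > floor)
        tp->snd_cwnd -= decr;
}

int main(void)
{
    struct mini_tp tp = { .snd_cwnd = 10, .snd_cwnd_cnt = 0 };
    int i;

    for (i = 0; i < 10; i++) {
        cwnd_down_on_ack(&tp, 5);
        printf("ack %d: cwnd %u\n", i + 1, tp.snd_cwnd);
    }
    /* cwnd steps 10 -> 9 -> 8 -> 7 -> 6 -> 5 over ten ACKs and then stops. */
    return 0;
}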
2473 | /* Nothing was retransmitted or returned timestamp is less | 2627 | /* Nothing was retransmitted or returned timestamp is less |
2474 | * than timestamp of the first retransmission. | 2628 | * than timestamp of the first retransmission. |
2475 | */ | 2629 | */ |
2476 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) | 2630 | static inline int tcp_packet_delayed(struct tcp_sock *tp) |
2477 | { | 2631 | { |
2478 | return !tp->retrans_stamp || | 2632 | return !tp->retrans_stamp || |
2479 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2633 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
@@ -2489,22 +2643,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) | |||
2489 | struct inet_sock *inet = inet_sk(sk); | 2643 | struct inet_sock *inet = inet_sk(sk); |
2490 | 2644 | ||
2491 | if (sk->sk_family == AF_INET) { | 2645 | if (sk->sk_family == AF_INET) { |
2492 | pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", | 2646 | printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", |
2493 | msg, | 2647 | msg, |
2494 | &inet->inet_daddr, ntohs(inet->inet_dport), | 2648 | &inet->inet_daddr, ntohs(inet->inet_dport), |
2495 | tp->snd_cwnd, tcp_left_out(tp), | 2649 | tp->snd_cwnd, tcp_left_out(tp), |
2496 | tp->snd_ssthresh, tp->prior_ssthresh, | 2650 | tp->snd_ssthresh, tp->prior_ssthresh, |
2497 | tp->packets_out); | 2651 | tp->packets_out); |
2498 | } | 2652 | } |
2499 | #if IS_ENABLED(CONFIG_IPV6) | 2653 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
2500 | else if (sk->sk_family == AF_INET6) { | 2654 | else if (sk->sk_family == AF_INET6) { |
2501 | struct ipv6_pinfo *np = inet6_sk(sk); | 2655 | struct ipv6_pinfo *np = inet6_sk(sk); |
2502 | pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", | 2656 | printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", |
2503 | msg, | 2657 | msg, |
2504 | &np->daddr, ntohs(inet->inet_dport), | 2658 | &np->daddr, ntohs(inet->inet_dport), |
2505 | tp->snd_cwnd, tcp_left_out(tp), | 2659 | tp->snd_cwnd, tcp_left_out(tp), |
2506 | tp->snd_ssthresh, tp->prior_ssthresh, | 2660 | tp->snd_ssthresh, tp->prior_ssthresh, |
2507 | tp->packets_out); | 2661 | tp->packets_out); |
2508 | } | 2662 | } |
2509 | #endif | 2663 | #endif |
2510 | } | 2664 | } |
@@ -2534,13 +2688,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) | |||
2534 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2688 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2535 | } | 2689 | } |
2536 | 2690 | ||
2537 | static inline bool tcp_may_undo(const struct tcp_sock *tp) | 2691 | static inline int tcp_may_undo(struct tcp_sock *tp) |
2538 | { | 2692 | { |
2539 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); | 2693 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
2540 | } | 2694 | } |
2541 | 2695 | ||
2542 | /* People celebrate: "We love our President!" */ | 2696 | /* People celebrate: "We love our President!" */ |
2543 | static bool tcp_try_undo_recovery(struct sock *sk) | 2697 | static int tcp_try_undo_recovery(struct sock *sk) |
2544 | { | 2698 | { |
2545 | struct tcp_sock *tp = tcp_sk(sk); | 2699 | struct tcp_sock *tp = tcp_sk(sk); |
2546 | 2700 | ||
@@ -2565,10 +2719,10 @@ static bool tcp_try_undo_recovery(struct sock *sk) | |||
2565 | * is ACKed. For Reno it is a MUST to prevent false | 2719 | * is ACKed. For Reno it is a MUST to prevent false |
2566 | * fast retransmits (RFC2582). SACK TCP is safe. */ | 2720 | * fast retransmits (RFC2582). SACK TCP is safe. */ |
2567 | tcp_moderate_cwnd(tp); | 2721 | tcp_moderate_cwnd(tp); |
2568 | return true; | 2722 | return 1; |
2569 | } | 2723 | } |
2570 | tcp_set_ca_state(sk, TCP_CA_Open); | 2724 | tcp_set_ca_state(sk, TCP_CA_Open); |
2571 | return false; | 2725 | return 0; |
2572 | } | 2726 | } |
2573 | 2727 | ||
2574 | /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ | 2728 | /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ |
@@ -2598,19 +2752,19 @@ static void tcp_try_undo_dsack(struct sock *sk) | |||
2598 | * that successive retransmissions of a segment must not advance | 2752 | * that successive retransmissions of a segment must not advance |
2599 | * retrans_stamp under any conditions. | 2753 | * retrans_stamp under any conditions. |
2600 | */ | 2754 | */ |
2601 | static bool tcp_any_retrans_done(const struct sock *sk) | 2755 | static int tcp_any_retrans_done(struct sock *sk) |
2602 | { | 2756 | { |
2603 | const struct tcp_sock *tp = tcp_sk(sk); | 2757 | struct tcp_sock *tp = tcp_sk(sk); |
2604 | struct sk_buff *skb; | 2758 | struct sk_buff *skb; |
2605 | 2759 | ||
2606 | if (tp->retrans_out) | 2760 | if (tp->retrans_out) |
2607 | return true; | 2761 | return 1; |
2608 | 2762 | ||
2609 | skb = tcp_write_queue_head(sk); | 2763 | skb = tcp_write_queue_head(sk); |
2610 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | 2764 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) |
2611 | return true; | 2765 | return 1; |
2612 | 2766 | ||
2613 | return false; | 2767 | return 0; |
2614 | } | 2768 | } |
2615 | 2769 | ||
2616 | /* Undo during fast recovery after partial ACK. */ | 2770 | /* Undo during fast recovery after partial ACK. */ |
@@ -2644,7 +2798,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2644 | } | 2798 | } |
2645 | 2799 | ||
2646 | /* Undo during loss recovery after partial ACK. */ | 2800 | /* Undo during loss recovery after partial ACK. */ |
2647 | static bool tcp_try_undo_loss(struct sock *sk) | 2801 | static int tcp_try_undo_loss(struct sock *sk) |
2648 | { | 2802 | { |
2649 | struct tcp_sock *tp = tcp_sk(sk); | 2803 | struct tcp_sock *tp = tcp_sk(sk); |
2650 | 2804 | ||
@@ -2666,91 +2820,28 @@ static bool tcp_try_undo_loss(struct sock *sk) | |||
2666 | tp->undo_marker = 0; | 2820 | tp->undo_marker = 0; |
2667 | if (tcp_is_sack(tp)) | 2821 | if (tcp_is_sack(tp)) |
2668 | tcp_set_ca_state(sk, TCP_CA_Open); | 2822 | tcp_set_ca_state(sk, TCP_CA_Open); |
2669 | return true; | 2823 | return 1; |
2670 | } | ||
2671 | return false; | ||
2672 | } | ||
2673 | |||
2674 | /* The cwnd reduction in CWR and Recovery use the PRR algorithm | ||
2675 | * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ | ||
2676 | * It computes the number of packets to send (sndcnt) based on packets newly | ||
2677 | * delivered: | ||
2678 | * 1) If the packets in flight is larger than ssthresh, PRR spreads the | ||
2679 | * cwnd reductions across a full RTT. | ||
2680 | * 2) If packets in flight is lower than ssthresh (such as due to excess | ||
2681 | * losses and/or application stalls), do not perform any further cwnd | ||
2682 | * reductions, but instead slow start up to ssthresh. | ||
2683 | */ | ||
2684 | static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) | ||
2685 | { | ||
2686 | struct tcp_sock *tp = tcp_sk(sk); | ||
2687 | |||
2688 | tp->high_seq = tp->snd_nxt; | ||
2689 | tp->bytes_acked = 0; | ||
2690 | tp->snd_cwnd_cnt = 0; | ||
2691 | tp->prior_cwnd = tp->snd_cwnd; | ||
2692 | tp->prr_delivered = 0; | ||
2693 | tp->prr_out = 0; | ||
2694 | if (set_ssthresh) | ||
2695 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | ||
2696 | TCP_ECN_queue_cwr(tp); | ||
2697 | } | ||
2698 | |||
2699 | static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, | ||
2700 | int fast_rexmit) | ||
2701 | { | ||
2702 | struct tcp_sock *tp = tcp_sk(sk); | ||
2703 | int sndcnt = 0; | ||
2704 | int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); | ||
2705 | |||
2706 | tp->prr_delivered += newly_acked_sacked; | ||
2707 | if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { | ||
2708 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + | ||
2709 | tp->prior_cwnd - 1; | ||
2710 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; | ||
2711 | } else { | ||
2712 | sndcnt = min_t(int, delta, | ||
2713 | max_t(int, tp->prr_delivered - tp->prr_out, | ||
2714 | newly_acked_sacked) + 1); | ||
2715 | } | 2824 | } |
2716 | 2825 | return 0; | |
2717 | sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); | ||
2718 | tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; | ||
2719 | } | 2826 | } |
2720 | 2827 | ||
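The block deleted above is the PRR (proportional rate reduction) logic: while flight size stays above ssthresh, the cwnd cut is spread across one RTT in proportion to newly delivered data. A minimal user-space sketch of only that proportional branch; the slow-start-back branch and the fast_rexmit floor are omitted, and the numbers in main() are illustrative, not real socket state:

/* Hedged sketch of the PRR send quota: sndcnt is chosen so that, summed
 * over an RTT, roughly (prior_cwnd - ssthresh) segments are withheld in
 * proportion to how much data the ACKs have delivered so far.
 */
#include <stdio.h>

struct prr_state {
    unsigned long long prr_delivered;  /* data delivered since loss */
    unsigned long long prr_out;        /* data sent since loss      */
};

static long long prr_sndcnt(struct prr_state *s, unsigned int ssthresh,
                            unsigned int prior_cwnd, unsigned int newly_delivered)
{
    unsigned long long dividend;

    s->prr_delivered += newly_delivered;
    /* ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
    dividend = (unsigned long long)ssthresh * s->prr_delivered + prior_cwnd - 1;
    return (long long)(dividend / prior_cwnd) - (long long)s->prr_out;
}

int main(void)
{
    struct prr_state s = { 0, 0 };
    unsigned int ssthresh = 5, prior_cwnd = 10;
    int i;

    /* Ten ACKs each delivering one segment: the sender gets to put out one
     * new segment for every two delivered, so about ssthresh segments go
     * out while a full prior_cwnd worth of data is delivered.
     */
    for (i = 0; i < 10; i++) {
        long long quota = prr_sndcnt(&s, ssthresh, prior_cwnd, 1);
        s.prr_out += quota;
        printf("ack %d: send %lld\n", i + 1, quota);
    }
    return 0;
}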
2721 | static inline void tcp_end_cwnd_reduction(struct sock *sk) | 2828 | static inline void tcp_complete_cwr(struct sock *sk) |
2722 | { | 2829 | { |
2723 | struct tcp_sock *tp = tcp_sk(sk); | 2830 | struct tcp_sock *tp = tcp_sk(sk); |
2724 | 2831 | /* Do not moderate cwnd if it's already undone in cwr or recovery */ | |
2725 | /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ | 2832 | if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { |
2726 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || | ||
2727 | (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { | ||
2728 | tp->snd_cwnd = tp->snd_ssthresh; | 2833 | tp->snd_cwnd = tp->snd_ssthresh; |
2729 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2834 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2730 | } | 2835 | } |
2731 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); | 2836 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
2732 | } | 2837 | } |
2733 | 2838 | ||
2734 | /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ | ||
2735 | void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | ||
2736 | { | ||
2737 | struct tcp_sock *tp = tcp_sk(sk); | ||
2738 | |||
2739 | tp->prior_ssthresh = 0; | ||
2740 | tp->bytes_acked = 0; | ||
2741 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | ||
2742 | tp->undo_marker = 0; | ||
2743 | tcp_init_cwnd_reduction(sk, set_ssthresh); | ||
2744 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
2745 | } | ||
2746 | } | ||
2747 | |||
2748 | static void tcp_try_keep_open(struct sock *sk) | 2839 | static void tcp_try_keep_open(struct sock *sk) |
2749 | { | 2840 | { |
2750 | struct tcp_sock *tp = tcp_sk(sk); | 2841 | struct tcp_sock *tp = tcp_sk(sk); |
2751 | int state = TCP_CA_Open; | 2842 | int state = TCP_CA_Open; |
2752 | 2843 | ||
2753 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) | 2844 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) |
2754 | state = TCP_CA_Disorder; | 2845 | state = TCP_CA_Disorder; |
2755 | 2846 | ||
2756 | if (inet_csk(sk)->icsk_ca_state != state) { | 2847 | if (inet_csk(sk)->icsk_ca_state != state) { |
@@ -2759,7 +2850,7 @@ static void tcp_try_keep_open(struct sock *sk) | |||
2759 | } | 2850 | } |
2760 | } | 2851 | } |
2761 | 2852 | ||
2762 | static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) | 2853 | static void tcp_try_to_open(struct sock *sk, int flag) |
2763 | { | 2854 | { |
2764 | struct tcp_sock *tp = tcp_sk(sk); | 2855 | struct tcp_sock *tp = tcp_sk(sk); |
2765 | 2856 | ||
@@ -2773,10 +2864,9 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) | |||
2773 | 2864 | ||
2774 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { | 2865 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
2775 | tcp_try_keep_open(sk); | 2866 | tcp_try_keep_open(sk); |
2776 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | 2867 | tcp_moderate_cwnd(tp); |
2777 | tcp_moderate_cwnd(tp); | ||
2778 | } else { | 2868 | } else { |
2779 | tcp_cwnd_reduction(sk, newly_acked_sacked, 0); | 2869 | tcp_cwnd_down(sk, flag); |
2780 | } | 2870 | } |
2781 | } | 2871 | } |
2782 | 2872 | ||
@@ -2858,30 +2948,6 @@ void tcp_simple_retransmit(struct sock *sk) | |||
2858 | } | 2948 | } |
2859 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2949 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2860 | 2950 | ||
2861 | static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | ||
2862 | { | ||
2863 | struct tcp_sock *tp = tcp_sk(sk); | ||
2864 | int mib_idx; | ||
2865 | |||
2866 | if (tcp_is_reno(tp)) | ||
2867 | mib_idx = LINUX_MIB_TCPRENORECOVERY; | ||
2868 | else | ||
2869 | mib_idx = LINUX_MIB_TCPSACKRECOVERY; | ||
2870 | |||
2871 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
2872 | |||
2873 | tp->prior_ssthresh = 0; | ||
2874 | tp->undo_marker = tp->snd_una; | ||
2875 | tp->undo_retrans = tp->retrans_out; | ||
2876 | |||
2877 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | ||
2878 | if (!ece_ack) | ||
2879 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
2880 | tcp_init_cwnd_reduction(sk, true); | ||
2881 | } | ||
2882 | tcp_set_ca_state(sk, TCP_CA_Recovery); | ||
2883 | } | ||
2884 | |||
2885 | /* Process an event, which can update packets-in-flight not trivially. | 2951 | /* Process an event, which can update packets-in-flight not trivially. |
2886 | * Main goal of this function is to calculate new estimate for left_out, | 2952 | * Main goal of this function is to calculate new estimate for left_out, |
2887 | * taking into account both packets sitting in receiver's buffer and | 2953 | * taking into account both packets sitting in receiver's buffer and |
@@ -2893,16 +2959,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2893 | * It does _not_ decide what to send, it is made in function | 2959 | * It does _not_ decide what to send, it is made in function |
2894 | * tcp_xmit_retransmit_queue(). | 2960 | * tcp_xmit_retransmit_queue(). |
2895 | */ | 2961 | */ |
2896 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | 2962 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) |
2897 | int prior_sacked, bool is_dupack, | ||
2898 | int flag) | ||
2899 | { | 2963 | { |
2900 | struct inet_connection_sock *icsk = inet_csk(sk); | 2964 | struct inet_connection_sock *icsk = inet_csk(sk); |
2901 | struct tcp_sock *tp = tcp_sk(sk); | 2965 | struct tcp_sock *tp = tcp_sk(sk); |
2966 | int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | ||
2902 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && | 2967 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
2903 | (tcp_fackets_out(tp) > tp->reordering)); | 2968 | (tcp_fackets_out(tp) > tp->reordering)); |
2904 | int newly_acked_sacked = 0; | 2969 | int fast_rexmit = 0, mib_idx; |
2905 | int fast_rexmit = 0; | ||
2906 | 2970 | ||
2907 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) | 2971 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) |
2908 | tp->sacked_out = 0; | 2972 | tp->sacked_out = 0; |
@@ -2918,10 +2982,19 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2918 | if (tcp_check_sack_reneging(sk, flag)) | 2982 | if (tcp_check_sack_reneging(sk, flag)) |
2919 | return; | 2983 | return; |
2920 | 2984 | ||
2921 | /* C. Check consistency of the current state. */ | 2985 | /* C. Process data loss notification, provided it is valid. */ |
2986 | if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && | ||
2987 | before(tp->snd_una, tp->high_seq) && | ||
2988 | icsk->icsk_ca_state != TCP_CA_Open && | ||
2989 | tp->fackets_out > tp->reordering) { | ||
2990 | tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); | ||
2991 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); | ||
2992 | } | ||
2993 | |||
2994 | /* D. Check consistency of the current state. */ | ||
2922 | tcp_verify_left_out(tp); | 2995 | tcp_verify_left_out(tp); |
2923 | 2996 | ||
2924 | /* D. Check state exit conditions. State can be terminated | 2997 | /* E. Check state exit conditions. State can be terminated |
2925 | * when high_seq is ACKed. */ | 2998 | * when high_seq is ACKed. */ |
2926 | if (icsk->icsk_ca_state == TCP_CA_Open) { | 2999 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
2927 | WARN_ON(tp->retrans_out != 0); | 3000 | WARN_ON(tp->retrans_out != 0); |
@@ -2938,7 +3011,18 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2938 | /* CWR is to be held something *above* high_seq | 3011 | /* CWR is to be held something *above* high_seq |
2939 | * is ACKed for CWR bit to reach receiver. */ | 3012 | * is ACKed for CWR bit to reach receiver. */ |
2940 | if (tp->snd_una != tp->high_seq) { | 3013 | if (tp->snd_una != tp->high_seq) { |
2941 | tcp_end_cwnd_reduction(sk); | 3014 | tcp_complete_cwr(sk); |
3015 | tcp_set_ca_state(sk, TCP_CA_Open); | ||
3016 | } | ||
3017 | break; | ||
3018 | |||
3019 | case TCP_CA_Disorder: | ||
3020 | tcp_try_undo_dsack(sk); | ||
3021 | if (!tp->undo_marker || | ||
3022 | /* For the SACK case do not go to Open, so that undo | ||
3023 | * can still catch all duplicate ACKs. */ | ||
3024 | tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { | ||
3025 | tp->undo_marker = 0; | ||
2942 | tcp_set_ca_state(sk, TCP_CA_Open); | 3026 | tcp_set_ca_state(sk, TCP_CA_Open); |
2943 | } | 3027 | } |
2944 | break; | 3028 | break; |
@@ -2948,12 +3032,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2948 | tcp_reset_reno_sack(tp); | 3032 | tcp_reset_reno_sack(tp); |
2949 | if (tcp_try_undo_recovery(sk)) | 3033 | if (tcp_try_undo_recovery(sk)) |
2950 | return; | 3034 | return; |
2951 | tcp_end_cwnd_reduction(sk); | 3035 | tcp_complete_cwr(sk); |
2952 | break; | 3036 | break; |
2953 | } | 3037 | } |
2954 | } | 3038 | } |
2955 | 3039 | ||
2956 | /* E. Process state. */ | 3040 | /* F. Process state. */ |
2957 | switch (icsk->icsk_ca_state) { | 3041 | switch (icsk->icsk_ca_state) { |
2958 | case TCP_CA_Recovery: | 3042 | case TCP_CA_Recovery: |
2959 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { | 3043 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
@@ -2961,7 +3045,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2961 | tcp_add_reno_sack(sk); | 3045 | tcp_add_reno_sack(sk); |
2962 | } else | 3046 | } else |
2963 | do_lost = tcp_try_undo_partial(sk, pkts_acked); | 3047 | do_lost = tcp_try_undo_partial(sk, pkts_acked); |
2964 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
2965 | break; | 3048 | break; |
2966 | case TCP_CA_Loss: | 3049 | case TCP_CA_Loss: |
2967 | if (flag & FLAG_DATA_ACKED) | 3050 | if (flag & FLAG_DATA_ACKED) |
@@ -2983,13 +3066,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2983 | if (is_dupack) | 3066 | if (is_dupack) |
2984 | tcp_add_reno_sack(sk); | 3067 | tcp_add_reno_sack(sk); |
2985 | } | 3068 | } |
2986 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
2987 | 3069 | ||
2988 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) | 3070 | if (icsk->icsk_ca_state == TCP_CA_Disorder) |
2989 | tcp_try_undo_dsack(sk); | 3071 | tcp_try_undo_dsack(sk); |
2990 | 3072 | ||
2991 | if (!tcp_time_to_recover(sk, flag)) { | 3073 | if (!tcp_time_to_recover(sk)) { |
2992 | tcp_try_to_open(sk, flag, newly_acked_sacked); | 3074 | tcp_try_to_open(sk, flag); |
2993 | return; | 3075 | return; |
2994 | } | 3076 | } |
2995 | 3077 | ||
@@ -3005,13 +3087,35 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
3005 | } | 3087 | } |
3006 | 3088 | ||
3007 | /* Otherwise enter Recovery state */ | 3089 | /* Otherwise enter Recovery state */ |
3008 | tcp_enter_recovery(sk, (flag & FLAG_ECE)); | 3090 | |
3091 | if (tcp_is_reno(tp)) | ||
3092 | mib_idx = LINUX_MIB_TCPRENORECOVERY; | ||
3093 | else | ||
3094 | mib_idx = LINUX_MIB_TCPSACKRECOVERY; | ||
3095 | |||
3096 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
3097 | |||
3098 | tp->high_seq = tp->snd_nxt; | ||
3099 | tp->prior_ssthresh = 0; | ||
3100 | tp->undo_marker = tp->snd_una; | ||
3101 | tp->undo_retrans = tp->retrans_out; | ||
3102 | |||
3103 | if (icsk->icsk_ca_state < TCP_CA_CWR) { | ||
3104 | if (!(flag & FLAG_ECE)) | ||
3105 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
3106 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
3107 | TCP_ECN_queue_cwr(tp); | ||
3108 | } | ||
3109 | |||
3110 | tp->bytes_acked = 0; | ||
3111 | tp->snd_cwnd_cnt = 0; | ||
3112 | tcp_set_ca_state(sk, TCP_CA_Recovery); | ||
3009 | fast_rexmit = 1; | 3113 | fast_rexmit = 1; |
3010 | } | 3114 | } |
3011 | 3115 | ||
3012 | if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) | 3116 | if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) |
3013 | tcp_update_scoreboard(sk, fast_rexmit); | 3117 | tcp_update_scoreboard(sk, fast_rexmit); |
3014 | tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); | 3118 | tcp_cwnd_down(sk, flag); |
3015 | tcp_xmit_retransmit_queue(sk); | 3119 | tcp_xmit_retransmit_queue(sk); |
3016 | } | 3120 | } |
3017 | 3121 | ||
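tcp_fastretrans_alert() is the heart of the recovery state machine: it decides when to leave Open, when to enter Recovery, and when an ACK covering high_seq lets the connection come back out. A heavily reduced user-space sketch of just the Open -> Recovery -> Open path for a Reno-style sender; SACK, CWR, Disorder, Loss, undo and the actual retransmissions are all omitted, and the threshold handling is simplified for illustration:

/* Hedged sketch of the Open -> Recovery -> Open portion of the congestion
 * state machine: enter Recovery once more than "reordering" duplicate ACKs
 * are seen, remember snd_nxt as the recovery point, and return to Open when
 * the cumulative ACK reaches it.
 */
#include <stdio.h>

enum ca_state { CA_OPEN, CA_RECOVERY };

struct mini_sk {
    enum ca_state state;
    unsigned int dup_acks;
    unsigned int reordering;   /* dupack threshold, e.g. 3 */
    unsigned int snd_una;
    unsigned int high_seq;     /* snd_nxt snapshot at entry */
    unsigned int snd_nxt;
};

static void on_ack(struct mini_sk *sk, unsigned int ack, int is_dupack)
{
    if (ack > sk->snd_una)
        sk->snd_una = ack;

    switch (sk->state) {
    case CA_OPEN:
        sk->dup_acks = is_dupack ? sk->dup_acks + 1 : 0;
        if (sk->dup_acks > sk->reordering) {
            sk->high_seq = sk->snd_nxt;   /* remember the recovery point */
            sk->state = CA_RECOVERY;
            printf("enter Recovery, high_seq=%u\n", sk->high_seq);
        }
        break;
    case CA_RECOVERY:
        if (sk->snd_una >= sk->high_seq) {   /* recovery point ACKed */
            sk->state = CA_OPEN;
            sk->dup_acks = 0;
            printf("back to Open at una=%u\n", sk->snd_una);
        }
        break;
    }
}

int main(void)
{
    struct mini_sk sk = { CA_OPEN, 0, 3, 100, 0, 200 };
    int i;

    for (i = 0; i < 4; i++)   /* four duplicate ACKs for seq 100 */
        on_ack(&sk, 100, 1);
    on_ack(&sk, 200, 0);      /* cumulative ACK past high_seq */
    return 0;
}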
@@ -3086,53 +3190,16 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | |||
3086 | /* Restart timer after forward progress on connection. | 3190 | /* Restart timer after forward progress on connection. |
3087 | * RFC2988 recommends to restart timer to now+rto. | 3191 | * RFC2988 recommends to restart timer to now+rto. |
3088 | */ | 3192 | */ |
3089 | void tcp_rearm_rto(struct sock *sk) | 3193 | static void tcp_rearm_rto(struct sock *sk) |
3090 | { | 3194 | { |
3091 | struct tcp_sock *tp = tcp_sk(sk); | 3195 | struct tcp_sock *tp = tcp_sk(sk); |
3092 | 3196 | ||
3093 | /* If the retrans timer is currently being used by Fast Open | ||
3094 | * for SYN-ACK retrans purpose, stay put. | ||
3095 | */ | ||
3096 | if (tp->fastopen_rsk) | ||
3097 | return; | ||
3098 | |||
3099 | if (!tp->packets_out) { | 3197 | if (!tp->packets_out) { |
3100 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 3198 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
3101 | } else { | 3199 | } else { |
3102 | u32 rto = inet_csk(sk)->icsk_rto; | 3200 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
3103 | /* Offset the time elapsed after installing regular RTO */ | 3201 | inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
3104 | if (tp->early_retrans_delayed) { | ||
3105 | struct sk_buff *skb = tcp_write_queue_head(sk); | ||
3106 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | ||
3107 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | ||
3108 | /* delta may not be positive if the socket is locked | ||
3109 | * when the delayed ER timer fires and is rescheduled. | ||
3110 | */ | ||
3111 | if (delta > 0) | ||
3112 | rto = delta; | ||
3113 | } | ||
3114 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, | ||
3115 | TCP_RTO_MAX); | ||
3116 | } | 3202 | } |
3117 | tp->early_retrans_delayed = 0; | ||
3118 | } | ||
3119 | |||
3120 | /* This function is called when the delayed ER timer fires. TCP enters | ||
3121 | * fast recovery and performs fast-retransmit. | ||
3122 | */ | ||
3123 | void tcp_resume_early_retransmit(struct sock *sk) | ||
3124 | { | ||
3125 | struct tcp_sock *tp = tcp_sk(sk); | ||
3126 | |||
3127 | tcp_rearm_rto(sk); | ||
3128 | |||
3129 | /* Stop if ER is disabled after the delayed ER timer is scheduled */ | ||
3130 | if (!tp->do_early_retrans) | ||
3131 | return; | ||
3132 | |||
3133 | tcp_enter_recovery(sk, false); | ||
3134 | tcp_update_scoreboard(sk, 1); | ||
3135 | tcp_xmit_retransmit_queue(sk); | ||
3136 | } | 3203 | } |
3137 | 3204 | ||
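In the replaced tcp_rearm_rto(), when a delayed early-retransmit timer had been armed, re-arming the RTO has to subtract the time already elapsed since the head skb was stamped. A small sketch of that offset arithmetic with a plain integer clock; the names are illustrative, not kernel API:

/* Hedged sketch of the "offset the elapsed time" step: the effective
 * timeout is (send_stamp + rto) - now, falling back to the full rto when
 * that difference is not positive (the original deadline already passed).
 */
#include <stdio.h>

static long effective_rto(long send_stamp, long rto, long now)
{
    long delta = (send_stamp + rto) - now;   /* time left of the original RTO */

    return delta > 0 ? delta : rto;
}

int main(void)
{
    /* Head sent at t=1000 with rto=300: at t=1100 only 200 remain;
     * at t=1400 the window has passed, so a fresh full rto is used.
     */
    printf("%ld %ld\n", effective_rto(1000, 300, 1100),
           effective_rto(1000, 300, 1400));
    return 0;
}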
3138 | /* If we get here, the whole TSO packet has not been acked. */ | 3205 | /* If we get here, the whole TSO packet has not been acked. */ |
@@ -3167,7 +3234,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3167 | const struct inet_connection_sock *icsk = inet_csk(sk); | 3234 | const struct inet_connection_sock *icsk = inet_csk(sk); |
3168 | struct sk_buff *skb; | 3235 | struct sk_buff *skb; |
3169 | u32 now = tcp_time_stamp; | 3236 | u32 now = tcp_time_stamp; |
3170 | int fully_acked = true; | 3237 | int fully_acked = 1; |
3171 | int flag = 0; | 3238 | int flag = 0; |
3172 | u32 pkts_acked = 0; | 3239 | u32 pkts_acked = 0; |
3173 | u32 reord = tp->packets_out; | 3240 | u32 reord = tp->packets_out; |
@@ -3191,7 +3258,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3191 | if (!acked_pcount) | 3258 | if (!acked_pcount) |
3192 | break; | 3259 | break; |
3193 | 3260 | ||
3194 | fully_acked = false; | 3261 | fully_acked = 0; |
3195 | } else { | 3262 | } else { |
3196 | acked_pcount = tcp_skb_pcount(skb); | 3263 | acked_pcount = tcp_skb_pcount(skb); |
3197 | } | 3264 | } |
@@ -3229,7 +3296,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3229 | * connection startup slow start one packet too | 3296 | * connection startup slow start one packet too |
3230 | * quickly. This is severely frowned upon behavior. | 3297 | * quickly. This is severely frowned upon behavior. |
3231 | */ | 3298 | */ |
3232 | if (!(scb->tcp_flags & TCPHDR_SYN)) { | 3299 | if (!(scb->flags & TCPHDR_SYN)) { |
3233 | flag |= FLAG_DATA_ACKED; | 3300 | flag |= FLAG_DATA_ACKED; |
3234 | } else { | 3301 | } else { |
3235 | flag |= FLAG_SYN_ACKED; | 3302 | flag |= FLAG_SYN_ACKED; |
@@ -3308,18 +3375,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3308 | if (!tp->packets_out && tcp_is_sack(tp)) { | 3375 | if (!tp->packets_out && tcp_is_sack(tp)) { |
3309 | icsk = inet_csk(sk); | 3376 | icsk = inet_csk(sk); |
3310 | if (tp->lost_out) { | 3377 | if (tp->lost_out) { |
3311 | pr_debug("Leak l=%u %d\n", | 3378 | printk(KERN_DEBUG "Leak l=%u %d\n", |
3312 | tp->lost_out, icsk->icsk_ca_state); | 3379 | tp->lost_out, icsk->icsk_ca_state); |
3313 | tp->lost_out = 0; | 3380 | tp->lost_out = 0; |
3314 | } | 3381 | } |
3315 | if (tp->sacked_out) { | 3382 | if (tp->sacked_out) { |
3316 | pr_debug("Leak s=%u %d\n", | 3383 | printk(KERN_DEBUG "Leak s=%u %d\n", |
3317 | tp->sacked_out, icsk->icsk_ca_state); | 3384 | tp->sacked_out, icsk->icsk_ca_state); |
3318 | tp->sacked_out = 0; | 3385 | tp->sacked_out = 0; |
3319 | } | 3386 | } |
3320 | if (tp->retrans_out) { | 3387 | if (tp->retrans_out) { |
3321 | pr_debug("Leak r=%u %d\n", | 3388 | printk(KERN_DEBUG "Leak r=%u %d\n", |
3322 | tp->retrans_out, icsk->icsk_ca_state); | 3389 | tp->retrans_out, icsk->icsk_ca_state); |
3323 | tp->retrans_out = 0; | 3390 | tp->retrans_out = 0; |
3324 | } | 3391 | } |
3325 | } | 3392 | } |
@@ -3347,23 +3414,23 @@ static void tcp_ack_probe(struct sock *sk) | |||
3347 | } | 3414 | } |
3348 | } | 3415 | } |
3349 | 3416 | ||
3350 | static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) | 3417 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) |
3351 | { | 3418 | { |
3352 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 3419 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
3353 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; | 3420 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
3354 | } | 3421 | } |
3355 | 3422 | ||
3356 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3423 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
3357 | { | 3424 | { |
3358 | const struct tcp_sock *tp = tcp_sk(sk); | 3425 | const struct tcp_sock *tp = tcp_sk(sk); |
3359 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 3426 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && |
3360 | !tcp_in_cwnd_reduction(sk); | 3427 | !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); |
3361 | } | 3428 | } |
3362 | 3429 | ||
3363 | /* Check that window update is acceptable. | 3430 | /* Check that window update is acceptable. |
3364 | * The function assumes that snd_una<=ack<=snd_next. | 3431 | * The function assumes that snd_una<=ack<=snd_next. |
3365 | */ | 3432 | */ |
3366 | static inline bool tcp_may_update_window(const struct tcp_sock *tp, | 3433 | static inline int tcp_may_update_window(const struct tcp_sock *tp, |
3367 | const u32 ack, const u32 ack_seq, | 3434 | const u32 ack, const u32 ack_seq, |
3368 | const u32 nwin) | 3435 | const u32 nwin) |
3369 | { | 3436 | { |
@@ -3377,7 +3444,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, | |||
3377 | * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 | 3444 | * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 |
3378 | * and in FreeBSD. NetBSD's one is even worse.) is wrong. | 3445 | * and in FreeBSD. NetBSD's one is even worse.) is wrong. |
3379 | */ | 3446 | */ |
3380 | static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, | 3447 | static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, |
3381 | u32 ack_seq) | 3448 | u32 ack_seq) |
3382 | { | 3449 | { |
3383 | struct tcp_sock *tp = tcp_sk(sk); | 3450 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -3425,9 +3492,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) | |||
3425 | } | 3492 | } |
3426 | 3493 | ||
3427 | /* A conservative spurious RTO response algorithm: reduce cwnd using | 3494 | /* A conservative spurious RTO response algorithm: reduce cwnd using |
3428 | * PRR and continue in congestion avoidance. | 3495 | * rate halving and continue in congestion avoidance. |
3429 | */ | 3496 | */ |
3430 | static void tcp_cwr_spur_to_response(struct sock *sk) | 3497 | static void tcp_ratehalving_spur_to_response(struct sock *sk) |
3431 | { | 3498 | { |
3432 | tcp_enter_cwr(sk, 0); | 3499 | tcp_enter_cwr(sk, 0); |
3433 | } | 3500 | } |
@@ -3435,7 +3502,7 @@ static void tcp_cwr_spur_to_response(struct sock *sk) | |||
3435 | static void tcp_undo_spur_to_response(struct sock *sk, int flag) | 3502 | static void tcp_undo_spur_to_response(struct sock *sk, int flag) |
3436 | { | 3503 | { |
3437 | if (flag & FLAG_ECE) | 3504 | if (flag & FLAG_ECE) |
3438 | tcp_cwr_spur_to_response(sk); | 3505 | tcp_ratehalving_spur_to_response(sk); |
3439 | else | 3506 | else |
3440 | tcp_undo_cwr(sk, true); | 3507 | tcp_undo_cwr(sk, true); |
3441 | } | 3508 | } |
@@ -3470,7 +3537,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) | |||
3470 | * to prove that the RTO is indeed spurious. It transfers the control | 3537 | * to prove that the RTO is indeed spurious. It transfers the control |
3471 | * from F-RTO to the conventional RTO recovery | 3538 | * from F-RTO to the conventional RTO recovery |
3472 | */ | 3539 | */ |
3473 | static bool tcp_process_frto(struct sock *sk, int flag) | 3540 | static int tcp_process_frto(struct sock *sk, int flag) |
3474 | { | 3541 | { |
3475 | struct tcp_sock *tp = tcp_sk(sk); | 3542 | struct tcp_sock *tp = tcp_sk(sk); |
3476 | 3543 | ||
@@ -3486,7 +3553,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
3486 | 3553 | ||
3487 | if (!before(tp->snd_una, tp->frto_highmark)) { | 3554 | if (!before(tp->snd_una, tp->frto_highmark)) { |
3488 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); | 3555 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); |
3489 | return true; | 3556 | return 1; |
3490 | } | 3557 | } |
3491 | 3558 | ||
3492 | if (!tcp_is_sackfrto(tp)) { | 3559 | if (!tcp_is_sackfrto(tp)) { |
@@ -3495,19 +3562,19 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
3495 | * data, winupdate | 3562 | * data, winupdate |
3496 | */ | 3563 | */ |
3497 | if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) | 3564 | if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) |
3498 | return true; | 3565 | return 1; |
3499 | 3566 | ||
3500 | if (!(flag & FLAG_DATA_ACKED)) { | 3567 | if (!(flag & FLAG_DATA_ACKED)) { |
3501 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), | 3568 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), |
3502 | flag); | 3569 | flag); |
3503 | return true; | 3570 | return 1; |
3504 | } | 3571 | } |
3505 | } else { | 3572 | } else { |
3506 | if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { | 3573 | if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { |
3507 | /* Prevent sending of new data. */ | 3574 | /* Prevent sending of new data. */ |
3508 | tp->snd_cwnd = min(tp->snd_cwnd, | 3575 | tp->snd_cwnd = min(tp->snd_cwnd, |
3509 | tcp_packets_in_flight(tp)); | 3576 | tcp_packets_in_flight(tp)); |
3510 | return true; | 3577 | return 1; |
3511 | } | 3578 | } |
3512 | 3579 | ||
3513 | if ((tp->frto_counter >= 2) && | 3580 | if ((tp->frto_counter >= 2) && |
@@ -3517,10 +3584,10 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
3517 | /* RFC4138 shortcoming (see comment above) */ | 3584 | /* RFC4138 shortcoming (see comment above) */ |
3518 | if (!(flag & FLAG_FORWARD_PROGRESS) && | 3585 | if (!(flag & FLAG_FORWARD_PROGRESS) && |
3519 | (flag & FLAG_NOT_DUP)) | 3586 | (flag & FLAG_NOT_DUP)) |
3520 | return true; | 3587 | return 1; |
3521 | 3588 | ||
3522 | tcp_enter_frto_loss(sk, 3, flag); | 3589 | tcp_enter_frto_loss(sk, 3, flag); |
3523 | return true; | 3590 | return 1; |
3524 | } | 3591 | } |
3525 | } | 3592 | } |
3526 | 3593 | ||
@@ -3532,7 +3599,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
3532 | if (!tcp_may_send_now(sk)) | 3599 | if (!tcp_may_send_now(sk)) |
3533 | tcp_enter_frto_loss(sk, 2, flag); | 3600 | tcp_enter_frto_loss(sk, 2, flag); |
3534 | 3601 | ||
3535 | return true; | 3602 | return 1; |
3536 | } else { | 3603 | } else { |
3537 | switch (sysctl_tcp_frto_response) { | 3604 | switch (sysctl_tcp_frto_response) { |
3538 | case 2: | 3605 | case 2: |
@@ -3542,61 +3609,34 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
3542 | tcp_conservative_spur_to_response(tp); | 3609 | tcp_conservative_spur_to_response(tp); |
3543 | break; | 3610 | break; |
3544 | default: | 3611 | default: |
3545 | tcp_cwr_spur_to_response(sk); | 3612 | tcp_ratehalving_spur_to_response(sk); |
3546 | break; | 3613 | break; |
3547 | } | 3614 | } |
3548 | tp->frto_counter = 0; | 3615 | tp->frto_counter = 0; |
3549 | tp->undo_marker = 0; | 3616 | tp->undo_marker = 0; |
3550 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); | 3617 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); |
3551 | } | 3618 | } |
3552 | return false; | 3619 | return 0; |
3553 | } | ||
3554 | |||
3555 | /* RFC 5961 7 [ACK Throttling] */ | ||
3556 | static void tcp_send_challenge_ack(struct sock *sk) | ||
3557 | { | ||
3558 | /* unprotected vars, we dont care of overwrites */ | ||
3559 | static u32 challenge_timestamp; | ||
3560 | static unsigned int challenge_count; | ||
3561 | u32 now = jiffies / HZ; | ||
3562 | |||
3563 | if (now != challenge_timestamp) { | ||
3564 | challenge_timestamp = now; | ||
3565 | challenge_count = 0; | ||
3566 | } | ||
3567 | if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { | ||
3568 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); | ||
3569 | tcp_send_ack(sk); | ||
3570 | } | ||
3571 | } | 3620 | } |
3572 | 3621 | ||
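The deleted tcp_send_challenge_ack() throttles RFC 5961 challenge ACKs with a global per-second counter. A user-space sketch of the same bucket-per-second idea, with a limit of 100 per second chosen here purely for illustration and the actual ACK transmission reduced to a counter:

/* Hedged sketch of challenge-ACK throttling: the counter resets whenever
 * the current one-second bucket changes, and a challenge ACK is allowed
 * only while the counter stays within the limit.
 */
#include <stdbool.h>
#include <stdio.h>

#define CHALLENGE_ACK_LIMIT 100u   /* illustrative per-second limit */

static bool challenge_ack_allowed(unsigned long now_sec)
{
    static unsigned long bucket_sec;
    static unsigned int count;

    if (now_sec != bucket_sec) {   /* new one-second bucket */
        bucket_sec = now_sec;
        count = 0;
    }
    return ++count <= CHALLENGE_ACK_LIMIT;
}

int main(void)
{
    unsigned int sent = 0, i;

    for (i = 0; i < 250; i++)      /* 250 attempts inside one second */
        if (challenge_ack_allowed(42))
            sent++;
    printf("sent %u of 250 in second 42\n", sent);   /* prints 100 */
    return 0;
}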
3573 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3622 | /* This routine deals with incoming acks, but not outgoing ones. */ |
3574 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3623 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
3575 | { | 3624 | { |
3576 | struct inet_connection_sock *icsk = inet_csk(sk); | 3625 | struct inet_connection_sock *icsk = inet_csk(sk); |
3577 | struct tcp_sock *tp = tcp_sk(sk); | 3626 | struct tcp_sock *tp = tcp_sk(sk); |
3578 | u32 prior_snd_una = tp->snd_una; | 3627 | u32 prior_snd_una = tp->snd_una; |
3579 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3628 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3580 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3629 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3581 | bool is_dupack = false; | ||
3582 | u32 prior_in_flight; | 3630 | u32 prior_in_flight; |
3583 | u32 prior_fackets; | 3631 | u32 prior_fackets; |
3584 | int prior_packets; | 3632 | int prior_packets; |
3585 | int prior_sacked = tp->sacked_out; | 3633 | int frto_cwnd = 0; |
3586 | int pkts_acked = 0; | ||
3587 | bool frto_cwnd = false; | ||
3588 | 3634 | ||
3589 | /* If the ack is older than previous acks | 3635 | /* If the ack is older than previous acks |
3590 | * then we can probably ignore it. | 3636 | * then we can probably ignore it. |
3591 | */ | 3637 | */ |
3592 | if (before(ack, prior_snd_una)) { | 3638 | if (before(ack, prior_snd_una)) |
3593 | /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ | ||
3594 | if (before(ack, prior_snd_una - tp->max_window)) { | ||
3595 | tcp_send_challenge_ack(sk); | ||
3596 | return -1; | ||
3597 | } | ||
3598 | goto old_ack; | 3639 | goto old_ack; |
3599 | } | ||
3600 | 3640 | ||
3601 | /* If the ack includes data we haven't sent yet, discard | 3641 | /* If the ack includes data we haven't sent yet, discard |
3602 | * this segment (RFC793 Section 3.9). | 3642 | * this segment (RFC793 Section 3.9). |
@@ -3604,9 +3644,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3604 | if (after(ack, tp->snd_nxt)) | 3644 | if (after(ack, tp->snd_nxt)) |
3605 | goto invalid_ack; | 3645 | goto invalid_ack; |
3606 | 3646 | ||
3607 | if (tp->early_retrans_delayed) | ||
3608 | tcp_rearm_rto(sk); | ||
3609 | |||
3610 | if (after(ack, prior_snd_una)) | 3647 | if (after(ack, prior_snd_una)) |
3611 | flag |= FLAG_SND_UNA_ADVANCED; | 3648 | flag |= FLAG_SND_UNA_ADVANCED; |
3612 | 3649 | ||
@@ -3664,8 +3701,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3664 | /* See if we can take anything off of the retransmit queue. */ | 3701 | /* See if we can take anything off of the retransmit queue. */ |
3665 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); | 3702 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); |
3666 | 3703 | ||
3667 | pkts_acked = prior_packets - tp->packets_out; | ||
3668 | |||
3669 | if (tp->frto_counter) | 3704 | if (tp->frto_counter) |
3670 | frto_cwnd = tcp_process_frto(sk, flag); | 3705 | frto_cwnd = tcp_process_frto(sk, flag); |
3671 | /* Guarantee sacktag reordering detection against wrap-arounds */ | 3706 | /* Guarantee sacktag reordering detection against wrap-arounds */ |
@@ -3677,26 +3712,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3677 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && | 3712 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && |
3678 | tcp_may_raise_cwnd(sk, flag)) | 3713 | tcp_may_raise_cwnd(sk, flag)) |
3679 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3714 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3680 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3715 | tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, |
3681 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | 3716 | flag); |
3682 | is_dupack, flag); | ||
3683 | } else { | 3717 | } else { |
3684 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) | 3718 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) |
3685 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3719 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3686 | } | 3720 | } |
3687 | 3721 | ||
3688 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { | 3722 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) |
3689 | struct dst_entry *dst = __sk_dst_get(sk); | 3723 | dst_confirm(__sk_dst_get(sk)); |
3690 | if (dst) | 3724 | |
3691 | dst_confirm(dst); | ||
3692 | } | ||
3693 | return 1; | 3725 | return 1; |
3694 | 3726 | ||
3695 | no_queue: | 3727 | no_queue: |
3696 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ | ||
3697 | if (flag & FLAG_DSACKING_ACK) | ||
3698 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | ||
3699 | is_dupack, flag); | ||
3700 | /* If this ack opens up a zero window, clear backoff. It was | 3728 | /* If this ack opens up a zero window, clear backoff. It was |
3701 | * being used to time the probes, and is probably far higher than | 3729 | * being used to time the probes, and is probably far higher than |
3702 | * it needs to be for normal retransmission. | 3730 | * it needs to be for normal retransmission. |
@@ -3710,13 +3738,10 @@ invalid_ack: | |||
3710 | return -1; | 3738 | return -1; |
3711 | 3739 | ||
3712 | old_ack: | 3740 | old_ack: |
3713 | /* If data was SACKed, tag it and see if we should send more data. | ||
3714 | * If data was DSACKed, see if we can undo a cwnd reduction. | ||
3715 | */ | ||
3716 | if (TCP_SKB_CB(skb)->sacked) { | 3741 | if (TCP_SKB_CB(skb)->sacked) { |
3717 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3742 | tcp_sacktag_write_queue(sk, skb, prior_snd_una); |
3718 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | 3743 | if (icsk->icsk_ca_state == TCP_CA_Open) |
3719 | is_dupack, flag); | 3744 | tcp_try_keep_open(sk); |
3720 | } | 3745 | } |
3721 | 3746 | ||
3722 | SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); | 3747 | SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
@@ -3727,15 +3752,14 @@ old_ack: | |||
3727 | * But, this can also be called on packets in the established flow when | 3752 | * But, this can also be called on packets in the established flow when |
3728 | * the fast version below fails. | 3753 | * the fast version below fails. |
3729 | */ | 3754 | */ |
3730 | void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3755 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, |
3731 | const u8 **hvpp, int estab, | 3756 | u8 **hvpp, int estab) |
3732 | struct tcp_fastopen_cookie *foc) | ||
3733 | { | 3757 | { |
3734 | const unsigned char *ptr; | 3758 | unsigned char *ptr; |
3735 | const struct tcphdr *th = tcp_hdr(skb); | 3759 | struct tcphdr *th = tcp_hdr(skb); |
3736 | int length = (th->doff * 4) - sizeof(struct tcphdr); | 3760 | int length = (th->doff * 4) - sizeof(struct tcphdr); |
3737 | 3761 | ||
3738 | ptr = (const unsigned char *)(th + 1); | 3762 | ptr = (unsigned char *)(th + 1); |
3739 | opt_rx->saw_tstamp = 0; | 3763 | opt_rx->saw_tstamp = 0; |
3740 | 3764 | ||
3741 | while (length > 0) { | 3765 | while (length > 0) { |
@@ -3772,9 +3796,10 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
3772 | __u8 snd_wscale = *(__u8 *)ptr; | 3796 | __u8 snd_wscale = *(__u8 *)ptr; |
3773 | opt_rx->wscale_ok = 1; | 3797 | opt_rx->wscale_ok = 1; |
3774 | if (snd_wscale > 14) { | 3798 | if (snd_wscale > 14) { |
3775 | net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", | 3799 | if (net_ratelimit()) |
3776 | __func__, | 3800 | printk(KERN_INFO "tcp_parse_options: Illegal window " |
3777 | snd_wscale); | 3801 | "scaling value %d >14 received.\n", |
3802 | snd_wscale); | ||
3778 | snd_wscale = 14; | 3803 | snd_wscale = 14; |
3779 | } | 3804 | } |
3780 | opt_rx->snd_wscale = snd_wscale; | 3805 | opt_rx->snd_wscale = snd_wscale; |
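The clamp to 14 above exists because RFC 1323 caps the window-scale shift: with a 16-bit window field, a shift of 14 already allows windows of roughly 1 GiB, and a larger shift could not be represented unambiguously. A tiny arithmetic check, purely for illustration:

/* Hedged sketch: the largest receive window representable with a 16-bit
 * window field and a scale shift clamped to 14, as enforced above.
 */
#include <stdio.h>

int main(void)
{
    unsigned int max_raw_window = 65535;   /* 16-bit window field       */
    unsigned int max_wscale = 14;          /* RFC 1323 / RFC 7323 cap   */
    unsigned long long max_window =
        (unsigned long long)max_raw_window << max_wscale;

    /* 65535 << 14 == 1073725440 bytes, just under 1 GiB. */
    printf("%llu\n", max_window);
    return 0;
}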
@@ -3792,7 +3817,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
3792 | case TCPOPT_SACK_PERM: | 3817 | case TCPOPT_SACK_PERM: |
3793 | if (opsize == TCPOLEN_SACK_PERM && th->syn && | 3818 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
3794 | !estab && sysctl_tcp_sack) { | 3819 | !estab && sysctl_tcp_sack) { |
3795 | opt_rx->sack_ok = TCP_SACK_SEEN; | 3820 | opt_rx->sack_ok = 1; |
3796 | tcp_sack_reset(opt_rx); | 3821 | tcp_sack_reset(opt_rx); |
3797 | } | 3822 | } |
3798 | break; | 3823 | break; |
@@ -3836,25 +3861,8 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
3836 | break; | 3861 | break; |
3837 | } | 3862 | } |
3838 | break; | 3863 | break; |
3839 | |||
3840 | case TCPOPT_EXP: | ||
3841 | /* Fast Open option shares code 254 using a | ||
3842 | * 16 bits magic number. It's valid only in | ||
3843 | * SYN or SYN-ACK with an even size. | ||
3844 | */ | ||
3845 | if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || | ||
3846 | get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || | ||
3847 | foc == NULL || !th->syn || (opsize & 1)) | ||
3848 | break; | ||
3849 | foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; | ||
3850 | if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && | ||
3851 | foc->len <= TCP_FASTOPEN_COOKIE_MAX) | ||
3852 | memcpy(foc->val, ptr + 2, foc->len); | ||
3853 | else if (foc->len != 0) | ||
3854 | foc->len = -1; | ||
3855 | break; | ||
3856 | |||
3857 | } | 3864 | } |
3865 | |||
3858 | ptr += opsize-2; | 3866 | ptr += opsize-2; |
3859 | length -= opsize; | 3867 | length -= opsize; |
3860 | } | 3868 | } |
@@ -3862,9 +3870,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
3862 | } | 3870 | } |
3863 | EXPORT_SYMBOL(tcp_parse_options); | 3871 | EXPORT_SYMBOL(tcp_parse_options); |
3864 | 3872 | ||
3865 | static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) | 3873 | static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) |
3866 | { | 3874 | { |
3867 | const __be32 *ptr = (const __be32 *)(th + 1); | 3875 | __be32 *ptr = (__be32 *)(th + 1); |
3868 | 3876 | ||
3869 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 3877 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
3870 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { | 3878 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { |
@@ -3873,41 +3881,40 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr | |||
3873 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | 3881 | tp->rx_opt.rcv_tsval = ntohl(*ptr); |
3874 | ++ptr; | 3882 | ++ptr; |
3875 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); | 3883 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); |
3876 | return true; | 3884 | return 1; |
3877 | } | 3885 | } |
3878 | return false; | 3886 | return 0; |
3879 | } | 3887 | } |
3880 | 3888 | ||
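tcp_parse_aligned_timestamp() speeds up the common case by comparing the first 32-bit option word against the canonical "NOP, NOP, TIMESTAMP, length 10" layout as one constant. A sketch of how that constant is built and tested, using the standard option numbers (NOP = 1, TIMESTAMP = 8, length 10); the unaligned-access handling of the real code is not modelled:

/* Hedged sketch of the aligned-timestamp fast path: build the expected
 * first option word once and compare it, in network byte order, against
 * the word that follows the TCP header.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

int main(void)
{
    uint32_t expected = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                              (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
    /* A hand-built option area: 01 01 08 0a, then the two timestamps. */
    uint8_t opts[12] = { 0x01, 0x01, 0x08, 0x0a };
    uint32_t first_word;

    /* memcpy sidesteps alignment/aliasing issues; this only illustrates
     * the single-word comparison itself.
     */
    memcpy(&first_word, opts, sizeof(first_word));
    printf("fast path taken: %s\n",
           first_word == expected ? "yes" : "no");   /* prints "yes" */
    return 0;
}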
3881 | /* Fast parse options. This hopes to only see timestamps. | 3889 | /* Fast parse options. This hopes to only see timestamps. |
3882 | * If it is wrong it falls back on tcp_parse_options(). | 3890 | * If it is wrong it falls back on tcp_parse_options(). |
3883 | */ | 3891 | */ |
3884 | static bool tcp_fast_parse_options(const struct sk_buff *skb, | 3892 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, |
3885 | const struct tcphdr *th, | 3893 | struct tcp_sock *tp, u8 **hvpp) |
3886 | struct tcp_sock *tp, const u8 **hvpp) | ||
3887 | { | 3894 | { |
3888 | /* In the spirit of fast parsing, compare doff directly to constant | 3895 | /* In the spirit of fast parsing, compare doff directly to constant |
3889 | * values. Because equality is used, short doff can be ignored here. | 3896 | * values. Because equality is used, short doff can be ignored here. |
3890 | */ | 3897 | */ |
3891 | if (th->doff == (sizeof(*th) / 4)) { | 3898 | if (th->doff == (sizeof(*th) / 4)) { |
3892 | tp->rx_opt.saw_tstamp = 0; | 3899 | tp->rx_opt.saw_tstamp = 0; |
3893 | return false; | 3900 | return 0; |
3894 | } else if (tp->rx_opt.tstamp_ok && | 3901 | } else if (tp->rx_opt.tstamp_ok && |
3895 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { | 3902 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { |
3896 | if (tcp_parse_aligned_timestamp(tp, th)) | 3903 | if (tcp_parse_aligned_timestamp(tp, th)) |
3897 | return true; | 3904 | return 1; |
3898 | } | 3905 | } |
3899 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); | 3906 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); |
3900 | return true; | 3907 | return 1; |
3901 | } | 3908 | } |
3902 | 3909 | ||
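The two functions above implement the timestamp fast path: when the TCP header length shows either no options at all or exactly the 12-byte aligned timestamp block, the parser skips the generic option walk and compares a single 32-bit word. The standalone userspace sketch below illustrates that word-compare trick; helper names and the main() harness are invented here, so treat it as a simplified illustration rather than the kernel code.

/* Illustrative userspace sketch of the aligned-timestamp fast path
 * (hypothetical helper, not the kernel implementation). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

/* Returns 1 and fills tsval/tsecr when the 12 option bytes are exactly
 * NOP, NOP, TIMESTAMP, len=10, tsval, tsecr -- the layout the fast path
 * expects; returns 0 so the caller can fall back to the slow walker. */
static int parse_aligned_timestamp(const uint8_t *opt, uint32_t *tsval,
                                   uint32_t *tsecr)
{
    uint32_t word;
    const uint32_t pattern = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);

    memcpy(&word, opt, 4);            /* avoid unaligned access */
    if (word != pattern)
        return 0;

    memcpy(&word, opt + 4, 4);
    *tsval = ntohl(word);
    memcpy(&word, opt + 8, 4);
    *tsecr = ntohl(word);
    return 1;
}

int main(void)
{
    uint8_t opt[12] = { 1, 1, 8, 10,           /* NOP NOP TS len */
                        0, 0, 0x30, 0x39,      /* tsval = 12345  */
                        0, 0, 0x01, 0xc8 };    /* tsecr = 456    */
    uint32_t tsval, tsecr;

    if (parse_aligned_timestamp(opt, &tsval, &tsecr))
        printf("tsval=%u tsecr=%u\n", tsval, tsecr);
    return 0;
}

The single-word compare works because well-behaved stacks pad the timestamp option as NOP, NOP, kind 8, length 10, which makes the whole block 32-bit aligned.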
3903 | #ifdef CONFIG_TCP_MD5SIG | 3910 | #ifdef CONFIG_TCP_MD5SIG |
3904 | /* | 3911 | /* |
3905 | * Parse MD5 Signature option | 3912 | * Parse MD5 Signature option |
3906 | */ | 3913 | */ |
3907 | const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) | 3914 | u8 *tcp_parse_md5sig_option(struct tcphdr *th) |
3908 | { | 3915 | { |
3909 | int length = (th->doff << 2) - sizeof(*th); | 3916 | int length = (th->doff << 2) - sizeof (*th); |
3910 | const u8 *ptr = (const u8 *)(th + 1); | 3917 | u8 *ptr = (u8*)(th + 1); |
3911 | 3918 | ||
3912 | /* If the TCP option is too short, we can short cut */ | 3919 | /* If the TCP option is too short, we can short cut */ |
3913 | if (length < TCPOLEN_MD5SIG) | 3920 | if (length < TCPOLEN_MD5SIG) |
@@ -3984,8 +3991,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) | |||
3984 | 3991 | ||
3985 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) | 3992 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) |
3986 | { | 3993 | { |
3987 | const struct tcp_sock *tp = tcp_sk(sk); | 3994 | struct tcp_sock *tp = tcp_sk(sk); |
3988 | const struct tcphdr *th = tcp_hdr(skb); | 3995 | struct tcphdr *th = tcp_hdr(skb); |
3989 | u32 seq = TCP_SKB_CB(skb)->seq; | 3996 | u32 seq = TCP_SKB_CB(skb)->seq; |
3990 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3997 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3991 | 3998 | ||
@@ -4002,7 +4009,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) | |||
4002 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); | 4009 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
4003 | } | 4010 | } |
4004 | 4011 | ||
4005 | static inline bool tcp_paws_discard(const struct sock *sk, | 4012 | static inline int tcp_paws_discard(const struct sock *sk, |
4006 | const struct sk_buff *skb) | 4013 | const struct sk_buff *skb) |
4007 | { | 4014 | { |
4008 | const struct tcp_sock *tp = tcp_sk(sk); | 4015 | const struct tcp_sock *tp = tcp_sk(sk); |
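Both tcp_disordered_ack() and tcp_paws_discard() in the hunks above lean on wrap-safe timestamp arithmetic: a segment whose tsval is older than the remembered ts_recent is suspect. The tiny sketch below (illustrative helper name, plain userspace C) shows the signed-difference comparison that keeps this robust across 32-bit wraparound; the kernel applies further conditions (retransmission window, RST handling) that are omitted here.

/* Sketch of the wrap-safe timestamp comparison that PAWS-style checks
 * rely on; illustrative only. */
#include <stdint.h>
#include <stdio.h>

/* A received tsval is "older" than the remembered ts_recent when the
 * signed 32-bit difference is negative; this stays correct when the
 * timestamp clock wraps around. */
static int tsval_is_stale(uint32_t ts_recent, uint32_t rcv_tsval)
{
    return (int32_t)(rcv_tsval - ts_recent) < 0;
}

int main(void)
{
    printf("%d\n", tsval_is_stale(100, 90));          /* 1: stale            */
    printf("%d\n", tsval_is_stale(100, 150));         /* 0: fresh            */
    printf("%d\n", tsval_is_stale(0xfffffff0u, 5));   /* 0: fresh across wrap */
    return 0;
}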
@@ -4024,14 +4031,14 @@ static inline bool tcp_paws_discard(const struct sock *sk, | |||
4024 | * (borrowed from freebsd) | 4031 | * (borrowed from freebsd) |
4025 | */ | 4032 | */ |
4026 | 4033 | ||
4027 | static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) | 4034 | static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) |
4028 | { | 4035 | { |
4029 | return !before(end_seq, tp->rcv_wup) && | 4036 | return !before(end_seq, tp->rcv_wup) && |
4030 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); | 4037 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); |
4031 | } | 4038 | } |
4032 | 4039 | ||
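tcp_sequence() above is the core in-window test from RFC 793: a segment is acceptable only if it ends at or past the left window edge and starts no later than the right edge. A minimal userspace sketch follows, with invented names and the wrap-safe before()/after() helpers spelled out.

/* Userspace sketch of the in-window test performed by tcp_sequence();
 * simplified, names illustrative. */
#include <stdint.h>
#include <stdio.h>

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

/* A segment [seq, end_seq) is acceptable when it ends at or beyond the
 * left edge (rcv_wup) and starts no further than the right edge
 * (rcv_nxt + rcv_wnd). */
static int in_window(uint32_t seq, uint32_t end_seq,
                     uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t rcv_wnd)
{
    return !seq_before(end_seq, rcv_wup) &&
           !seq_after(seq, rcv_nxt + rcv_wnd);
}

int main(void)
{
    /* window is [1000, 1000 + 65535) */
    printf("%d\n", in_window(1000, 1100, 1000, 1000, 65535)); /* 1           */
    printf("%d\n", in_window(900,  950,  1000, 1000, 65535)); /* 0: old data */
    return 0;
}

The unsigned subtraction cast to a signed 32-bit value is what keeps both helpers correct when sequence numbers wrap past zero.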
4033 | /* When we get a reset we do this. */ | 4040 | /* When we get a reset we do this. */ |
4034 | void tcp_reset(struct sock *sk) | 4041 | static void tcp_reset(struct sock *sk) |
4035 | { | 4042 | { |
4036 | /* We want the right error as BSD sees it (and indeed as we do). */ | 4043 | /* We want the right error as BSD sees it (and indeed as we do). */ |
4037 | switch (sk->sk_state) { | 4044 | switch (sk->sk_state) { |
@@ -4069,7 +4076,7 @@ void tcp_reset(struct sock *sk) | |||
4069 | * | 4076 | * |
4070 | * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. | 4077 | * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. |
4071 | */ | 4078 | */ |
4072 | static void tcp_fin(struct sock *sk) | 4079 | static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) |
4073 | { | 4080 | { |
4074 | struct tcp_sock *tp = tcp_sk(sk); | 4081 | struct tcp_sock *tp = tcp_sk(sk); |
4075 | 4082 | ||
@@ -4113,7 +4120,7 @@ static void tcp_fin(struct sock *sk) | |||
4113 | /* Only TCP_LISTEN and TCP_CLOSE are left, in these | 4120 | /* Only TCP_LISTEN and TCP_CLOSE are left, in these |
4114 | * cases we should never reach this piece of code. | 4121 | * cases we should never reach this piece of code. |
4115 | */ | 4122 | */ |
4116 | pr_err("%s: Impossible, sk->sk_state=%d\n", | 4123 | printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", |
4117 | __func__, sk->sk_state); | 4124 | __func__, sk->sk_state); |
4118 | break; | 4125 | break; |
4119 | } | 4126 | } |
@@ -4138,7 +4145,7 @@ static void tcp_fin(struct sock *sk) | |||
4138 | } | 4145 | } |
4139 | } | 4146 | } |
4140 | 4147 | ||
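tcp_fin() above drives the receiver-side half-close transitions. The sketch below models only the transitions relevant to this code path, using a hypothetical enum rather than the kernel's sk_state values, and assumes the FIN arrives in order.

/* Simplified sketch of the receiver-side FIN transitions handled by
 * tcp_fin(); the enum is purely illustrative. */
#include <stdio.h>

enum state { ESTABLISHED, FIN_WAIT1, FIN_WAIT2, CLOSE_WAIT, CLOSING, TIME_WAIT };

/* State to move to when the peer's FIN is received in the given state
 * (per RFC 793: FIN_WAIT1 -> CLOSING, FIN_WAIT2 -> TIME_WAIT, and
 * CLOSE_WAIT for a connection that was still fully open). */
static enum state on_fin(enum state s)
{
    switch (s) {
    case ESTABLISHED: return CLOSE_WAIT;
    case FIN_WAIT1:   return CLOSING;   /* our own FIN not yet ACKed */
    case FIN_WAIT2:   return TIME_WAIT;
    default:          return s;         /* already closing: no change */
    }
}

int main(void)
{
    printf("%d\n", on_fin(ESTABLISHED) == CLOSE_WAIT); /* 1 */
    printf("%d\n", on_fin(FIN_WAIT2) == TIME_WAIT);    /* 1 */
    return 0;
}

FIN_WAIT1 moves to CLOSING rather than TIME_WAIT because our own FIN has not been acknowledged yet; once that ACK arrives the socket proceeds to TIME_WAIT.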
4141 | static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, | 4148 | static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, |
4142 | u32 end_seq) | 4149 | u32 end_seq) |
4143 | { | 4150 | { |
4144 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { | 4151 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { |
@@ -4146,9 +4153,9 @@ static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, | |||
4146 | sp->start_seq = seq; | 4153 | sp->start_seq = seq; |
4147 | if (after(end_seq, sp->end_seq)) | 4154 | if (after(end_seq, sp->end_seq)) |
4148 | sp->end_seq = end_seq; | 4155 | sp->end_seq = end_seq; |
4149 | return true; | 4156 | return 1; |
4150 | } | 4157 | } |
4151 | return false; | 4158 | return 0; |
4152 | } | 4159 | } |
4153 | 4160 | ||
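tcp_sack_extend() above widens an existing SACK block when a new range touches or overlaps it, and reports failure otherwise so the caller can start a new block. The same logic in a self-contained userspace form, with a toy struct and invented test values:

/* Standalone sketch of the SACK-block merge performed by
 * tcp_sack_extend(). */
#include <stdint.h>
#include <stdio.h>

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

struct sack_block { uint32_t start_seq, end_seq; };

/* Returns 1 and extends *sp when [seq, end_seq] is adjacent to or
 * overlaps the existing block, 0 when the ranges are disjoint. */
static int sack_extend(struct sack_block *sp, uint32_t seq, uint32_t end_seq)
{
    if (!seq_after(seq, sp->end_seq) && !seq_after(sp->start_seq, end_seq)) {
        if (seq_before(seq, sp->start_seq))
            sp->start_seq = seq;
        if (seq_after(end_seq, sp->end_seq))
            sp->end_seq = end_seq;
        return 1;
    }
    return 0;
}

int main(void)
{
    struct sack_block b = { .start_seq = 100, .end_seq = 200 };

    printf("%d [%u,%u]\n", sack_extend(&b, 200, 300), b.start_seq, b.end_seq);
    /* 1 [100,300]: contiguous range merged */
    printf("%d [%u,%u]\n", sack_extend(&b, 500, 600), b.start_seq, b.end_seq);
    /* 0 [100,300]: disjoint range rejected */
    return 0;
}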
4154 | static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) | 4161 | static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) |
@@ -4181,7 +4188,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) | |||
4181 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); | 4188 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
4182 | } | 4189 | } |
4183 | 4190 | ||
4184 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) | 4191 | static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) |
4185 | { | 4192 | { |
4186 | struct tcp_sock *tp = tcp_sk(sk); | 4193 | struct tcp_sock *tp = tcp_sk(sk); |
4187 | 4194 | ||
@@ -4340,258 +4347,37 @@ static void tcp_ofo_queue(struct sock *sk) | |||
4340 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 4347 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
4341 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4348 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4342 | if (tcp_hdr(skb)->fin) | 4349 | if (tcp_hdr(skb)->fin) |
4343 | tcp_fin(sk); | 4350 | tcp_fin(skb, sk, tcp_hdr(skb)); |
4344 | } | 4351 | } |
4345 | } | 4352 | } |
4346 | 4353 | ||
4347 | static bool tcp_prune_ofo_queue(struct sock *sk); | 4354 | static int tcp_prune_ofo_queue(struct sock *sk); |
4348 | static int tcp_prune_queue(struct sock *sk); | 4355 | static int tcp_prune_queue(struct sock *sk); |
4349 | 4356 | ||
4350 | static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, | 4357 | static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) |
4351 | unsigned int size) | ||
4352 | { | 4358 | { |
4353 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || | 4359 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
4354 | !sk_rmem_schedule(sk, skb, size)) { | 4360 | !sk_rmem_schedule(sk, size)) { |
4355 | 4361 | ||
4356 | if (tcp_prune_queue(sk) < 0) | 4362 | if (tcp_prune_queue(sk) < 0) |
4357 | return -1; | 4363 | return -1; |
4358 | 4364 | ||
4359 | if (!sk_rmem_schedule(sk, skb, size)) { | 4365 | if (!sk_rmem_schedule(sk, size)) { |
4360 | if (!tcp_prune_ofo_queue(sk)) | 4366 | if (!tcp_prune_ofo_queue(sk)) |
4361 | return -1; | 4367 | return -1; |
4362 | 4368 | ||
4363 | if (!sk_rmem_schedule(sk, skb, size)) | 4369 | if (!sk_rmem_schedule(sk, size)) |
4364 | return -1; | 4370 | return -1; |
4365 | } | 4371 | } |
4366 | } | 4372 | } |
4367 | return 0; | 4373 | return 0; |
4368 | } | 4374 | } |
4369 | 4375 | ||
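tcp_try_rmem_schedule() above is an admission check with escalating fallbacks: charge the segment against the receive budget, and if that fails collapse the queues, then as a last resort purge the out-of-order queue before giving up. A loose userspace sketch of that control flow follows; the prune helpers are stand-ins that only pretend to reclaim memory, not the kernel functions.

/* Illustrative admission logic with prune fallbacks. */
#include <stdio.h>

struct rcv_budget {
    unsigned int used;   /* bytes currently charged       */
    unsigned int limit;  /* receive buffer limit (rcvbuf) */
};

static int try_charge(struct rcv_budget *b, unsigned int size)
{
    if (b->used + size > b->limit)
        return 0;
    b->used += size;
    return 1;
}

/* Stand-ins for tcp_prune_queue()/tcp_prune_ofo_queue(). */
static int prune_queue(struct rcv_budget *b)     { b->used /= 2; return 0; }
static int prune_ofo_queue(struct rcv_budget *b) { b->used = 0;  return 1; }

static int try_rmem_schedule(struct rcv_budget *b, unsigned int size)
{
    if (try_charge(b, size))
        return 0;
    if (prune_queue(b) < 0)
        return -1;                 /* could not reclaim anything */
    if (try_charge(b, size))
        return 0;
    if (!prune_ofo_queue(b))       /* last resort: drop out-of-order data */
        return -1;
    return try_charge(b, size) ? 0 : -1;
}

int main(void)
{
    struct rcv_budget b = { .used = 60000, .limit = 65536 };
    printf("%d\n", try_rmem_schedule(&b, 9000));  /* 0: fits after pruning */
    return 0;
}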
4370 | /** | ||
4371 | * tcp_try_coalesce - try to merge skb to prior one | ||
4372 | * @sk: socket | ||
4373 | * @to: prior buffer | ||
4374 | * @from: buffer to add in queue | ||
4375 | * @fragstolen: pointer to boolean | ||
4376 | * | ||
4377 | * Before queueing skb @from after @to, try to merge them | ||
4378 | * to reduce overall memory use and queue lengths, if cost is small. | ||
4379 | * Packets in ofo or receive queues can stay a long time. | ||
4380 | * Better try to coalesce them right now to avoid future collapses. | ||
4381 | * Returns true if caller should free @from instead of queueing it | ||
4382 | */ | ||
4383 | static bool tcp_try_coalesce(struct sock *sk, | ||
4384 | struct sk_buff *to, | ||
4385 | struct sk_buff *from, | ||
4386 | bool *fragstolen) | ||
4387 | { | ||
4388 | int delta; | ||
4389 | |||
4390 | *fragstolen = false; | ||
4391 | |||
4392 | if (tcp_hdr(from)->fin) | ||
4393 | return false; | ||
4394 | |||
4395 | /* It's possible this segment overlaps with a prior segment in the queue */ | ||
4396 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
4397 | return false; | ||
4398 | |||
4399 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
4400 | return false; | ||
4401 | |||
4402 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
4403 | sk_mem_charge(sk, delta); | ||
4404 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
4405 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
4406 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
4407 | return true; | ||
4408 | } | ||
4409 | |||
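The removed tcp_try_coalesce() above merges a segment into the previous buffer when it starts exactly where that buffer ends, saving per-skb overhead in the receive and out-of-order queues. A toy illustration of the contiguity check and merge, using plain byte buffers rather than sk_buffs:

/* Toy coalescing of contiguous segments; illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct seg {
    uint32_t seq, end_seq;
    char data[256];
    size_t len;
};

/* Returns 1 when `from` was merged into `to` (caller frees `from`),
 * 0 when the segments are not contiguous or `to` is out of room. */
static int try_coalesce(struct seg *to, const struct seg *from)
{
    if (from->seq != to->end_seq)               /* not contiguous */
        return 0;
    if (to->len + from->len > sizeof(to->data)) /* would not fit  */
        return 0;
    memcpy(to->data + to->len, from->data, from->len);
    to->len += from->len;
    to->end_seq = from->end_seq;
    return 1;
}

int main(void)
{
    struct seg a = { .seq = 100, .end_seq = 105, .data = "hello",  .len = 5 };
    struct seg b = { .seq = 105, .end_seq = 111, .data = " world", .len = 6 };

    if (try_coalesce(&a, &b))
        printf("[%u,%u) \"%.*s\"\n", a.seq, a.end_seq, (int)a.len, a.data);
    return 0;
}

In the kernel the merge also has to transfer truesize accounting and the fragstolen bookkeeping, which the toy version skips.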
4410 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | ||
4411 | { | ||
4412 | struct tcp_sock *tp = tcp_sk(sk); | ||
4413 | struct sk_buff *skb1; | ||
4414 | u32 seq, end_seq; | ||
4415 | |||
4416 | TCP_ECN_check_ce(tp, skb); | ||
4417 | |||
4418 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { | ||
4419 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); | ||
4420 | __kfree_skb(skb); | ||
4421 | return; | ||
4422 | } | ||
4423 | |||
4424 | /* Disable header prediction. */ | ||
4425 | tp->pred_flags = 0; | ||
4426 | inet_csk_schedule_ack(sk); | ||
4427 | |||
4428 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); | ||
4429 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | ||
4430 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | ||
4431 | |||
4432 | skb1 = skb_peek_tail(&tp->out_of_order_queue); | ||
4433 | if (!skb1) { | ||
4434 | /* Initial out of order segment, build 1 SACK. */ | ||
4435 | if (tcp_is_sack(tp)) { | ||
4436 | tp->rx_opt.num_sacks = 1; | ||
4437 | tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; | ||
4438 | tp->selective_acks[0].end_seq = | ||
4439 | TCP_SKB_CB(skb)->end_seq; | ||
4440 | } | ||
4441 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
4442 | goto end; | ||
4443 | } | ||
4444 | |||
4445 | seq = TCP_SKB_CB(skb)->seq; | ||
4446 | end_seq = TCP_SKB_CB(skb)->end_seq; | ||
4447 | |||
4448 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | ||
4449 | bool fragstolen; | ||
4450 | |||
4451 | if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { | ||
4452 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
4453 | } else { | ||
4454 | kfree_skb_partial(skb, fragstolen); | ||
4455 | skb = NULL; | ||
4456 | } | ||
4457 | |||
4458 | if (!tp->rx_opt.num_sacks || | ||
4459 | tp->selective_acks[0].end_seq != seq) | ||
4460 | goto add_sack; | ||
4461 | |||
4462 | /* Common case: data arrive in order after hole. */ | ||
4463 | tp->selective_acks[0].end_seq = end_seq; | ||
4464 | goto end; | ||
4465 | } | ||
4466 | |||
4467 | /* Find place to insert this segment. */ | ||
4468 | while (1) { | ||
4469 | if (!after(TCP_SKB_CB(skb1)->seq, seq)) | ||
4470 | break; | ||
4471 | if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { | ||
4472 | skb1 = NULL; | ||
4473 | break; | ||
4474 | } | ||
4475 | skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); | ||
4476 | } | ||
4477 | |||
4478 | /* Does skb overlap the previous one? */ | ||
4479 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4480 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4481 | /* All the bits are present. Drop. */ | ||
4482 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
4483 | __kfree_skb(skb); | ||
4484 | skb = NULL; | ||
4485 | tcp_dsack_set(sk, seq, end_seq); | ||
4486 | goto add_sack; | ||
4487 | } | ||
4488 | if (after(seq, TCP_SKB_CB(skb1)->seq)) { | ||
4489 | /* Partial overlap. */ | ||
4490 | tcp_dsack_set(sk, seq, | ||
4491 | TCP_SKB_CB(skb1)->end_seq); | ||
4492 | } else { | ||
4493 | if (skb_queue_is_first(&tp->out_of_order_queue, | ||
4494 | skb1)) | ||
4495 | skb1 = NULL; | ||
4496 | else | ||
4497 | skb1 = skb_queue_prev( | ||
4498 | &tp->out_of_order_queue, | ||
4499 | skb1); | ||
4500 | } | ||
4501 | } | ||
4502 | if (!skb1) | ||
4503 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
4504 | else | ||
4505 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
4506 | |||
4507 | /* And clean segments covered by new one as whole. */ | ||
4508 | while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { | ||
4509 | skb1 = skb_queue_next(&tp->out_of_order_queue, skb); | ||
4510 | |||
4511 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) | ||
4512 | break; | ||
4513 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4514 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
4515 | end_seq); | ||
4516 | break; | ||
4517 | } | ||
4518 | __skb_unlink(skb1, &tp->out_of_order_queue); | ||
4519 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
4520 | TCP_SKB_CB(skb1)->end_seq); | ||
4521 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
4522 | __kfree_skb(skb1); | ||
4523 | } | ||
4524 | |||
4525 | add_sack: | ||
4526 | if (tcp_is_sack(tp)) | ||
4527 | tcp_sack_new_ofo_skb(sk, seq, end_seq); | ||
4528 | end: | ||
4529 | if (skb) | ||
4530 | skb_set_owner_r(skb, sk); | ||
4531 | } | ||
4532 | |||
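The out-of-order queue code above (the removed tcp_data_queue_ofo() on the left, the equivalent inline block kept on the right) inserts each segment in sequence order, scanning backwards from the tail because data arriving after a single hole usually belongs at the end. Below is a compact sketch of just that ordered insert, with a toy list type and with overlap trimming and DSACK generation left out.

/* Ordered insert into a toy out-of-order list; illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct oseg {
    uint32_t seq, end_seq;
    struct oseg *prev, *next;
};

struct oqueue { struct oseg *head, *tail; };

static int seq_after(uint32_t a, uint32_t b) { return (int32_t)(b - a) < 0; }

static void ofo_insert(struct oqueue *q, struct oseg *n)
{
    struct oseg *pos = q->tail;

    /* Walk backwards past every queued segment that starts after us. */
    while (pos && seq_after(pos->seq, n->seq))
        pos = pos->prev;

    n->prev = pos;
    n->next = pos ? pos->next : q->head;
    if (n->next)
        n->next->prev = n;
    else
        q->tail = n;
    if (pos)
        pos->next = n;
    else
        q->head = n;
}

int main(void)
{
    struct oqueue q = { 0 };
    uint32_t ranges[][2] = { {300, 400}, {100, 200}, {500, 600}, {200, 300} };

    for (size_t i = 0; i < 4; i++) {
        struct oseg *s = calloc(1, sizeof(*s));
        s->seq = ranges[i][0];
        s->end_seq = ranges[i][1];
        ofo_insert(&q, s);
    }
    for (struct oseg *s = q.head; s; s = s->next)
        printf("[%u,%u) ", s->seq, s->end_seq);
    printf("\n");   /* [100,200) [200,300) [300,400) [500,600) */
    return 0;
}

Scanning from the tail keeps the common case cheap; pathological interleavings still degrade to a linear walk, which is part of why much later kernels eventually moved this queue to a tree structure.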
4533 | static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, | ||
4534 | bool *fragstolen) | ||
4535 | { | ||
4536 | int eaten; | ||
4537 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); | ||
4538 | |||
4539 | __skb_pull(skb, hdrlen); | ||
4540 | eaten = (tail && | ||
4541 | tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; | ||
4542 | tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
4543 | if (!eaten) { | ||
4544 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
4545 | skb_set_owner_r(skb, sk); | ||
4546 | } | ||
4547 | return eaten; | ||
4548 | } | ||
4549 | |||
4550 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | ||
4551 | { | ||
4552 | struct sk_buff *skb = NULL; | ||
4553 | struct tcphdr *th; | ||
4554 | bool fragstolen; | ||
4555 | |||
4556 | if (size == 0) | ||
4557 | return 0; | ||
4558 | |||
4559 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | ||
4560 | if (!skb) | ||
4561 | goto err; | ||
4562 | |||
4563 | if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | ||
4564 | goto err_free; | ||
4565 | |||
4566 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); | ||
4567 | skb_reset_transport_header(skb); | ||
4568 | memset(th, 0, sizeof(*th)); | ||
4569 | |||
4570 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) | ||
4571 | goto err_free; | ||
4572 | |||
4573 | TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; | ||
4574 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; | ||
4575 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; | ||
4576 | |||
4577 | if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { | ||
4578 | WARN_ON_ONCE(fragstolen); /* should not happen */ | ||
4579 | __kfree_skb(skb); | ||
4580 | } | ||
4581 | return size; | ||
4582 | |||
4583 | err_free: | ||
4584 | kfree_skb(skb); | ||
4585 | err: | ||
4586 | return -ENOMEM; | ||
4587 | } | ||
4588 | |||
4589 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | 4376 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) |
4590 | { | 4377 | { |
4591 | const struct tcphdr *th = tcp_hdr(skb); | 4378 | struct tcphdr *th = tcp_hdr(skb); |
4592 | struct tcp_sock *tp = tcp_sk(sk); | 4379 | struct tcp_sock *tp = tcp_sk(sk); |
4593 | int eaten = -1; | 4380 | int eaten = -1; |
4594 | bool fragstolen = false; | ||
4595 | 4381 | ||
4596 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) | 4382 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) |
4597 | goto drop; | 4383 | goto drop; |
@@ -4633,16 +4419,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
4633 | if (eaten <= 0) { | 4419 | if (eaten <= 0) { |
4634 | queue_and_out: | 4420 | queue_and_out: |
4635 | if (eaten < 0 && | 4421 | if (eaten < 0 && |
4636 | tcp_try_rmem_schedule(sk, skb, skb->truesize)) | 4422 | tcp_try_rmem_schedule(sk, skb->truesize)) |
4637 | goto drop; | 4423 | goto drop; |
4638 | 4424 | ||
4639 | eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); | 4425 | skb_set_owner_r(skb, sk); |
4426 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
4640 | } | 4427 | } |
4641 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4428 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4642 | if (skb->len) | 4429 | if (skb->len) |
4643 | tcp_event_data_recv(sk, skb); | 4430 | tcp_event_data_recv(sk, skb); |
4644 | if (th->fin) | 4431 | if (th->fin) |
4645 | tcp_fin(sk); | 4432 | tcp_fin(skb, sk, th); |
4646 | 4433 | ||
4647 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4434 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
4648 | tcp_ofo_queue(sk); | 4435 | tcp_ofo_queue(sk); |
@@ -4660,8 +4447,8 @@ queue_and_out: | |||
4660 | tcp_fast_path_check(sk); | 4447 | tcp_fast_path_check(sk); |
4661 | 4448 | ||
4662 | if (eaten > 0) | 4449 | if (eaten > 0) |
4663 | kfree_skb_partial(skb, fragstolen); | 4450 | __kfree_skb(skb); |
4664 | if (!sock_flag(sk, SOCK_DEAD)) | 4451 | else if (!sock_flag(sk, SOCK_DEAD)) |
4665 | sk->sk_data_ready(sk, 0); | 4452 | sk->sk_data_ready(sk, 0); |
4666 | return; | 4453 | return; |
4667 | } | 4454 | } |
@@ -4701,7 +4488,105 @@ drop: | |||
4701 | goto queue_and_out; | 4488 | goto queue_and_out; |
4702 | } | 4489 | } |
4703 | 4490 | ||
4704 | tcp_data_queue_ofo(sk, skb); | 4491 | TCP_ECN_check_ce(tp, skb); |
4492 | |||
4493 | if (tcp_try_rmem_schedule(sk, skb->truesize)) | ||
4494 | goto drop; | ||
4495 | |||
4496 | /* Disable header prediction. */ | ||
4497 | tp->pred_flags = 0; | ||
4498 | inet_csk_schedule_ack(sk); | ||
4499 | |||
4500 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | ||
4501 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | ||
4502 | |||
4503 | skb_set_owner_r(skb, sk); | ||
4504 | |||
4505 | if (!skb_peek(&tp->out_of_order_queue)) { | ||
4506 | /* Initial out of order segment, build 1 SACK. */ | ||
4507 | if (tcp_is_sack(tp)) { | ||
4508 | tp->rx_opt.num_sacks = 1; | ||
4509 | tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; | ||
4510 | tp->selective_acks[0].end_seq = | ||
4511 | TCP_SKB_CB(skb)->end_seq; | ||
4512 | } | ||
4513 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
4514 | } else { | ||
4515 | struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); | ||
4516 | u32 seq = TCP_SKB_CB(skb)->seq; | ||
4517 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | ||
4518 | |||
4519 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | ||
4520 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
4521 | |||
4522 | if (!tp->rx_opt.num_sacks || | ||
4523 | tp->selective_acks[0].end_seq != seq) | ||
4524 | goto add_sack; | ||
4525 | |||
4526 | /* Common case: data arrive in order after hole. */ | ||
4527 | tp->selective_acks[0].end_seq = end_seq; | ||
4528 | return; | ||
4529 | } | ||
4530 | |||
4531 | /* Find place to insert this segment. */ | ||
4532 | while (1) { | ||
4533 | if (!after(TCP_SKB_CB(skb1)->seq, seq)) | ||
4534 | break; | ||
4535 | if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { | ||
4536 | skb1 = NULL; | ||
4537 | break; | ||
4538 | } | ||
4539 | skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); | ||
4540 | } | ||
4541 | |||
4542 | /* Does skb overlap the previous one? */ | ||
4543 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4544 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4545 | /* All the bits are present. Drop. */ | ||
4546 | __kfree_skb(skb); | ||
4547 | tcp_dsack_set(sk, seq, end_seq); | ||
4548 | goto add_sack; | ||
4549 | } | ||
4550 | if (after(seq, TCP_SKB_CB(skb1)->seq)) { | ||
4551 | /* Partial overlap. */ | ||
4552 | tcp_dsack_set(sk, seq, | ||
4553 | TCP_SKB_CB(skb1)->end_seq); | ||
4554 | } else { | ||
4555 | if (skb_queue_is_first(&tp->out_of_order_queue, | ||
4556 | skb1)) | ||
4557 | skb1 = NULL; | ||
4558 | else | ||
4559 | skb1 = skb_queue_prev( | ||
4560 | &tp->out_of_order_queue, | ||
4561 | skb1); | ||
4562 | } | ||
4563 | } | ||
4564 | if (!skb1) | ||
4565 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
4566 | else | ||
4567 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
4568 | |||
4569 | /* And clean segments covered by new one as whole. */ | ||
4570 | while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { | ||
4571 | skb1 = skb_queue_next(&tp->out_of_order_queue, skb); | ||
4572 | |||
4573 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) | ||
4574 | break; | ||
4575 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
4576 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
4577 | end_seq); | ||
4578 | break; | ||
4579 | } | ||
4580 | __skb_unlink(skb1, &tp->out_of_order_queue); | ||
4581 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
4582 | TCP_SKB_CB(skb1)->end_seq); | ||
4583 | __kfree_skb(skb1); | ||
4584 | } | ||
4585 | |||
4586 | add_sack: | ||
4587 | if (tcp_is_sack(tp)) | ||
4588 | tcp_sack_new_ofo_skb(sk, seq, end_seq); | ||
4589 | } | ||
4705 | } | 4590 | } |
4706 | 4591 | ||
4707 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, | 4592 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, |
@@ -4880,10 +4765,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk) | |||
4880 | * Purge the out-of-order queue. | 4765 | * Purge the out-of-order queue. |
4881 | * Return true if queue was pruned. | 4766 | * Return true if queue was pruned. |
4882 | */ | 4767 | */ |
4883 | static bool tcp_prune_ofo_queue(struct sock *sk) | 4768 | static int tcp_prune_ofo_queue(struct sock *sk) |
4884 | { | 4769 | { |
4885 | struct tcp_sock *tp = tcp_sk(sk); | 4770 | struct tcp_sock *tp = tcp_sk(sk); |
4886 | bool res = false; | 4771 | int res = 0; |
4887 | 4772 | ||
4888 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4773 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
4889 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); | 4774 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); |
@@ -4897,7 +4782,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) | |||
4897 | if (tp->rx_opt.sack_ok) | 4782 | if (tp->rx_opt.sack_ok) |
4898 | tcp_sack_reset(&tp->rx_opt); | 4783 | tcp_sack_reset(&tp->rx_opt); |
4899 | sk_mem_reclaim(sk); | 4784 | sk_mem_reclaim(sk); |
4900 | res = true; | 4785 | res = 1; |
4901 | } | 4786 | } |
4902 | return res; | 4787 | return res; |
4903 | } | 4788 | } |
@@ -4919,7 +4804,7 @@ static int tcp_prune_queue(struct sock *sk) | |||
4919 | 4804 | ||
4920 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | 4805 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
4921 | tcp_clamp_window(sk); | 4806 | tcp_clamp_window(sk); |
4922 | else if (sk_under_memory_pressure(sk)) | 4807 | else if (tcp_memory_pressure) |
4923 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); | 4808 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); |
4924 | 4809 | ||
4925 | tcp_collapse_ofo_queue(sk); | 4810 | tcp_collapse_ofo_queue(sk); |
@@ -4974,29 +4859,29 @@ void tcp_cwnd_application_limited(struct sock *sk) | |||
4974 | tp->snd_cwnd_stamp = tcp_time_stamp; | 4859 | tp->snd_cwnd_stamp = tcp_time_stamp; |
4975 | } | 4860 | } |
4976 | 4861 | ||
4977 | static bool tcp_should_expand_sndbuf(const struct sock *sk) | 4862 | static int tcp_should_expand_sndbuf(struct sock *sk) |
4978 | { | 4863 | { |
4979 | const struct tcp_sock *tp = tcp_sk(sk); | 4864 | struct tcp_sock *tp = tcp_sk(sk); |
4980 | 4865 | ||
4981 | /* If the user specified a specific send buffer setting, do | 4866 | /* If the user specified a specific send buffer setting, do |
4982 | * not modify it. | 4867 | * not modify it. |
4983 | */ | 4868 | */ |
4984 | if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) | 4869 | if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
4985 | return false; | 4870 | return 0; |
4986 | 4871 | ||
4987 | /* If we are under global TCP memory pressure, do not expand. */ | 4872 | /* If we are under global TCP memory pressure, do not expand. */ |
4988 | if (sk_under_memory_pressure(sk)) | 4873 | if (tcp_memory_pressure) |
4989 | return false; | 4874 | return 0; |
4990 | 4875 | ||
4991 | /* If we are under soft global TCP memory pressure, do not expand. */ | 4876 | /* If we are under soft global TCP memory pressure, do not expand. */ |
4992 | if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) | 4877 | if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) |
4993 | return false; | 4878 | return 0; |
4994 | 4879 | ||
4995 | /* If we filled the congestion window, do not expand. */ | 4880 | /* If we filled the congestion window, do not expand. */ |
4996 | if (tp->packets_out >= tp->snd_cwnd) | 4881 | if (tp->packets_out >= tp->snd_cwnd) |
4997 | return false; | 4882 | return 0; |
4998 | 4883 | ||
4999 | return true; | 4884 | return 1; |
5000 | } | 4885 | } |
5001 | 4886 | ||
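tcp_should_expand_sndbuf() above gates automatic send-buffer growth on four conditions: the user has not pinned the buffer, there is no global TCP memory pressure (hard or soft), and the congestion window still has room. Condensed into a standalone predicate with illustrative field names:

/* Sketch of the auto-grow policy; field names are invented. */
#include <stdbool.h>
#include <stdio.h>

struct snd_state {
    bool sndbuf_locked;      /* user set SO_SNDBUF explicitly   */
    bool memory_pressure;    /* global TCP memory pressure flag */
    long memory_allocated;   /* pages charged to TCP            */
    long mem_low_limit;      /* "soft" pressure threshold       */
    unsigned int packets_out;
    unsigned int snd_cwnd;
};

static bool should_expand_sndbuf(const struct snd_state *s)
{
    if (s->sndbuf_locked)
        return false;
    if (s->memory_pressure)
        return false;
    if (s->memory_allocated >= s->mem_low_limit)
        return false;
    if (s->packets_out >= s->snd_cwnd)
        return false;              /* cwnd already filled */
    return true;
}

int main(void)
{
    struct snd_state s = { .snd_cwnd = 10, .packets_out = 4,
                           .mem_low_limit = 1000, .memory_allocated = 100 };
    printf("%d\n", should_expand_sndbuf(&s));  /* 1 */
    return 0;
}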
5002 | /* When incoming ACK allowed to free some skb from write_queue, | 4887 | /* When incoming ACK allowed to free some skb from write_queue, |
@@ -5010,10 +4895,8 @@ static void tcp_new_space(struct sock *sk) | |||
5010 | struct tcp_sock *tp = tcp_sk(sk); | 4895 | struct tcp_sock *tp = tcp_sk(sk); |
5011 | 4896 | ||
5012 | if (tcp_should_expand_sndbuf(sk)) { | 4897 | if (tcp_should_expand_sndbuf(sk)) { |
5013 | int sndmem = SKB_TRUESIZE(max_t(u32, | 4898 | int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + |
5014 | tp->rx_opt.mss_clamp, | 4899 | MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
5015 | tp->mss_cache) + | ||
5016 | MAX_TCP_HEADER); | ||
5017 | int demanded = max_t(unsigned int, tp->snd_cwnd, | 4900 | int demanded = max_t(unsigned int, tp->snd_cwnd, |
5018 | tp->reordering + 1); | 4901 | tp->reordering + 1); |
5019 | sndmem *= 2 * demanded; | 4902 | sndmem *= 2 * demanded; |
@@ -5085,7 +4968,7 @@ static inline void tcp_ack_snd_check(struct sock *sk) | |||
5085 | * either form (or just set the sysctl tcp_stdurg). | 4968 | * either form (or just set the sysctl tcp_stdurg). |
5086 | */ | 4969 | */ |
5087 | 4970 | ||
5088 | static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) | 4971 | static void tcp_check_urg(struct sock *sk, struct tcphdr *th) |
5089 | { | 4972 | { |
5090 | struct tcp_sock *tp = tcp_sk(sk); | 4973 | struct tcp_sock *tp = tcp_sk(sk); |
5091 | u32 ptr = ntohs(th->urg_ptr); | 4974 | u32 ptr = ntohs(th->urg_ptr); |
@@ -5151,7 +5034,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) | |||
5151 | } | 5034 | } |
5152 | 5035 | ||
5153 | /* This is the 'fast' part of urgent handling. */ | 5036 | /* This is the 'fast' part of urgent handling. */ |
5154 | static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) | 5037 | static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) |
5155 | { | 5038 | { |
5156 | struct tcp_sock *tp = tcp_sk(sk); | 5039 | struct tcp_sock *tp = tcp_sk(sk); |
5157 | 5040 | ||
@@ -5214,7 +5097,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, | |||
5214 | return result; | 5097 | return result; |
5215 | } | 5098 | } |
5216 | 5099 | ||
5217 | static inline bool tcp_checksum_complete_user(struct sock *sk, | 5100 | static inline int tcp_checksum_complete_user(struct sock *sk, |
5218 | struct sk_buff *skb) | 5101 | struct sk_buff *skb) |
5219 | { | 5102 | { |
5220 | return !skb_csum_unnecessary(skb) && | 5103 | return !skb_csum_unnecessary(skb) && |
@@ -5222,19 +5105,19 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, | |||
5222 | } | 5105 | } |
5223 | 5106 | ||
5224 | #ifdef CONFIG_NET_DMA | 5107 | #ifdef CONFIG_NET_DMA |
5225 | static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, | 5108 | static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, |
5226 | int hlen) | 5109 | int hlen) |
5227 | { | 5110 | { |
5228 | struct tcp_sock *tp = tcp_sk(sk); | 5111 | struct tcp_sock *tp = tcp_sk(sk); |
5229 | int chunk = skb->len - hlen; | 5112 | int chunk = skb->len - hlen; |
5230 | int dma_cookie; | 5113 | int dma_cookie; |
5231 | bool copied_early = false; | 5114 | int copied_early = 0; |
5232 | 5115 | ||
5233 | if (tp->ucopy.wakeup) | 5116 | if (tp->ucopy.wakeup) |
5234 | return false; | 5117 | return 0; |
5235 | 5118 | ||
5236 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 5119 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
5237 | tp->ucopy.dma_chan = net_dma_find_channel(); | 5120 | tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); |
5238 | 5121 | ||
5239 | if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { | 5122 | if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { |
5240 | 5123 | ||
@@ -5247,7 +5130,7 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, | |||
5247 | goto out; | 5130 | goto out; |
5248 | 5131 | ||
5249 | tp->ucopy.dma_cookie = dma_cookie; | 5132 | tp->ucopy.dma_cookie = dma_cookie; |
5250 | copied_early = true; | 5133 | copied_early = 1; |
5251 | 5134 | ||
5252 | tp->ucopy.len -= chunk; | 5135 | tp->ucopy.len -= chunk; |
5253 | tp->copied_seq += chunk; | 5136 | tp->copied_seq += chunk; |
@@ -5271,10 +5154,10 @@ out: | |||
5271 | /* Does PAWS and seqno based validation of an incoming segment, flags will | 5154 | /* Does PAWS and seqno based validation of an incoming segment, flags will |
5272 | * play significant role here. | 5155 | * play significant role here. |
5273 | */ | 5156 | */ |
5274 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 5157 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
5275 | const struct tcphdr *th, int syn_inerr) | 5158 | struct tcphdr *th, int syn_inerr) |
5276 | { | 5159 | { |
5277 | const u8 *hash_location; | 5160 | u8 *hash_location; |
5278 | struct tcp_sock *tp = tcp_sk(sk); | 5161 | struct tcp_sock *tp = tcp_sk(sk); |
5279 | 5162 | ||
5280 | /* RFC1323: H1. Apply PAWS check first. */ | 5163 | /* RFC1323: H1. Apply PAWS check first. */ |
@@ -5297,48 +5180,38 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5297 | * an acknowledgment should be sent in reply (unless the RST | 5180 | * an acknowledgment should be sent in reply (unless the RST |
5298 | * bit is set, if so drop the segment and return)". | 5181 | * bit is set, if so drop the segment and return)". |
5299 | */ | 5182 | */ |
5300 | if (!th->rst) { | 5183 | if (!th->rst) |
5301 | if (th->syn) | ||
5302 | goto syn_challenge; | ||
5303 | tcp_send_dupack(sk, skb); | 5184 | tcp_send_dupack(sk, skb); |
5304 | } | ||
5305 | goto discard; | 5185 | goto discard; |
5306 | } | 5186 | } |
5307 | 5187 | ||
5308 | /* Step 2: check RST bit */ | 5188 | /* Step 2: check RST bit */ |
5309 | if (th->rst) { | 5189 | if (th->rst) { |
5310 | /* RFC 5961 3.2 : | 5190 | tcp_reset(sk); |
5311 | * If sequence number exactly matches RCV.NXT, then | ||
5312 | * RESET the connection | ||
5313 | * else | ||
5314 | * Send a challenge ACK | ||
5315 | */ | ||
5316 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) | ||
5317 | tcp_reset(sk); | ||
5318 | else | ||
5319 | tcp_send_challenge_ack(sk); | ||
5320 | goto discard; | 5191 | goto discard; |
5321 | } | 5192 | } |
5322 | 5193 | ||
5194 | /* ts_recent update must be made after we are sure that the packet | ||
5195 | * is in window. | ||
5196 | */ | ||
5197 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
5198 | |||
5323 | /* step 3: check security and precedence [ignored] */ | 5199 | /* step 3: check security and precedence [ignored] */ |
5324 | 5200 | ||
5325 | /* step 4: Check for a SYN | 5201 | /* step 4: Check for a SYN in window. */ |
5326 | * RFC 5691 4.2 : Send a challenge ack | 5202 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
5327 | */ | ||
5328 | if (th->syn) { | ||
5329 | syn_challenge: | ||
5330 | if (syn_inerr) | 5203 | if (syn_inerr) |
5331 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5204 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
5332 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); | 5205 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); |
5333 | tcp_send_challenge_ack(sk); | 5206 | tcp_reset(sk); |
5334 | goto discard; | 5207 | return -1; |
5335 | } | 5208 | } |
5336 | 5209 | ||
5337 | return true; | 5210 | return 1; |
5338 | 5211 | ||
5339 | discard: | 5212 | discard: |
5340 | __kfree_skb(skb); | 5213 | __kfree_skb(skb); |
5341 | return false; | 5214 | return 0; |
5342 | } | 5215 | } |
5343 | 5216 | ||
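tcp_validate_incoming() is where the two versions in this hunk differ most: the newer code (left column) follows RFC 5961 and answers suspicious in-window RSTs and SYNs with a rate-limited challenge ACK, while the older code (right column) resets the connection outright. Below is a decision-table sketch of the newer behaviour, with the PAWS step and the actual ACK transmission omitted; the enum and helpers are illustrative.

/* Decision sketch of RFC 5961-style segment validation. */
#include <stdint.h>
#include <stdio.h>

enum verdict { ACCEPT, SEND_DUPACK, DO_RESET, CHALLENGE_ACK, DISCARD };

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

static int in_window(uint32_t seq, uint32_t end_seq,
                     uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t rcv_wnd)
{
    return !seq_before(end_seq, rcv_wup) &&
           !seq_after(seq, rcv_nxt + rcv_wnd);
}

static enum verdict validate_incoming(int rst, int syn,
                                      uint32_t seq, uint32_t end_seq,
                                      uint32_t rcv_wup, uint32_t rcv_nxt,
                                      uint32_t rcv_wnd)
{
    if (!in_window(seq, end_seq, rcv_wup, rcv_nxt, rcv_wnd)) {
        if (rst)                                  /* step 1: out of window */
            return DISCARD;
        return syn ? CHALLENGE_ACK : SEND_DUPACK;
    }
    if (rst)                                      /* step 2: RST bit */
        return seq == rcv_nxt ? DO_RESET : CHALLENGE_ACK;
    if (syn)                                      /* step 4: SYN in window */
        return CHALLENGE_ACK;
    return ACCEPT;
}

int main(void)
{
    /* window [1000, 1000+65535); an RST at seq 2000 is suspicious */
    printf("%d\n", validate_incoming(1, 0, 2000, 2000, 1000, 1000, 65535)
                   == CHALLENGE_ACK);   /* 1 */
    printf("%d\n", validate_incoming(1, 0, 1000, 1000, 1000, 1000, 65535)
                   == DO_RESET);        /* 1 */
    return 0;
}

Requiring the RST to land exactly on rcv_nxt makes blind reset injection much harder, since an attacker must guess the precise sequence number rather than any value inside the receive window.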
5344 | /* | 5217 | /* |
@@ -5365,12 +5238,11 @@ discard: | |||
5365 | * tcp_data_queue when everything is OK. | 5238 | * tcp_data_queue when everything is OK. |
5366 | */ | 5239 | */ |
5367 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 5240 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
5368 | const struct tcphdr *th, unsigned int len) | 5241 | struct tcphdr *th, unsigned len) |
5369 | { | 5242 | { |
5370 | struct tcp_sock *tp = tcp_sk(sk); | 5243 | struct tcp_sock *tp = tcp_sk(sk); |
5244 | int res; | ||
5371 | 5245 | ||
5372 | if (unlikely(sk->sk_rx_dst == NULL)) | ||
5373 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); | ||
5374 | /* | 5246 | /* |
5375 | * Header prediction. | 5247 | * Header prediction. |
5376 | * The code loosely follows the one in the famous | 5248 | * The code loosely follows the one in the famous |
@@ -5450,14 +5322,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5450 | } else { | 5322 | } else { |
5451 | int eaten = 0; | 5323 | int eaten = 0; |
5452 | int copied_early = 0; | 5324 | int copied_early = 0; |
5453 | bool fragstolen = false; | ||
5454 | 5325 | ||
5455 | if (tp->copied_seq == tp->rcv_nxt && | 5326 | if (tp->copied_seq == tp->rcv_nxt && |
5456 | len - tcp_header_len <= tp->ucopy.len) { | 5327 | len - tcp_header_len <= tp->ucopy.len) { |
5457 | #ifdef CONFIG_NET_DMA | 5328 | #ifdef CONFIG_NET_DMA |
5458 | if (tp->ucopy.task == current && | 5329 | if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { |
5459 | sock_owned_by_user(sk) && | ||
5460 | tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { | ||
5461 | copied_early = 1; | 5330 | copied_early = 1; |
5462 | eaten = 1; | 5331 | eaten = 1; |
5463 | } | 5332 | } |
@@ -5510,8 +5379,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
5510 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); | 5379 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); |
5511 | 5380 | ||
5512 | /* Bulk data transfer: receiver */ | 5381 | /* Bulk data transfer: receiver */ |
5513 | eaten = tcp_queue_rcv(sk, skb, tcp_header_len, | 5382 | __skb_pull(skb, tcp_header_len); |
5514 | &fragstolen); | 5383 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
5384 | skb_set_owner_r(skb, sk); | ||
5385 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
5515 | } | 5386 | } |
5516 | 5387 | ||
5517 | tcp_event_data_recv(sk, skb); | 5388 | tcp_event_data_recv(sk, skb); |
@@ -5533,8 +5404,9 @@ no_ack: | |||
5533 | else | 5404 | else |
5534 | #endif | 5405 | #endif |
5535 | if (eaten) | 5406 | if (eaten) |
5536 | kfree_skb_partial(skb, fragstolen); | 5407 | __kfree_skb(skb); |
5537 | sk->sk_data_ready(sk, 0); | 5408 | else |
5409 | sk->sk_data_ready(sk, 0); | ||
5538 | return 0; | 5410 | return 0; |
5539 | } | 5411 | } |
5540 | } | 5412 | } |
@@ -5543,25 +5415,18 @@ slow_path: | |||
5543 | if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) | 5415 | if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) |
5544 | goto csum_error; | 5416 | goto csum_error; |
5545 | 5417 | ||
5546 | if (!th->ack && !th->rst) | ||
5547 | goto discard; | ||
5548 | |||
5549 | /* | 5418 | /* |
5550 | * Standard slow path. | 5419 | * Standard slow path. |
5551 | */ | 5420 | */ |
5552 | 5421 | ||
5553 | if (!tcp_validate_incoming(sk, skb, th, 1)) | 5422 | res = tcp_validate_incoming(sk, skb, th, 1); |
5554 | return 0; | 5423 | if (res <= 0) |
5424 | return -res; | ||
5555 | 5425 | ||
5556 | step5: | 5426 | step5: |
5557 | if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) | 5427 | if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) |
5558 | goto discard; | 5428 | goto discard; |
5559 | 5429 | ||
5560 | /* ts_recent update must be made after we are sure that the packet | ||
5561 | * is in window. | ||
5562 | */ | ||
5563 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
5564 | |||
5565 | tcp_rcv_rtt_measure_ts(sk, skb); | 5430 | tcp_rcv_rtt_measure_ts(sk, skb); |
5566 | 5431 | ||
5567 | /* Process urgent data. */ | 5432 | /* Process urgent data. */ |
@@ -5583,101 +5448,16 @@ discard: | |||
5583 | } | 5448 | } |
5584 | EXPORT_SYMBOL(tcp_rcv_established); | 5449 | EXPORT_SYMBOL(tcp_rcv_established); |
5585 | 5450 | ||
5586 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | ||
5587 | { | ||
5588 | struct tcp_sock *tp = tcp_sk(sk); | ||
5589 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
5590 | |||
5591 | tcp_set_state(sk, TCP_ESTABLISHED); | ||
5592 | |||
5593 | if (skb != NULL) { | ||
5594 | icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); | ||
5595 | security_inet_conn_established(sk, skb); | ||
5596 | } | ||
5597 | |||
5598 | /* Make sure socket is routed, for correct metrics. */ | ||
5599 | icsk->icsk_af_ops->rebuild_header(sk); | ||
5600 | |||
5601 | tcp_init_metrics(sk); | ||
5602 | |||
5603 | tcp_init_congestion_control(sk); | ||
5604 | |||
5605 | /* Prevent spurious tcp_cwnd_restart() on first data | ||
5606 | * packet. | ||
5607 | */ | ||
5608 | tp->lsndtime = tcp_time_stamp; | ||
5609 | |||
5610 | tcp_init_buffer_space(sk); | ||
5611 | |||
5612 | if (sock_flag(sk, SOCK_KEEPOPEN)) | ||
5613 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); | ||
5614 | |||
5615 | if (!tp->rx_opt.snd_wscale) | ||
5616 | __tcp_fast_path_on(tp, tp->snd_wnd); | ||
5617 | else | ||
5618 | tp->pred_flags = 0; | ||
5619 | |||
5620 | if (!sock_flag(sk, SOCK_DEAD)) { | ||
5621 | sk->sk_state_change(sk); | ||
5622 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); | ||
5623 | } | ||
5624 | } | ||
5625 | |||
5626 | static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | ||
5627 | struct tcp_fastopen_cookie *cookie) | ||
5628 | { | ||
5629 | struct tcp_sock *tp = tcp_sk(sk); | ||
5630 | struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; | ||
5631 | u16 mss = tp->rx_opt.mss_clamp; | ||
5632 | bool syn_drop; | ||
5633 | |||
5634 | if (mss == tp->rx_opt.user_mss) { | ||
5635 | struct tcp_options_received opt; | ||
5636 | const u8 *hash_location; | ||
5637 | |||
5638 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ | ||
5639 | tcp_clear_options(&opt); | ||
5640 | opt.user_mss = opt.mss_clamp = 0; | ||
5641 | tcp_parse_options(synack, &opt, &hash_location, 0, NULL); | ||
5642 | mss = opt.mss_clamp; | ||
5643 | } | ||
5644 | |||
5645 | if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ | ||
5646 | cookie->len = -1; | ||
5647 | |||
5648 | /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably | ||
5649 | * the remote receives only the retransmitted (regular) SYNs: either | ||
5650 | * the original SYN-data or the corresponding SYN-ACK is lost. | ||
5651 | */ | ||
5652 | syn_drop = (cookie->len <= 0 && data && | ||
5653 | inet_csk(sk)->icsk_retransmits); | ||
5654 | |||
5655 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); | ||
5656 | |||
5657 | if (data) { /* Retransmit unacked data in SYN */ | ||
5658 | tcp_for_write_queue_from(data, sk) { | ||
5659 | if (data == tcp_send_head(sk) || | ||
5660 | __tcp_retransmit_skb(sk, data)) | ||
5661 | break; | ||
5662 | } | ||
5663 | tcp_rearm_rto(sk); | ||
5664 | return true; | ||
5665 | } | ||
5666 | tp->syn_data_acked = tp->syn_data; | ||
5667 | return false; | ||
5668 | } | ||
5669 | |||
5670 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5451 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
5671 | const struct tcphdr *th, unsigned int len) | 5452 | struct tcphdr *th, unsigned len) |
5672 | { | 5453 | { |
5673 | const u8 *hash_location; | 5454 | u8 *hash_location; |
5674 | struct inet_connection_sock *icsk = inet_csk(sk); | 5455 | struct inet_connection_sock *icsk = inet_csk(sk); |
5675 | struct tcp_sock *tp = tcp_sk(sk); | 5456 | struct tcp_sock *tp = tcp_sk(sk); |
5676 | struct tcp_cookie_values *cvp = tp->cookie_values; | 5457 | struct tcp_cookie_values *cvp = tp->cookie_values; |
5677 | struct tcp_fastopen_cookie foc = { .len = -1 }; | ||
5678 | int saved_clamp = tp->rx_opt.mss_clamp; | 5458 | int saved_clamp = tp->rx_opt.mss_clamp; |
5679 | 5459 | ||
5680 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); | 5460 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); |
5681 | 5461 | ||
5682 | if (th->ack) { | 5462 | if (th->ack) { |
5683 | /* rfc793: | 5463 | /* rfc793: |
@@ -5687,9 +5467,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5687 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send | 5467 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send |
5688 | * a reset (unless the RST bit is set, if so drop | 5468 | * a reset (unless the RST bit is set, if so drop |
5689 | * the segment and return)" | 5469 | * the segment and return)" |
5470 | * | ||
5471 | * We do not send data with SYN, so that RFC-correct | ||
5472 | * test reduces to: | ||
5690 | */ | 5473 | */ |
5691 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || | 5474 | if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) |
5692 | after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) | ||
5693 | goto reset_and_undo; | 5475 | goto reset_and_undo; |
5694 | 5476 | ||
5695 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 5477 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
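The hunk above changes the ACK acceptability test run in SYN-SENT. The newer code (left column) applies the general RFC 793 rule SND.UNA < SEG.ACK <= SND.NXT, presumably so that SYNs carrying data are also covered, while the older code (right column) could insist on SEG.ACK == SND.NXT because it never sent data on the SYN. A small sketch of the general test with a wrap-safe comparison helper (names invented):

/* SYN-SENT ACK acceptability sketch; illustrative only. */
#include <stdint.h>
#include <stdio.h>

static int seq_after(uint32_t a, uint32_t b) { return (int32_t)(b - a) < 0; }

static int synack_ack_acceptable(uint32_t ack_seq, uint32_t snd_una,
                                 uint32_t snd_nxt)
{
    return seq_after(ack_seq, snd_una) && !seq_after(ack_seq, snd_nxt);
}

int main(void)
{
    uint32_t iss = 5000;            /* initial send sequence number      */
    uint32_t snd_una = iss;         /* nothing ACKed yet in SYN-SENT     */
    uint32_t snd_nxt = iss + 1;     /* the SYN consumes one sequence no. */

    printf("%d\n", synack_ack_acceptable(iss + 1, snd_una, snd_nxt)); /* 1 */
    printf("%d\n", synack_ack_acceptable(iss,     snd_una, snd_nxt)); /* 0 */
    return 0;
}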
@@ -5731,7 +5513,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5731 | 5513 | ||
5732 | TCP_ECN_rcv_synack(tp, th); | 5514 | TCP_ECN_rcv_synack(tp, th); |
5733 | 5515 | ||
5734 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5516 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; |
5735 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 5517 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
5736 | 5518 | ||
5737 | /* Ok.. it's good. Set up sequence numbers and | 5519 | /* Ok.. it's good. Set up sequence numbers and |
@@ -5744,6 +5526,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5744 | * never scaled. | 5526 | * never scaled. |
5745 | */ | 5527 | */ |
5746 | tp->snd_wnd = ntohs(th->window); | 5528 | tp->snd_wnd = ntohs(th->window); |
5529 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | ||
5747 | 5530 | ||
5748 | if (!tp->rx_opt.wscale_ok) { | 5531 | if (!tp->rx_opt.wscale_ok) { |
5749 | tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; | 5532 | tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; |
@@ -5797,12 +5580,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5797 | } | 5580 | } |
5798 | 5581 | ||
5799 | smp_mb(); | 5582 | smp_mb(); |
5583 | tcp_set_state(sk, TCP_ESTABLISHED); | ||
5800 | 5584 | ||
5801 | tcp_finish_connect(sk, skb); | 5585 | security_inet_conn_established(sk, skb); |
5802 | 5586 | ||
5803 | if ((tp->syn_fastopen || tp->syn_data) && | 5587 | /* Make sure socket is routed, for correct metrics. */ |
5804 | tcp_rcv_fastopen_synack(sk, skb, &foc)) | 5588 | icsk->icsk_af_ops->rebuild_header(sk); |
5805 | return -1; | 5589 | |
5590 | tcp_init_metrics(sk); | ||
5591 | |||
5592 | tcp_init_congestion_control(sk); | ||
5593 | |||
5594 | /* Prevent spurious tcp_cwnd_restart() on first data | ||
5595 | * packet. | ||
5596 | */ | ||
5597 | tp->lsndtime = tcp_time_stamp; | ||
5598 | |||
5599 | tcp_init_buffer_space(sk); | ||
5600 | |||
5601 | if (sock_flag(sk, SOCK_KEEPOPEN)) | ||
5602 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); | ||
5603 | |||
5604 | if (!tp->rx_opt.snd_wscale) | ||
5605 | __tcp_fast_path_on(tp, tp->snd_wnd); | ||
5606 | else | ||
5607 | tp->pred_flags = 0; | ||
5608 | |||
5609 | if (!sock_flag(sk, SOCK_DEAD)) { | ||
5610 | sk->sk_state_change(sk); | ||
5611 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); | ||
5612 | } | ||
5806 | 5613 | ||
5807 | if (sk->sk_write_pending || | 5614 | if (sk->sk_write_pending || |
5808 | icsk->icsk_accept_queue.rskq_defer_accept || | 5615 | icsk->icsk_accept_queue.rskq_defer_accept || |
@@ -5816,6 +5623,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5816 | */ | 5623 | */ |
5817 | inet_csk_schedule_ack(sk); | 5624 | inet_csk_schedule_ack(sk); |
5818 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; | 5625 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; |
5626 | icsk->icsk_ack.ato = TCP_ATO_MIN; | ||
5627 | tcp_incr_quickack(sk); | ||
5819 | tcp_enter_quickack_mode(sk); | 5628 | tcp_enter_quickack_mode(sk); |
5820 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | 5629 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
5821 | TCP_DELACK_MAX, TCP_RTO_MAX); | 5630 | TCP_DELACK_MAX, TCP_RTO_MAX); |
@@ -5881,9 +5690,7 @@ discard: | |||
5881 | tcp_send_synack(sk); | 5690 | tcp_send_synack(sk); |
5882 | #if 0 | 5691 | #if 0 |
5883 | /* Note, we could accept data and URG from this segment. | 5692 | /* Note, we could accept data and URG from this segment. |
5884 | * There are no obstacles to make this (except that we must | 5693 | * There are no obstacles to make this. |
5885 | * either change tcp_recvmsg() to prevent it from returning data | ||
5886 | * before 3WHS completes per RFC793, or employ TCP Fast Open). | ||
5887 | * | 5694 | * |
5888 | * However, if we ignore data in ACKless segments sometimes, | 5695 | * However, if we ignore data in ACKless segments sometimes, |
5889 | * we have no reasons to accept it sometimes. | 5696 | * we have no reasons to accept it sometimes. |
@@ -5919,12 +5726,12 @@ reset_and_undo: | |||
5919 | */ | 5726 | */ |
5920 | 5727 | ||
5921 | int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | 5728 | int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, |
5922 | const struct tcphdr *th, unsigned int len) | 5729 | struct tcphdr *th, unsigned len) |
5923 | { | 5730 | { |
5924 | struct tcp_sock *tp = tcp_sk(sk); | 5731 | struct tcp_sock *tp = tcp_sk(sk); |
5925 | struct inet_connection_sock *icsk = inet_csk(sk); | 5732 | struct inet_connection_sock *icsk = inet_csk(sk); |
5926 | struct request_sock *req; | ||
5927 | int queued = 0; | 5733 | int queued = 0; |
5734 | int res; | ||
5928 | 5735 | ||
5929 | tp->rx_opt.saw_tstamp = 0; | 5736 | tp->rx_opt.saw_tstamp = 0; |
5930 | 5737 | ||
@@ -5940,8 +5747,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5940 | goto discard; | 5747 | goto discard; |
5941 | 5748 | ||
5942 | if (th->syn) { | 5749 | if (th->syn) { |
5943 | if (th->fin) | ||
5944 | goto discard; | ||
5945 | if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) | 5750 | if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) |
5946 | return 1; | 5751 | return 1; |
5947 | 5752 | ||
@@ -5979,47 +5784,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5979 | return 0; | 5784 | return 0; |
5980 | } | 5785 | } |
5981 | 5786 | ||
5982 | req = tp->fastopen_rsk; | 5787 | res = tcp_validate_incoming(sk, skb, th, 0); |
5983 | if (req != NULL) { | 5788 | if (res <= 0) |
5984 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && | 5789 | return -res; |
5985 | sk->sk_state != TCP_FIN_WAIT1); | ||
5986 | |||
5987 | if (tcp_check_req(sk, skb, req, NULL, true) == NULL) | ||
5988 | goto discard; | ||
5989 | } | ||
5990 | |||
5991 | if (!th->ack && !th->rst) | ||
5992 | goto discard; | ||
5993 | |||
5994 | if (!tcp_validate_incoming(sk, skb, th, 0)) | ||
5995 | return 0; | ||
5996 | 5790 | ||
5997 | /* step 5: check the ACK field */ | 5791 | /* step 5: check the ACK field */ |
5998 | if (true) { | 5792 | if (th->ack) { |
5999 | int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; | 5793 | int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; |
6000 | 5794 | ||
6001 | switch (sk->sk_state) { | 5795 | switch (sk->sk_state) { |
6002 | case TCP_SYN_RECV: | 5796 | case TCP_SYN_RECV: |
6003 | if (acceptable) { | 5797 | if (acceptable) { |
6004 | /* Once we leave TCP_SYN_RECV, we no longer | 5798 | tp->copied_seq = tp->rcv_nxt; |
6005 | * need req so release it. | ||
6006 | */ | ||
6007 | if (req) { | ||
6008 | tcp_synack_rtt_meas(sk, req); | ||
6009 | tp->total_retrans = req->num_retrans; | ||
6010 | |||
6011 | reqsk_fastopen_remove(sk, req, false); | ||
6012 | } else { | ||
6013 | /* Make sure socket is routed, for | ||
6014 | * correct metrics. | ||
6015 | */ | ||
6016 | icsk->icsk_af_ops->rebuild_header(sk); | ||
6017 | tcp_init_congestion_control(sk); | ||
6018 | |||
6019 | tcp_mtup_init(sk); | ||
6020 | tcp_init_buffer_space(sk); | ||
6021 | tp->copied_seq = tp->rcv_nxt; | ||
6022 | } | ||
6023 | smp_mb(); | 5799 | smp_mb(); |
6024 | tcp_set_state(sk, TCP_ESTABLISHED); | 5800 | tcp_set_state(sk, TCP_ESTABLISHED); |
6025 | sk->sk_state_change(sk); | 5801 | sk->sk_state_change(sk); |
@@ -6041,27 +5817,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6041 | if (tp->rx_opt.tstamp_ok) | 5817 | if (tp->rx_opt.tstamp_ok) |
6042 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5818 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
6043 | 5819 | ||
6044 | if (req) { | 5820 | /* Make sure socket is routed, for |
6045 | /* Re-arm the timer because data may | 5821 | * correct metrics. |
6046 | * have been sent out. This is similar | 5822 | */ |
6047 | * to the regular data transmission case | 5823 | icsk->icsk_af_ops->rebuild_header(sk); |
6048 | * when new data has just been ack'ed. | 5824 | |
6049 | * | 5825 | tcp_init_metrics(sk); |
6050 | * (TFO) - we could try to be more | 5826 | |
6051 | * aggressive and retransmitting any data | ||
6052 | * sooner based on when they were sent | ||
6053 | * out. | ||
6054 | */ | ||
6055 | tcp_rearm_rto(sk); | ||
6056 | } else | ||
6057 | tcp_init_metrics(sk); | ||
6058 | 5828 | ||
6059 | /* Prevent spurious tcp_cwnd_restart() on | 5829 | /* Prevent spurious tcp_cwnd_restart() on |
6060 | * first data packet. | 5830 | * first data packet. |
6061 | */ | 5831 | */ |
6062 | tp->lsndtime = tcp_time_stamp; | 5832 | tp->lsndtime = tcp_time_stamp; |
6063 | 5833 | ||
5834 | tcp_mtup_init(sk); | ||
6064 | tcp_initialize_rcv_mss(sk); | 5835 | tcp_initialize_rcv_mss(sk); |
5836 | tcp_init_buffer_space(sk); | ||
6065 | tcp_fast_path_on(tp); | 5837 | tcp_fast_path_on(tp); |
6066 | } else { | 5838 | } else { |
6067 | return 1; | 5839 | return 1; |
@@ -6069,33 +5841,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6069 | break; | 5841 | break; |
6070 | 5842 | ||
6071 | case TCP_FIN_WAIT1: | 5843 | case TCP_FIN_WAIT1: |
6072 | /* If we enter the TCP_FIN_WAIT1 state and we are a | ||
6073 | * Fast Open socket and this is the first acceptable | ||
6074 | * ACK we have received, this would have acknowledged | ||
6075 | * our SYNACK so stop the SYNACK timer. | ||
6076 | */ | ||
6077 | if (req != NULL) { | ||
6078 | /* Return RST if ack_seq is invalid. | ||
6079 | * Note that RFC793 only says to generate a | ||
6080 | * DUPACK for it but for TCP Fast Open it seems | ||
6081 | * better to treat this case like TCP_SYN_RECV | ||
6082 | * above. | ||
6083 | */ | ||
6084 | if (!acceptable) | ||
6085 | return 1; | ||
6086 | /* We no longer need the request sock. */ | ||
6087 | reqsk_fastopen_remove(sk, req, false); | ||
6088 | tcp_rearm_rto(sk); | ||
6089 | } | ||
6090 | if (tp->snd_una == tp->write_seq) { | 5844 | if (tp->snd_una == tp->write_seq) { |
6091 | struct dst_entry *dst; | ||
6092 | |||
6093 | tcp_set_state(sk, TCP_FIN_WAIT2); | 5845 | tcp_set_state(sk, TCP_FIN_WAIT2); |
6094 | sk->sk_shutdown |= SEND_SHUTDOWN; | 5846 | sk->sk_shutdown |= SEND_SHUTDOWN; |
6095 | 5847 | dst_confirm(__sk_dst_get(sk)); | |
6096 | dst = __sk_dst_get(sk); | ||
6097 | if (dst) | ||
6098 | dst_confirm(dst); | ||
6099 | 5848 | ||
6100 | if (!sock_flag(sk, SOCK_DEAD)) | 5849 | if (!sock_flag(sk, SOCK_DEAD)) |
6101 | /* Wake up lingering close() */ | 5850 | /* Wake up lingering close() */ |
@@ -6145,12 +5894,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6145 | } | 5894 | } |
6146 | break; | 5895 | break; |
6147 | } | 5896 | } |
6148 | } | 5897 | } else |
6149 | 5898 | goto discard; | |
6150 | /* ts_recent update must be made after we are sure that the packet | ||
6151 | * is in window. | ||
6152 | */ | ||
6153 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
6154 | 5899 | ||
6155 | /* step 6: check the URG bit */ | 5900 | /* step 6: check the URG bit */ |
6156 | tcp_urg(sk, skb, th); | 5901 | tcp_urg(sk, skb, th); |