Diffstat (limited to 'net/ipv4/tcp_input.c')
 -rw-r--r--  net/ipv4/tcp_input.c | 131
 1 file changed, 85 insertions(+), 46 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a16b01b537ba..b935397c703c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
 
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to POLLOUT)
+	 */
+	sndmem = 2 * nr_segs * per_mss;
 
-	sndmem *= TCP_INIT_CWND;
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
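
The new helper's arithmetic is easier to check with concrete numbers. Below is a minimal userspace sketch of the same computation; the MAX_TCP_HEADER and SKB_DATA_ALIGN() values are assumed placeholders (the real ones depend on kernel config and struct layouts), so treat it as an illustration, not kernel code.

    /* sndbuf_sketch.c - model of tcp_sndbuf_expand() sizing, illustrative only */
    #include <stdint.h>
    #include <stdio.h>

    /* same contract as the kernel helper of the same name */
    static uint32_t roundup_pow_of_two(uint32_t x)
    {
        uint32_t r = 1;

        while (r < x)
            r <<= 1;
        return r;
    }

    int main(void)
    {
        uint32_t mss = 1460;           /* max(mss_clamp, mss_cache) */
        uint32_t max_tcp_header = 224; /* assumed MAX_TCP_HEADER */
        uint32_t shinfo_sz = 320;      /* assumed SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */
        uint32_t skb_sz = 256;         /* assumed SKB_DATA_ALIGN(sizeof(struct sk_buff)) */
        uint32_t nr_segs = 10;         /* TCP_INIT_CWND, no reordering */

        /* skb->head is a power-of-two kmalloc area, so round up first,
         * then add the sk_buff metadata cost on top
         */
        uint32_t per_mss = roundup_pow_of_two(mss + max_tcp_header + shinfo_sz) + skb_sz;
        uint32_t sndmem = 2 * nr_segs * per_mss; /* 2x fast-recovery/POLLOUT cushion */

        printf("per_mss=%u sndmem=%u\n", (unsigned)per_mss, (unsigned)sndmem);
        return 0;
    }

With these assumptions each 1460-byte segment really costs 2304 bytes of memory, so the ten-segment initial window with the 2x cushion asks for about 46 KB, roughly double the old SKB_TRUESIZE(mss_clamp + MAX_TCP_HEADER) * TCP_INIT_CWND estimate, mostly from the explicit 2x recovery factor and the power-of-two rounding.
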
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 		 tcp_default_init_rwnd(mss);
 
+	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+	 * Allow enough cushion so that sender is not limited by our window
+	 */
+	if (sysctl_tcp_moderate_rcvbuf)
+		rcvmem <<= 2;
+
 	if (sk->sk_rcvbuf < rcvmem)
 		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
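
The rcvmem <<= 2 cushion compensates for DRS reaction time: with tcp_moderate_rcvbuf on, the receiver only resizes after measuring 2-3 RTTs, while a slow-starting sender doubles its window every RTT. A toy calculation under assumed truesize and initial-rwnd values:

    /* rcvbuf_sketch.c - why the initial rcvbuf gets a 4x DRS cushion */
    #include <stdio.h>

    int main(void)
    {
        int truesize = 2304; /* assumed SKB_TRUESIZE(mss + MAX_TCP_HEADER) */
        int init_rwnd = 10;  /* assumed tcp_default_init_rwnd(mss) */
        int rcvmem = 2 * truesize * init_rwnd;

        /* DRS has 2-3 RTT latency; leave room for the sender to keep
         * doubling its cwnd until the first measurement lands
         */
        rcvmem <<= 2;
        printf("initial rcvbuf target: %d bytes\n", rcvmem); /* 184320 */
        return 0;
    }
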
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		tcp_fixup_rcvbuf(sk);
 	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-		tcp_fixup_sndbuf(sk);
+		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
+	tp->rcvq_space.time = tcp_time_stamp;
+	tp->rcvq_space.seq = tp->copied_seq;
 
 	maxwin = tcp_full_space(sk);
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time;
-	int space;
-
-	if (tp->rcvq_space.time == 0)
-		goto new_measure;
+	int copied;
 
 	time = tcp_time_stamp - tp->rcvq_space.time;
 	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 		return;
 
-	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)
+		goto new_measure;
 
-	space = max(tp->rcvq_space.space, space);
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
 
-	if (tp->rcvq_space.space != space) {
-		int rcvmem;
+	if (sysctl_tcp_moderate_rcvbuf &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		int rcvwin, rcvmem, rcvbuf;
 
-		tp->rcvq_space.space = space;
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = (copied << 1) + 16 * tp->advmss;
 
-		if (sysctl_tcp_moderate_rcvbuf &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-			int new_clamp = space;
+		/* If rate increased by 25%,
+		 *	assume slow start, rcvwin = 3 * copied
+		 * If rate increased by 50%,
+		 *	assume sender can use 2x growth, rcvwin = 4 * copied
+		 */
+		if (copied >=
+		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+			if (copied >=
+			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+				rcvwin <<= 1;
+			else
+				rcvwin += (rcvwin >> 1);
+		}
 
-			/* Receive space grows, normalize in order to
-			 * take into account packet headers and sk_buff
-			 * structure overhead.
-			 */
-			space /= tp->advmss;
-			if (!space)
-				space = 1;
-			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-			while (tcp_win_from_space(rcvmem) < tp->advmss)
-				rcvmem += 128;
-			space *= rcvmem;
-			space = min(space, sysctl_tcp_rmem[2]);
-			if (space > sk->sk_rcvbuf) {
-				sk->sk_rcvbuf = space;
-
-				/* Make the window clamp follow along. */
-				tp->window_clamp = new_clamp;
-			}
-		}
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+		while (tcp_win_from_space(rcvmem) < tp->advmss)
+			rcvmem += 128;
+
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		if (rcvbuf > sk->sk_rcvbuf) {
+			sk->sk_rcvbuf = rcvbuf;
+
+			/* Make the window clamp follow along. */
+			tp->window_clamp = rcvwin;
+		}
 	}
+	tp->rcvq_space.space = copied;
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
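
The heuristic in this hunk can be modeled in isolation. In the sketch below, the hypothetical drs_rcvwin() mirrors the body of the moderate_rcvbuf branch: copied and prev are bytes delivered to the application in the current and previous measurement RTT, and the 25%/50% growth tests pick the 3x or 4x cushion the comment block describes.

    /* drs_sketch.c - model of the new receive-window heuristic, illustrative only */
    #include <stdio.h>

    static int drs_rcvwin(int copied, int prev, int advmss)
    {
        /* steady state: 2x what was just consumed, plus a fixed cushion */
        int rcvwin = (copied << 1) + 16 * advmss;

        if (copied >= prev + (prev >> 2)) {       /* rate grew >= 25% */
            if (copied >= prev + (prev >> 1))     /* rate grew >= 50% */
                rcvwin <<= 1;                     /* ~4x: sender may double again */
            else
                rcvwin += rcvwin >> 1;            /* ~3x: gentler slow-start cushion */
        }
        return rcvwin;
    }

    int main(void)
    {
        int advmss = 1460;

        /* steady 1 MB per RTT -> ~2 MB window */
        printf("steady:     %d\n", drs_rcvwin(1000000, 1000000, advmss));
        /* rate doubled since last RTT -> ~8 MB window */
        printf("slow start: %d\n", drs_rcvwin(2000000, 1000000, advmss));
        return 0;
    }
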
@@ -713,7 +755,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	if (tp->srtt > 8 + 2)
 		do_div(rate, tp->srtt);
 
-	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+						sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
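
ACCESS_ONCE() here forces a single store so that lockless readers in sch_fq never observe a torn or intermediate value. A rough portable analogue, assuming a C11 toolchain and with fake_sock as a made-up stand-in type (illustration of the pattern, not the kernel macro):

    /* pacing_sketch.c - lockless single-writer publish, C11 analogue of the
     * ACCESS_ONCE() write above
     */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct fake_sock {
        _Atomic uint32_t sk_pacing_rate;
    };

    static void set_pacing_rate(struct fake_sock *sk, uint64_t rate, uint64_t max_rate)
    {
        if (rate > max_rate)
            rate = max_rate; /* min_t(u64, rate, sk->sk_max_pacing_rate) */

        /* one relaxed store: a concurrent reader sees the old value or
         * the new one, never a compiler-invented intermediate
         */
        atomic_store_explicit(&sk->sk_pacing_rate, (uint32_t)rate,
                              memory_order_relaxed);
    }

    int main(void)
    {
        struct fake_sock sk = { 0 };

        set_pacing_rate(&sk, 1250000000ULL, 600000000ULL); /* capped at max */
        printf("%u\n", (unsigned)atomic_load(&sk.sk_pacing_rate));
        return 0;
    }
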
@@ -2973,7 +3020,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	u32 now = tcp_time_stamp;
-	int fully_acked = true;
+	bool fully_acked = true;
 	int flag = 0;
 	u32 pkts_acked = 0;
 	u32 reord = tp->packets_out;
@@ -4704,15 +4751,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = SKB_TRUESIZE(max_t(u32,
-						tp->rx_opt.mss_clamp,
-						tp->mss_cache) +
-					  MAX_TCP_HEADER);
-		int demanded = max_t(unsigned int, tp->snd_cwnd,
-				     tp->reordering + 1);
-		sndmem *= 2 * demanded;
-		if (sndmem > sk->sk_sndbuf)
-			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tcp_sndbuf_expand(sk);
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
@@ -5677,8 +5716,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			tcp_init_congestion_control(sk);
 
 			tcp_mtup_init(sk);
-			tcp_init_buffer_space(sk);
 			tp->copied_seq = tp->rcv_nxt;
+			tcp_init_buffer_space(sk);
 		}
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
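
This swap is not cosmetic: after the tcp_init_buffer_space() change earlier in this patch, that function snapshots tp->copied_seq into rcvq_space.seq, so copied_seq must already be synced to rcv_nxt when the snapshot is taken. A toy model (hypothetical names mirroring the kernel fields):

    /* order_sketch.c - why copied_seq must be set before tcp_init_buffer_space() */
    #include <stdint.h>
    #include <stdio.h>

    struct toy_tp {
        uint32_t rcv_nxt;
        uint32_t copied_seq;
        struct { uint32_t seq; } rcvq_space;
    };

    /* stand-in for the relevant line of tcp_init_buffer_space() */
    static void toy_init_buffer_space(struct toy_tp *tp)
    {
        tp->rcvq_space.seq = tp->copied_seq; /* baseline for the first DRS sample */
    }

    int main(void)
    {
        struct toy_tp tp = { .rcv_nxt = 1001, .copied_seq = 0 };

        tp.copied_seq = tp.rcv_nxt;  /* the hunk moves this line first... */
        toy_init_buffer_space(&tp);  /* ...so the baseline is not stale */
        printf("rcvq_space.seq=%u\n", (unsigned)tp.rcvq_space.seq);
        return 0;
    }
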