about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
author Eric Dumazet <eric.dumazet@gmail.com> 2013-09-20 16:56:58 -0400
committer David S. Miller <davem@davemloft.net> 2013-09-24 11:07:32 -0400
commit b0983d3c9b132c33b6fb2e28d157a1edc18a173c (patch)
tree 7cd835c46ced9d4dd7154d9f554a496dc28cdfd0 /net/ipv4/tcp_input.c
parent 086293542b991fb88a2e41ae7b4f82ac65a20e1a (diff)
tcp: fix dynamic right sizing
Dynamic Right Sizing (DRS) is supposed to open the TCP receive window automatically, but suffers from two bugs, presented in order of importance. 1) tcp_rcv_space_adjust() fix: Using twice the last received amount is very pessimistic, because it doesn't allow fast recovery or proper slow start ramp up, if the sender wants to increase cwin by 100% every RTT. copied = bytes received in previous RTT; 2*copied = bytes we expect to receive in next RTT; 4*copied = bytes we need to advertise in rwin at end of next RTT. DRS is one RTT late, so it needs a 4x factor. If the sender is not using ABC, and increases cwin by 50% every RTT, then we needed a 1.5*1.5 = 2.25 factor. This is probably why this bug was not really noticed. 2) There is no window adjustment after the first RTT. DRS triggers only after the second RTT. DRS needs two RTTs to initialize, so tcp_fixup_rcvbuf() should set up sk_rcvbuf to allow proper window growth for the first two RTTs. This patch increases TCP efficiency particularly for large RTT flows when autotuning is used at the receiver, and more particularly in the presence of packet losses. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Cc: Van Jacobson <vanj@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- net/ipv4/tcp_input.c 84
1 files changed, 53 insertions, 31 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669d..5d083855c111 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -355,6 +355,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
355 rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * 355 rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
356 tcp_default_init_rwnd(mss); 356 tcp_default_init_rwnd(mss);
357 357
358 /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
359 * Allow enough cushion so that sender is not limited by our window
360 */
361 if (sysctl_tcp_moderate_rcvbuf)
362 rcvmem <<= 2;
363
358 if (sk->sk_rcvbuf < rcvmem) 364 if (sk->sk_rcvbuf < rcvmem)
359 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); 365 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
360} 366}
@@ -373,6 +379,8 @@ void tcp_init_buffer_space(struct sock *sk)
373 tcp_fixup_sndbuf(sk); 379 tcp_fixup_sndbuf(sk);
374 380
375 tp->rcvq_space.space = tp->rcv_wnd; 381 tp->rcvq_space.space = tp->rcv_wnd;
382 tp->rcvq_space.time = tcp_time_stamp;
383 tp->rcvq_space.seq = tp->copied_seq;
376 384
377 maxwin = tcp_full_space(sk); 385 maxwin = tcp_full_space(sk);
378 386
@@ -512,48 +520,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
512{ 520{
513 struct tcp_sock *tp = tcp_sk(sk); 521 struct tcp_sock *tp = tcp_sk(sk);
514 int time; 522 int time;
515 int space; 523 int copied;
516
517 if (tp->rcvq_space.time == 0)
518 goto new_measure;
519 524
520 time = tcp_time_stamp - tp->rcvq_space.time; 525 time = tcp_time_stamp - tp->rcvq_space.time;
521 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) 526 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
522 return; 527 return;
523 528
524 space = 2 * (tp->copied_seq - tp->rcvq_space.seq); 529 /* Number of bytes copied to user in last RTT */
530 copied = tp->copied_seq - tp->rcvq_space.seq;
531 if (copied <= tp->rcvq_space.space)
532 goto new_measure;
533
534 /* A bit of theory :
535 * copied = bytes received in previous RTT, our base window
536 * To cope with packet losses, we need a 2x factor
537 * To cope with slow start, and sender growing its cwin by 100 %
538 * every RTT, we need a 4x factor, because the ACK we are sending
539 * now is for the next RTT, not the current one :
540 * <prev RTT . ><current RTT .. ><next RTT .... >
541 */
542
543 if (sysctl_tcp_moderate_rcvbuf &&
544 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
545 int rcvwin, rcvmem, rcvbuf;
525 546
526 space = max(tp->rcvq_space.space, space); 547 /* minimal window to cope with packet losses, assuming
548 * steady state. Add some cushion because of small variations.
549 */
550 rcvwin = (copied << 1) + 16 * tp->advmss;
527 551
528 if (tp->rcvq_space.space != space) { 552 /* If rate increased by 25%,
529 int rcvmem; 553 * assume slow start, rcvwin = 3 * copied
554 * If rate increased by 50%,
555 * assume sender can use 2x growth, rcvwin = 4 * copied
556 */
557 if (copied >=
558 tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
559 if (copied >=
560 tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
561 rcvwin <<= 1;
562 else
563 rcvwin += (rcvwin >> 1);
564 }
530 565
531 tp->rcvq_space.space = space; 566 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
567 while (tcp_win_from_space(rcvmem) < tp->advmss)
568 rcvmem += 128;
532 569
533 if (sysctl_tcp_moderate_rcvbuf && 570 rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
534 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 571 if (rcvbuf > sk->sk_rcvbuf) {
535 int new_clamp = space; 572 sk->sk_rcvbuf = rcvbuf;
536 573
537 /* Receive space grows, normalize in order to 574 /* Make the window clamp follow along. */
538 * take into account packet headers and sk_buff 575 tp->window_clamp = rcvwin;
539 * structure overhead.
540 */
541 space /= tp->advmss;
542 if (!space)
543 space = 1;
544 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
545 while (tcp_win_from_space(rcvmem) < tp->advmss)
546 rcvmem += 128;
547 space *= rcvmem;
548 space = min(space, sysctl_tcp_rmem[2]);
549 if (space > sk->sk_rcvbuf) {
550 sk->sk_rcvbuf = space;
551
552 /* Make the window clamp follow along. */
553 tp->window_clamp = new_clamp;
554 }
555 } 576 }
556 } 577 }
578 tp->rcvq_space.space = copied;
557 579
558new_measure: 580new_measure:
559 tp->rcvq_space.seq = tp->copied_seq; 581 tp->rcvq_space.seq = tp->copied_seq;
@@ -5674,8 +5696,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5674 tcp_init_congestion_control(sk); 5696 tcp_init_congestion_control(sk);
5675 5697
5676 tcp_mtup_init(sk); 5698 tcp_mtup_init(sk);
5677 tcp_init_buffer_space(sk);
5678 tp->copied_seq = tp->rcv_nxt; 5699 tp->copied_seq = tp->rcv_nxt;
5700 tcp_init_buffer_space(sk);
5679 } 5701 }
5680 smp_mb(); 5702 smp_mb();
5681 tcp_set_state(sk, TCP_ESTABLISHED); 5703 tcp_set_state(sk, TCP_ESTABLISHED);