Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	131
1 file changed, 85 insertions(+), 46 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a16b01b537ba..b935397c703c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
 
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to POLLOUT)
+	 */
+	sndmem = 2 * nr_segs * per_mss;
 
-	sndmem *= TCP_INIT_CWND;
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
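
For a feel of what the new sizing produces, here is a minimal standalone sketch of the tcp_sndbuf_expand() arithmetic above. The overhead constants (MAX_TCP_HEADER, the aligned skb struct sizes) are illustrative assumptions, not values taken from this tree.

/* Sketch of the tcp_sndbuf_expand() arithmetic, for a feel of the numbers only. */
#include <stdio.h>
#include <stdint.h>

static uint32_t roundup_pow_of_two(uint32_t v)
{
	uint32_t p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	uint32_t mss = 1460;           /* assumed MSS */
	uint32_t max_tcp_header = 320; /* assumed MAX_TCP_HEADER-sized reserve */
	uint32_t shinfo = 320;         /* assumed SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */
	uint32_t skbuff = 256;         /* assumed SKB_DATA_ALIGN(sizeof(struct sk_buff)) */
	uint32_t init_cwnd = 10;       /* TCP_INIT_CWND */

	/* worst case non-GSO/TSO frame, head kmalloced from a power-of-two area */
	uint32_t per_mss = roundup_pow_of_two(mss + max_tcp_header + shinfo) + skbuff;
	uint32_t nr_segs = init_cwnd;  /* max(TCP_INIT_CWND, snd_cwnd, reordering + 1) */
	uint32_t sndmem = 2 * nr_segs * per_mss;

	printf("per_mss=%u nr_segs=%u sndmem=%u\n", per_mss, nr_segs, sndmem);
	return 0;
}

With these assumed numbers sndmem comes out near 87 KB for an initial window of 10 segments.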
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 		 tcp_default_init_rwnd(mss);
 
+	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+	 * Allow enough cushion so that sender is not limited by our window
+	 */
+	if (sysctl_tcp_moderate_rcvbuf)
+		rcvmem <<= 2;
+
 	if (sk->sk_rcvbuf < rcvmem)
 		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
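
A similar back-of-the-envelope view of the tcp_fixup_rcvbuf() change above: with tcp_moderate_rcvbuf enabled the initial receive buffer is quadrupled so that DRS, which needs a couple of RTTs to react, does not cap the sender early. The truesize and initial-rwnd values below are assumptions for illustration only.

/* Sketch of the initial rcvbuf sizing with the 4x DRS cushion. */
#include <stdio.h>

int main(void)
{
	int truesize_per_mss = 2304;	/* assumed SKB_TRUESIZE(mss + MAX_TCP_HEADER) */
	int init_rwnd = 10;		/* assumed tcp_default_init_rwnd() result */
	int moderate_rcvbuf = 1;	/* sysctl_tcp_moderate_rcvbuf */

	int rcvmem = 2 * truesize_per_mss * init_rwnd;

	if (moderate_rcvbuf)
		rcvmem <<= 2;		/* 4x cushion for DRS latency */

	printf("initial rcvmem = %d bytes\n", rcvmem);	/* ~180 KiB with these numbers */
	return 0;
}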
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		tcp_fixup_rcvbuf(sk);
 	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-		tcp_fixup_sndbuf(sk);
+		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
+	tp->rcvq_space.time = tcp_time_stamp;
+	tp->rcvq_space.seq = tp->copied_seq;
 
 	maxwin = tcp_full_space(sk);
 
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time;
-	int space;
-
-	if (tp->rcvq_space.time == 0)
-		goto new_measure;
+	int copied;
 
 	time = tcp_time_stamp - tp->rcvq_space.time;
 	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 		return;
 
-	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)
+		goto new_measure;
 
-	space = max(tp->rcvq_space.space, space);
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
 
-	if (tp->rcvq_space.space != space) {
-		int rcvmem;
+	if (sysctl_tcp_moderate_rcvbuf &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		int rcvwin, rcvmem, rcvbuf;
 
-		tp->rcvq_space.space = space;
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = (copied << 1) + 16 * tp->advmss;
 
-		if (sysctl_tcp_moderate_rcvbuf &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-			int new_clamp = space;
+		/* If rate increased by 25%,
+		 *	assume slow start, rcvwin = 3 * copied
+		 * If rate increased by 50%,
+		 *	assume sender can use 2x growth, rcvwin = 4 * copied
+		 */
+		if (copied >=
+		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+			if (copied >=
+			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+				rcvwin <<= 1;
+			else
+				rcvwin += (rcvwin >> 1);
+		}
 
-			/* Receive space grows, normalize in order to
-			 * take into account packet headers and sk_buff
-			 * structure overhead.
-			 */
-			space /= tp->advmss;
-			if (!space)
-				space = 1;
-			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-			while (tcp_win_from_space(rcvmem) < tp->advmss)
-				rcvmem += 128;
-			space *= rcvmem;
-			space = min(space, sysctl_tcp_rmem[2]);
-			if (space > sk->sk_rcvbuf) {
-				sk->sk_rcvbuf = space;
-
-				/* Make the window clamp follow along. */
-				tp->window_clamp = new_clamp;
-			}
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+		while (tcp_win_from_space(rcvmem) < tp->advmss)
+			rcvmem += 128;
+
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		if (rcvbuf > sk->sk_rcvbuf) {
+			sk->sk_rcvbuf = rcvbuf;
+
+			/* Make the window clamp follow along. */
+			tp->window_clamp = rcvwin;
 		}
 	}
+	tp->rcvq_space.space = copied;
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
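
The growth policy in the rewritten tcp_rcv_space_adjust() above can be tried in isolation; this is a sketch of the arithmetic only, with advmss and the byte counts chosen as assumptions (in the kernel the function also bails out early when copied <= rcvq_space.space).

/* Sketch of the DRS window-target calculation: 2x the bytes the application
 * consumed in the last RTT plus a 16*MSS cushion, scaled up further while
 * the delivery rate is still growing.
 */
#include <stdio.h>

static int drs_rcvwin(int copied, int prev_space, int advmss)
{
	/* steady-state target: twice the bytes consumed last RTT, plus cushion */
	int rcvwin = (copied << 1) + 16 * advmss;

	if (copied >= prev_space + (prev_space >> 2)) {		/* rate grew >= 25% */
		if (copied >= prev_space + (prev_space >> 1))	/* rate grew >= 50% */
			rcvwin <<= 1;				/* allow ~4x copied */
		else
			rcvwin += rcvwin >> 1;			/* allow ~3x copied */
	}
	return rcvwin;
}

int main(void)
{
	int advmss = 1460;	/* assumed MSS */

	/* throughput doubled in one RTT: window target doubles on top of the base */
	printf("%d\n", drs_rcvwin(200000, 100000, advmss));
	/* throughput grew only ~10%: keep the 2x-plus-cushion base */
	printf("%d\n", drs_rcvwin(110000, 100000, advmss));
	return 0;
}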
@@ -713,7 +755,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	if (tp->srtt > 8 + 2)
 		do_div(rate, tp->srtt);
 
-	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+						sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
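
The ACCESS_ONCE() used above matters because sch_fq reads sk_pacing_rate with no lock held; in kernels of this vintage the macro is essentially a volatile cast, which forces the compiler to emit one store of the final value instead of possibly writing intermediate values into the shared location. A userspace analogue, for illustration only:

/* Illustrative userspace analogue of the ACCESS_ONCE() idiom, not the
 * kernel header.
 */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned int pacing_rate;	/* stands in for sk->sk_pacing_rate */

static void update_rate(unsigned long long rate, unsigned int max_rate)
{
	/* a single store of the clamped value; the compiler may not use
	 * pacing_rate as scratch space for intermediates
	 */
	ACCESS_ONCE(pacing_rate) = rate < max_rate ? (unsigned int)rate : max_rate;
}

int main(void)
{
	update_rate(1250000ULL * 100, ~0U);	/* ~1 Gbit/s expressed in bytes/sec */
	printf("pacing_rate=%u\n", ACCESS_ONCE(pacing_rate));
	return 0;
}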
@@ -2973,7 +3020,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	u32 now = tcp_time_stamp;
-	int fully_acked = true;
+	bool fully_acked = true;
 	int flag = 0;
 	u32 pkts_acked = 0;
 	u32 reord = tp->packets_out;
@@ -4704,15 +4751,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = SKB_TRUESIZE(max_t(u32,
-						tp->rx_opt.mss_clamp,
-						tp->mss_cache) +
-					  MAX_TCP_HEADER);
-		int demanded = max_t(unsigned int, tp->snd_cwnd,
-				     tp->reordering + 1);
-		sndmem *= 2 * demanded;
-		if (sndmem > sk->sk_sndbuf)
-			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tcp_sndbuf_expand(sk);
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
@@ -5677,8 +5716,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_init_congestion_control(sk);
 
 		tcp_mtup_init(sk);
-		tcp_init_buffer_space(sk);
 		tp->copied_seq = tp->rcv_nxt;
+		tcp_init_buffer_space(sk);
 	}
 	smp_mb();
 	tcp_set_state(sk, TCP_ESTABLISHED);