Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 180
1 file changed, 120 insertions(+), 60 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89eaa669d..c53b7f35c51d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
 
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to POLLOUT)
+	 */
+	sndmem = 2 * nr_segs * per_mss;
 
-	sndmem *= TCP_INIT_CWND;
 	if (sk->sk_sndbuf < sndmem)
 		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
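As a rough illustration of the sizing above, the same arithmetic can be written as a standalone C function. This is a user-space sketch, not the kernel code: header_room and overhead stand in for MAX_TCP_HEADER plus the skb_shared_info/sk_buff overheads, and 10 stands in for TCP_INIT_CWND.

#include <stdint.h>

/* Illustrative stand-in for roundup_pow_of_two(). */
static uint32_t roundup_pow2(uint32_t x)
{
	uint32_t r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

/* Worst case: no GSO/TSO, one skb per MSS, skb->head allocated from a
 * power-of-two slab.  header_room and overhead approximate the kernel's
 * per-skb costs; 10 approximates TCP_INIT_CWND.
 */
static uint32_t sndbuf_target(uint32_t mss, uint32_t cwnd, uint32_t reordering,
			      uint32_t header_room, uint32_t overhead)
{
	uint32_t per_mss = roundup_pow2(mss + header_room) + overhead;
	uint32_t nr_segs = cwnd > 10 ? cwnd : 10;

	if (nr_segs < reordering + 1)
		nr_segs = reordering + 1;

	/* 2x cushion for fast recovery and slow reaction to POLLOUT */
	return 2 * nr_segs * per_mss;
}

With mss = 1460, the default ten-segment window and no reordering, per_mss rounds up to 2048 plus the fixed overhead, so the target lands somewhere in the 40-50 KB range.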
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 		 tcp_default_init_rwnd(mss);
 
+	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+	 * Allow enough cushion so that sender is not limited by our window
+	 */
+	if (sysctl_tcp_moderate_rcvbuf)
+		rcvmem <<= 2;
+
 	if (sk->sk_rcvbuf < rcvmem)
 		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
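A quick back-of-the-envelope reading of that cushion, as a throwaway user-space calculation; the 1024-byte per-skb overhead and the ten-segment initial window are assumptions, not the kernel's exact SKB_TRUESIZE() or tcp_default_init_rwnd() values.

#include <stdio.h>

int main(void)
{
	int mss = 1460;
	int truesize = mss + 1024;	/* assumed skb + header overhead */
	int init_rwnd = 10;		/* assumed initial receive window, in segments */
	int rcvmem = 2 * truesize * init_rwnd;

	/* DRS needs 2-3 RTTs to react, so start with a 4x cushion when
	 * receive-buffer autotuning is enabled.
	 */
	rcvmem <<= 2;

	printf("initial rcvbuf target: %d bytes\n", rcvmem);	/* roughly 200 KB here */
	return 0;
}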
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
 	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 		tcp_fixup_rcvbuf(sk);
 	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-		tcp_fixup_sndbuf(sk);
+		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
+	tp->rcvq_space.time = tcp_time_stamp;
+	tp->rcvq_space.seq = tp->copied_seq;
 
 	maxwin = tcp_full_space(sk);
 
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time;
-	int space;
-
-	if (tp->rcvq_space.time == 0)
-		goto new_measure;
+	int copied;
 
 	time = tcp_time_stamp - tp->rcvq_space.time;
 	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 		return;
 
-	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)
+		goto new_measure;
 
-	space = max(tp->rcvq_space.space, space);
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
 
-	if (tp->rcvq_space.space != space) {
-		int rcvmem;
+	if (sysctl_tcp_moderate_rcvbuf &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		int rcvwin, rcvmem, rcvbuf;
 
-		tp->rcvq_space.space = space;
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = (copied << 1) + 16 * tp->advmss;
 
-		if (sysctl_tcp_moderate_rcvbuf &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-			int new_clamp = space;
+		/* If rate increased by 25%,
+		 *	assume slow start, rcvwin = 3 * copied
+		 * If rate increased by 50%,
+		 *	assume sender can use 2x growth, rcvwin = 4 * copied
+		 */
+		if (copied >=
+		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+			if (copied >=
+			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+				rcvwin <<= 1;
+			else
+				rcvwin += (rcvwin >> 1);
+		}
 
-			/* Receive space grows, normalize in order to
-			 * take into account packet headers and sk_buff
-			 * structure overhead.
-			 */
-			space /= tp->advmss;
-			if (!space)
-				space = 1;
-			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-			while (tcp_win_from_space(rcvmem) < tp->advmss)
-				rcvmem += 128;
-			space *= rcvmem;
-			space = min(space, sysctl_tcp_rmem[2]);
-			if (space > sk->sk_rcvbuf) {
-				sk->sk_rcvbuf = space;
-
-				/* Make the window clamp follow along. */
-				tp->window_clamp = new_clamp;
-			}
+		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+		while (tcp_win_from_space(rcvmem) < tp->advmss)
+			rcvmem += 128;
+
+		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+		if (rcvbuf > sk->sk_rcvbuf) {
+			sk->sk_rcvbuf = rcvbuf;
+
+			/* Make the window clamp follow along. */
+			tp->window_clamp = rcvwin;
 		}
 	}
+	tp->rcvq_space.space = copied;
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
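The window-growth decision in the new tcp_rcv_space_adjust() can be condensed into one helper for reading purposes. This is a simplified sketch of the logic above, not a drop-in replacement: the truesize conversion, the clamp against tcp_rmem[2] and the SOCK_RCVBUF_LOCK check are omitted.

/* Pick the advertised-window target from the bytes the application
 * consumed in the last RTT (copied) versus the previous measurement
 * (prev_space).  Mirrors the patch: 2x base plus a 16-segment cushion,
 * grown to roughly 3x copied if the rate rose by 25% and roughly 4x
 * copied if it rose by 50%.
 */
static unsigned int drs_rcvwin(unsigned int copied, unsigned int prev_space,
			       unsigned int advmss)
{
	unsigned int rcvwin = (copied << 1) + 16 * advmss;

	if (copied >= prev_space + (prev_space >> 2)) {		/* +25% : slow start */
		if (copied >= prev_space + (prev_space >> 1))	/* +50% : 2x growth */
			rcvwin <<= 1;
		else
			rcvwin += rcvwin >> 1;
	}
	return rcvwin;
}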
@@ -713,7 +755,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
 	if (tp->srtt > 8 + 2)
 		do_div(rate, tp->srtt);
 
-	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+						sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
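ACCESS_ONCE() is the kernel's volatile-cast macro; a minimal user-space analogue of the pattern looks like the sketch below. The volatile access keeps the compiler from splitting or re-materialising the store that the lockless reader in sch_fq would otherwise race against; it does not by itself provide architecture-level atomicity guarantees.

#include <stdint.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static uint64_t pacing_rate;	/* read by another thread without a lock */

static void set_rate(uint64_t rate, uint64_t max_rate)
{
	if (rate > max_rate)
		rate = max_rate;
	/* force one store of the final value, no compiler intermediates */
	ACCESS_ONCE(pacing_rate) = rate;
}

static uint64_t sample_rate(void)
{
	return ACCESS_ONCE(pacing_rate);	/* force one load */
}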
@@ -1284,7 +1331,10 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
 	}
 
-	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
+	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;
+
 	if (skb == tcp_highest_sack(sk))
 		tcp_advance_highest_sack(sk, skb);
 
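The end_seq adjustment above exists because a FIN occupies one unit of sequence space even though it carries no payload. A toy model of the merge step makes the accounting visible; the struct and flag value are stand-ins, and payload accounting is omitted.

#include <stdint.h>

#define TOY_FIN	0x01

struct toy_seg {
	uint32_t seq;
	uint32_t end_seq;	/* seq + payload length, +1 if FIN is set */
	uint8_t  flags;
};

/* Collapse skb's control flags into prev after its data has been shifted.
 * Dropping the FIN here would leave the connection unable to close
 * cleanly, so the flag is carried over and end_seq grows by the one
 * sequence number the FIN consumes.
 */
static void merge_flags(struct toy_seg *prev, const struct toy_seg *skb)
{
	prev->flags |= skb->flags;
	if (skb->flags & TOY_FIN)
		prev->end_seq++;
}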
@@ -2853,7 +2903,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * left edge of the send window.
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 */
-	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	    flag & FLAG_ACKED)
 		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
 	if (seq_rtt < 0)
@@ -2868,20 +2919,25 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
-static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	s32 seq_rtt = -1;
 
-	if (tp->lsndtime && !tp->total_retrans)
-		seq_rtt = tcp_time_stamp - tp->lsndtime;
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
+	if (synack_stamp && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - synack_stamp;
+
+	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+	 */
+	if (!tp->srtt)
+		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
 	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -2970,7 +3026,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	u32 now = tcp_time_stamp;
-	int fully_acked = true;
+	bool fully_acked = true;
 	int flag = 0;
 	u32 pkts_acked = 0;
 	u32 reord = tp->packets_out;
@@ -2978,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	s32 seq_rtt = -1;
 	s32 ca_seq_rtt = -1;
 	ktime_t last_ackt = net_invalid_timestamp();
+	bool rtt_update;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3054,14 +3111,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
-	    (flag & FLAG_ACKED))
-		tcp_rearm_rto(sk);
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
 
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
 
+		tcp_rearm_rto(sk);
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3100,6 +3156,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
 		}
+	} else if (skb && rtt_update && sack_rtt >= 0 &&
+		   sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
+		/* Do not re-arm RTO if the sack RTT is measured from data sent
+		 * after when the head was last (re)transmitted. Otherwise the
+		 * timeout may continue to extend in loss recovery.
+		 */
+		tcp_rearm_rto(sk);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -3288,7 +3351,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		tcp_init_cwnd_reduction(sk, true);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 		tcp_end_cwnd_reduction(sk);
-		tcp_set_ca_state(sk, TCP_CA_Open);
+		tcp_try_keep_open(sk);
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPLOSSPROBERECOVERY);
 	}
@@ -3391,7 +3454,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* Advance cwnd if state allows */
 	if (tcp_may_raise_cwnd(sk, flag))
-		tcp_cong_avoid(sk, ack, prior_in_flight);
+		tcp_cong_avoid(sk, ack, acked, prior_in_flight);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -4701,15 +4764,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk)) {
-		int sndmem = SKB_TRUESIZE(max_t(u32,
-						tp->rx_opt.mss_clamp,
-						tp->mss_cache) +
-					  MAX_TCP_HEADER);
-		int demanded = max_t(unsigned int, tp->snd_cwnd,
-				     tp->reordering + 1);
-		sndmem *= 2 * demanded;
-		if (sndmem > sk->sk_sndbuf)
-			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tcp_sndbuf_expand(sk);
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 
@@ -5584,6 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	struct request_sock *req;
 	int queued = 0;
 	bool acceptable;
+	u32 synack_stamp;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5666,16 +5722,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
+			synack_stamp = tcp_rsk(req)->snt_synack;
 			tp->total_retrans = req->num_retrans;
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
+			synack_stamp = tp->lsndtime;
 			/* Make sure socket is routed, for correct metrics. */
 			icsk->icsk_af_ops->rebuild_header(sk);
 			tcp_init_congestion_control(sk);
 
 			tcp_mtup_init(sk);
-			tcp_init_buffer_space(sk);
 			tp->copied_seq = tp->rcv_nxt;
+			tcp_init_buffer_space(sk);
 		}
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5691,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-		tcp_synack_rtt_meas(sk, req);
+		tcp_synack_rtt_meas(sk, synack_stamp);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5709,6 +5767,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		} else
 			tcp_init_metrics(sk);
 
+		tcp_update_pacing_rate(sk);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
 		tp->lsndtime = tcp_time_stamp;
 