diff options
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 113 |
1 files changed, 69 insertions, 44 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd8458588..882e0b0964d0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | |||
55 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; | 55 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; |
56 | 56 | ||
57 | int sysctl_tcp_mtu_probing __read_mostly = 0; | 57 | int sysctl_tcp_mtu_probing __read_mostly = 0; |
58 | int sysctl_tcp_base_mss __read_mostly = 512; | 58 | int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; |
59 | 59 | ||
60 | /* By default, RFC2861 behavior. */ | 60 | /* By default, RFC2861 behavior. */ |
61 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 61 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
@@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) | |||
73 | tcp_advance_send_head(sk, skb); | 73 | tcp_advance_send_head(sk, skb); |
74 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; | 74 | tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; |
75 | 75 | ||
76 | /* Don't override Nagle indefinately with F-RTO */ | 76 | /* Don't override Nagle indefinitely with F-RTO */ |
77 | if (tp->frto_counter == 2) | 77 | if (tp->frto_counter == 2) |
78 | tp->frto_counter = 3; | 78 | tp->frto_counter = 3; |
79 | 79 | ||
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk) | |||
119 | struct dst_entry *dst = __sk_dst_get(sk); | 119 | struct dst_entry *dst = __sk_dst_get(sk); |
120 | int mss = tp->advmss; | 120 | int mss = tp->advmss; |
121 | 121 | ||
122 | if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { | 122 | if (dst) { |
123 | mss = dst_metric(dst, RTAX_ADVMSS); | 123 | unsigned int metric = dst_metric_advmss(dst); |
124 | tp->advmss = mss; | 124 | |
125 | if (metric < mss) { | ||
126 | mss = metric; | ||
127 | tp->advmss = mss; | ||
128 | } | ||
125 | } | 129 | } |
126 | 130 | ||
127 | return (__u16)mss; | 131 | return (__u16)mss; |
@@ -224,24 +228,22 @@ void tcp_select_initial_window(int __space, __u32 mss, | |||
224 | } | 228 | } |
225 | } | 229 | } |
226 | 230 | ||
227 | /* Set initial window to value enough for senders, | 231 | /* Set initial window to a value enough for senders starting with |
228 | * following RFC2414. Senders, not following this RFC, | 232 | * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place |
229 | * will be satisfied with 2. | 233 | * a limit on the initial window when mss is larger than 1460. |
230 | */ | 234 | */ |
231 | if (mss > (1 << *rcv_wscale)) { | 235 | if (mss > (1 << *rcv_wscale)) { |
232 | int init_cwnd = 4; | 236 | int init_cwnd = TCP_DEFAULT_INIT_RCVWND; |
233 | if (mss > 1460 * 3) | 237 | if (mss > 1460) |
234 | init_cwnd = 2; | 238 | init_cwnd = |
235 | else if (mss > 1460) | 239 | max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); |
236 | init_cwnd = 3; | ||
237 | /* when initializing use the value from init_rcv_wnd | 240 | /* when initializing use the value from init_rcv_wnd |
238 | * rather than the default from above | 241 | * rather than the default from above |
239 | */ | 242 | */ |
240 | if (init_rcv_wnd && | 243 | if (init_rcv_wnd) |
241 | (*rcv_wnd > init_rcv_wnd * mss)) | 244 | *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); |
242 | *rcv_wnd = init_rcv_wnd * mss; | 245 | else |
243 | else if (*rcv_wnd > init_cwnd * mss) | 246 | *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); |
244 | *rcv_wnd = init_cwnd * mss; | ||
245 | } | 247 | } |
246 | 248 | ||
247 | /* Set the clamp no higher than max representable value */ | 249 | /* Set the clamp no higher than max representable value */ |
@@ -392,27 +394,30 @@ struct tcp_out_options { | |||
392 | */ | 394 | */ |
393 | static u8 tcp_cookie_size_check(u8 desired) | 395 | static u8 tcp_cookie_size_check(u8 desired) |
394 | { | 396 | { |
395 | if (desired > 0) { | 397 | int cookie_size; |
398 | |||
399 | if (desired > 0) | ||
396 | /* previously specified */ | 400 | /* previously specified */ |
397 | return desired; | 401 | return desired; |
398 | } | 402 | |
399 | if (sysctl_tcp_cookie_size <= 0) { | 403 | cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); |
404 | if (cookie_size <= 0) | ||
400 | /* no default specified */ | 405 | /* no default specified */ |
401 | return 0; | 406 | return 0; |
402 | } | 407 | |
403 | if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { | 408 | if (cookie_size <= TCP_COOKIE_MIN) |
404 | /* value too small, specify minimum */ | 409 | /* value too small, specify minimum */ |
405 | return TCP_COOKIE_MIN; | 410 | return TCP_COOKIE_MIN; |
406 | } | 411 | |
407 | if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { | 412 | if (cookie_size >= TCP_COOKIE_MAX) |
408 | /* value too large, specify maximum */ | 413 | /* value too large, specify maximum */ |
409 | return TCP_COOKIE_MAX; | 414 | return TCP_COOKIE_MAX; |
410 | } | 415 | |
411 | if (0x1 & sysctl_tcp_cookie_size) { | 416 | if (cookie_size & 1) |
412 | /* 8-bit multiple, illegal, fix it */ | 417 | /* 8-bit multiple, illegal, fix it */ |
413 | return (u8)(sysctl_tcp_cookie_size + 0x1); | 418 | cookie_size++; |
414 | } | 419 | |
415 | return (u8)sysctl_tcp_cookie_size; | 420 | return (u8)cookie_size; |
416 | } | 421 | } |
417 | 422 | ||
418 | /* Write previously computed TCP options to the packet. | 423 | /* Write previously computed TCP options to the packet. |
@@ -828,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
828 | &md5); | 833 | &md5); |
829 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); | 834 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); |
830 | 835 | ||
831 | if (tcp_packets_in_flight(tp) == 0) | 836 | if (tcp_packets_in_flight(tp) == 0) { |
832 | tcp_ca_event(sk, CA_EVENT_TX_START); | 837 | tcp_ca_event(sk, CA_EVENT_TX_START); |
838 | skb->ooo_okay = 1; | ||
839 | } else | ||
840 | skb->ooo_okay = 0; | ||
833 | 841 | ||
834 | skb_push(skb, tcp_header_size); | 842 | skb_push(skb, tcp_header_size); |
835 | skb_reset_transport_header(skb); | 843 | skb_reset_transport_header(skb); |
@@ -891,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
891 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, | 899 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, |
892 | tcp_skb_pcount(skb)); | 900 | tcp_skb_pcount(skb)); |
893 | 901 | ||
894 | err = icsk->icsk_af_ops->queue_xmit(skb); | 902 | err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); |
895 | if (likely(err <= 0)) | 903 | if (likely(err <= 0)) |
896 | return err; | 904 | return err; |
897 | 905 | ||
@@ -995,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
995 | int nlen; | 1003 | int nlen; |
996 | u8 flags; | 1004 | u8 flags; |
997 | 1005 | ||
998 | BUG_ON(len > skb->len); | 1006 | if (WARN_ON(len > skb->len)) |
1007 | return -EINVAL; | ||
999 | 1008 | ||
1000 | nsize = skb_headlen(skb) - len; | 1009 | nsize = skb_headlen(skb) - len; |
1001 | if (nsize < 0) | 1010 | if (nsize < 0) |
@@ -1342,7 +1351,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, | |||
1342 | return 0; | 1351 | return 0; |
1343 | } | 1352 | } |
1344 | 1353 | ||
1345 | /* Intialize TSO state of a skb. | 1354 | /* Initialize TSO state of a skb. |
1346 | * This must be invoked the first time we consider transmitting | 1355 | * This must be invoked the first time we consider transmitting |
1347 | * SKB onto the wire. | 1356 | * SKB onto the wire. |
1348 | */ | 1357 | */ |
@@ -1376,9 +1385,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, | |||
1376 | const struct sk_buff *skb, | 1385 | const struct sk_buff *skb, |
1377 | unsigned mss_now, int nonagle) | 1386 | unsigned mss_now, int nonagle) |
1378 | { | 1387 | { |
1379 | return (skb->len < mss_now && | 1388 | return skb->len < mss_now && |
1380 | ((nonagle & TCP_NAGLE_CORK) || | 1389 | ((nonagle & TCP_NAGLE_CORK) || |
1381 | (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); | 1390 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); |
1382 | } | 1391 | } |
1383 | 1392 | ||
1384 | /* Return non-zero if the Nagle test allows this packet to be | 1393 | /* Return non-zero if the Nagle test allows this packet to be |
@@ -1449,10 +1458,10 @@ int tcp_may_send_now(struct sock *sk) | |||
1449 | struct tcp_sock *tp = tcp_sk(sk); | 1458 | struct tcp_sock *tp = tcp_sk(sk); |
1450 | struct sk_buff *skb = tcp_send_head(sk); | 1459 | struct sk_buff *skb = tcp_send_head(sk); |
1451 | 1460 | ||
1452 | return (skb && | 1461 | return skb && |
1453 | tcp_snd_test(sk, skb, tcp_current_mss(sk), | 1462 | tcp_snd_test(sk, skb, tcp_current_mss(sk), |
1454 | (tcp_skb_is_last(sk, skb) ? | 1463 | (tcp_skb_is_last(sk, skb) ? |
1455 | tp->nonagle : TCP_NAGLE_PUSH))); | 1464 | tp->nonagle : TCP_NAGLE_PUSH)); |
1456 | } | 1465 | } |
1457 | 1466 | ||
1458 | /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet | 1467 | /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet |
@@ -1519,6 +1528,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1519 | struct tcp_sock *tp = tcp_sk(sk); | 1528 | struct tcp_sock *tp = tcp_sk(sk); |
1520 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1529 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1521 | u32 send_win, cong_win, limit, in_flight; | 1530 | u32 send_win, cong_win, limit, in_flight; |
1531 | int win_divisor; | ||
1522 | 1532 | ||
1523 | if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) | 1533 | if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) |
1524 | goto send_now; | 1534 | goto send_now; |
@@ -1550,13 +1560,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1550 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) | 1560 | if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) |
1551 | goto send_now; | 1561 | goto send_now; |
1552 | 1562 | ||
1553 | if (sysctl_tcp_tso_win_divisor) { | 1563 | win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); |
1564 | if (win_divisor) { | ||
1554 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); | 1565 | u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); |
1555 | 1566 | ||
1556 | /* If at least some fraction of a window is available, | 1567 | /* If at least some fraction of a window is available, |
1557 | * just use it. | 1568 | * just use it. |
1558 | */ | 1569 | */ |
1559 | chunk /= sysctl_tcp_tso_win_divisor; | 1570 | chunk /= win_divisor; |
1560 | if (limit >= chunk) | 1571 | if (limit >= chunk) |
1561 | goto send_now; | 1572 | goto send_now; |
1562 | } else { | 1573 | } else { |
@@ -2152,7 +2163,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
2152 | if (!tp->retrans_stamp) | 2163 | if (!tp->retrans_stamp) |
2153 | tp->retrans_stamp = TCP_SKB_CB(skb)->when; | 2164 | tp->retrans_stamp = TCP_SKB_CB(skb)->when; |
2154 | 2165 | ||
2155 | tp->undo_retrans++; | 2166 | tp->undo_retrans += tcp_skb_pcount(skb); |
2156 | 2167 | ||
2157 | /* snd_nxt is stored to detect loss of retransmitted segment, | 2168 | /* snd_nxt is stored to detect loss of retransmitted segment, |
2158 | * see tcp_input.c tcp_sacktag_write_queue(). | 2169 | * see tcp_input.c tcp_sacktag_write_queue(). |
@@ -2421,7 +2432,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2421 | 2432 | ||
2422 | skb_dst_set(skb, dst_clone(dst)); | 2433 | skb_dst_set(skb, dst_clone(dst)); |
2423 | 2434 | ||
2424 | mss = dst_metric(dst, RTAX_ADVMSS); | 2435 | mss = dst_metric_advmss(dst); |
2425 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2436 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
2426 | mss = tp->rx_opt.user_mss; | 2437 | mss = tp->rx_opt.user_mss; |
2427 | 2438 | ||
@@ -2429,6 +2440,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2429 | __u8 rcv_wscale; | 2440 | __u8 rcv_wscale; |
2430 | /* Set this up on the first call only */ | 2441 | /* Set this up on the first call only */ |
2431 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); | 2442 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); |
2443 | |||
2444 | /* limit the window selection if the user enforce a smaller rx buffer */ | ||
2445 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && | ||
2446 | (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) | ||
2447 | req->window_clamp = tcp_full_space(sk); | ||
2448 | |||
2432 | /* tcp_full_space because it is guaranteed to be the first packet */ | 2449 | /* tcp_full_space because it is guaranteed to be the first packet */ |
2433 | tcp_select_initial_window(tcp_full_space(sk), | 2450 | tcp_select_initial_window(tcp_full_space(sk), |
2434 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | 2451 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), |
@@ -2549,12 +2566,17 @@ static void tcp_connect_init(struct sock *sk) | |||
2549 | 2566 | ||
2550 | if (!tp->window_clamp) | 2567 | if (!tp->window_clamp) |
2551 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 2568 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
2552 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 2569 | tp->advmss = dst_metric_advmss(dst); |
2553 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) | 2570 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) |
2554 | tp->advmss = tp->rx_opt.user_mss; | 2571 | tp->advmss = tp->rx_opt.user_mss; |
2555 | 2572 | ||
2556 | tcp_initialize_rcv_mss(sk); | 2573 | tcp_initialize_rcv_mss(sk); |
2557 | 2574 | ||
2575 | /* limit the window selection if the user enforce a smaller rx buffer */ | ||
2576 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && | ||
2577 | (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) | ||
2578 | tp->window_clamp = tcp_full_space(sk); | ||
2579 | |||
2558 | tcp_select_initial_window(tcp_full_space(sk), | 2580 | tcp_select_initial_window(tcp_full_space(sk), |
2559 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 2581 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
2560 | &tp->rcv_wnd, | 2582 | &tp->rcv_wnd, |
@@ -2587,6 +2609,7 @@ int tcp_connect(struct sock *sk) | |||
2587 | { | 2609 | { |
2588 | struct tcp_sock *tp = tcp_sk(sk); | 2610 | struct tcp_sock *tp = tcp_sk(sk); |
2589 | struct sk_buff *buff; | 2611 | struct sk_buff *buff; |
2612 | int err; | ||
2590 | 2613 | ||
2591 | tcp_connect_init(sk); | 2614 | tcp_connect_init(sk); |
2592 | 2615 | ||
@@ -2609,7 +2632,9 @@ int tcp_connect(struct sock *sk) | |||
2609 | sk->sk_wmem_queued += buff->truesize; | 2632 | sk->sk_wmem_queued += buff->truesize; |
2610 | sk_mem_charge(sk, buff->truesize); | 2633 | sk_mem_charge(sk, buff->truesize); |
2611 | tp->packets_out += tcp_skb_pcount(buff); | 2634 | tp->packets_out += tcp_skb_pcount(buff); |
2612 | tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | 2635 | err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); |
2636 | if (err == -ECONNREFUSED) | ||
2637 | return err; | ||
2613 | 2638 | ||
2614 | /* We change tp->snd_nxt after the tcp_transmit_skb() call | 2639 | /* We change tp->snd_nxt after the tcp_transmit_skb() call |
2615 | * in order to make this packet get counted in tcpOutSegs. | 2640 | * in order to make this packet get counted in tcpOutSegs. |