Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  |  156
 1 file changed, 90 insertions(+), 66 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3af21296d967..65caf8b95e17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+		       tcp_ca_needs_ecn(sk);
+
+	if (!use_ecn) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+			use_ecn = true;
+	}
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-	    tcp_ca_needs_ecn(sk)) {
+
+	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk))
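
Note: the hunk above widens the ECN-on-SYN decision. An active connection now requests ECN not only when net.ipv4.tcp_ecn is 1 or the congestion-control module requires it, but also when the destination route carries the RTAX_FEATURE_ECN feature bit. A minimal standalone sketch of the resulting predicate, using hypothetical stand-in types rather than the kernel's struct sock and dst_entry:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* stands in for dst_feature(dst, RTAX_FEATURE_ECN) on the real dst_entry */
struct fake_dst { bool feature_ecn; };

static bool syn_wants_ecn(int sysctl_tcp_ecn, bool ca_needs_ecn,
                          const struct fake_dst *dst)
{
        bool use_ecn = sysctl_tcp_ecn == 1 || ca_needs_ecn;

        if (!use_ecn && dst && dst->feature_ecn)
                use_ecn = true;
        return use_ecn;
}

int main(void)
{
        struct fake_dst dst = { .feature_ecn = true };

        /* sysctl off, congestion module indifferent, route asks for ECN */
        printf("%d\n", syn_wants_ecn(0, false, &dst));  /* prints 1 */
        return 0;
}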
@@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+	u32 bytes, segs;
+
+	bytes = min(sk->sk_pacing_rate >> 10,
+		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+	/* Goal is to send at least one packet per ms,
+	 * not one big TSO packet every 100 ms.
+	 * This preserves ACK clocking and is consistent
+	 * with tcp_tso_should_defer() heuristic.
+	 */
+	segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+	return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
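
Note: tcp_tso_autosize() above sizes TSO bursts from the pacing rate. sk_pacing_rate is in bytes per second, so shifting right by 10 approximates the byte budget of one millisecond (2^10 = 1024 ~ 1000). A userspace sketch of the same arithmetic, with illustrative values standing in for the socket fields and sysctls (the sk_gso_max_size clamp on bytes is omitted for brevity):

#include <stdint.h>
#include <stdio.h>

static uint32_t tso_autosize(uint64_t pacing_rate_Bps, uint32_t mss,
                             uint32_t min_tso_segs, uint32_t gso_max_segs)
{
        uint64_t bytes = pacing_rate_Bps >> 10;  /* ~1 ms worth of bytes */
        uint32_t segs = (uint32_t)(bytes / mss);

        if (segs < min_tso_segs)   /* never below the sysctl floor */
                segs = min_tso_segs;
        if (segs > gso_max_segs)   /* never above the device limit */
                segs = gso_max_segs;
        return segs;
}

int main(void)
{
        /* 10 Gb/s (~1.25e9 B/s), MSS 1448: ~843 segs/ms, clamped to 64 */
        printf("%u\n", tso_autosize(1250000000ULL, 1448, 2, 64));
        /* 1 Mb/s (125000 B/s): 0 segs/ms, floored at min_tso_segs = 2 */
        printf("%u\n", tso_autosize(125000ULL, 1448, 2, 64));
        return 0;
}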
@@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
 					 const struct sk_buff *skb)
 {
-	u32 in_flight, cwnd;
+	u32 in_flight, cwnd, halfcwnd;
 
 	/* Don't be strict about the congestion window for the final FIN. */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
@@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
 
 	in_flight = tcp_packets_in_flight(tp);
 	cwnd = tp->snd_cwnd;
-	if (in_flight < cwnd)
-		return (cwnd - in_flight);
+	if (in_flight >= cwnd)
+		return 0;
 
-	return 0;
+	/* For better scheduling, ensure we have at least
+	 * 2 GSO packets in flight.
+	 */
+	halfcwnd = max(cwnd >> 1, 1U);
+	return min(halfcwnd, cwnd - in_flight);
 }
 
 /* Initialize TSO state of a skb.
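
Note: tcp_cwnd_test() still returns the unused congestion-window budget, but now caps it at half the window, so a single burst can no longer consume the whole cwnd; keeping at least two GSO bursts in flight lets returning ACKs clock out the next one. With cwnd = 10 and in_flight = 2, the old code allowed 8 segments, the new one min(5, 8) = 5. A compact check of the new arithmetic (local names, not kernel API):

#include <assert.h>
#include <stdint.h>

static uint32_t cwnd_quota(uint32_t cwnd, uint32_t in_flight)
{
        uint32_t halfcwnd;

        if (in_flight >= cwnd)
                return 0;
        halfcwnd = cwnd / 2 > 1 ? cwnd / 2 : 1;  /* max(cwnd >> 1, 1U) */
        return halfcwnd < cwnd - in_flight ? halfcwnd : cwnd - in_flight;
}

int main(void)
{
        assert(cwnd_quota(10, 2) == 5);  /* capped at half the window */
        assert(cwnd_quota(10, 8) == 2);  /* remaining budget is smaller */
        assert(cwnd_quota(1, 0) == 1);   /* halfcwnd is floored at 1 */
        return 0;
}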
@@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited)
+				 bool *is_cwnd_limited, u32 max_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   tp->xmit_size_goal_segs * tp->mss_cache))
+	if (limit >= max_segs * tp->mss_cache)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
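
Note: with max_segs passed in, the "full-sized TSO skb" threshold in tcp_tso_should_defer() scales with the pacing rate instead of the fixed sk_gso_max_size / xmit_size_goal_segs goal. For example, if autosizing yields max_segs = 8 with mss_cache = 1448, an allowance of 8 * 1448 = 11584 bytes is already enough to send immediately, whereas the old test could keep a slow flow deferring in pursuit of a much larger super-packet.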
@@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	int cwnd_quota;
 	int result;
 	bool is_cwnd_limited = false;
+	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		}
 	}
 
+	max_segs = tcp_tso_autosize(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
@@ -1984,17 +2019,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
 
-		if (tso_segs == 1) {
+		if (tso_segs == 1 || !max_segs) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
 						      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
 			if (!push_one &&
-			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 max_segs))
 				break;
 		}
 
+		limit = mss_now;
+		if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
 		/* TCP Small Queues :
 		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 		 * This allows for :
@@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * of queued bytes to ensure line rate.
 		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-			      sk->sk_pacing_rate >> 10);
+		limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
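
Note: the TCP Small Queues cap changes shape here. The limit becomes roughly one millisecond of payload at the pacing rate (sk_pacing_rate >> 10), never less than two skb truesizes so at least two packets can sit in the qdisc, and sysctl_tcp_limit_output_bytes turns from a floor into a ceiling. A sketch of the computation with illustrative inputs:

#include <stdint.h>
#include <stdio.h>

static uint32_t tsq_limit(uint32_t truesize, uint64_t pacing_rate_Bps,
                          uint32_t limit_output_bytes)
{
        uint64_t limit = pacing_rate_Bps >> 10;  /* ~1 ms worth of bytes */

        if (limit < 2ULL * truesize)      /* keep at least 2 packets queued */
                limit = 2ULL * truesize;
        if (limit > limit_output_bytes)   /* the sysctl is now a cap */
                limit = limit_output_bytes;
        return (uint32_t)limit;
}

int main(void)
{
        /* slow flow, 1 Mb/s with 2 KB truesize: floored at 2 * 2048 */
        printf("%u\n", tsq_limit(2048, 125000ULL, 262144));      /* 4096 */
        /* fast flow, 10 Gb/s: capped by tcp_limit_output_bytes */
        printf("%u\n", tsq_limit(2048, 1250000000ULL, 262144));  /* 262144 */
        return 0;
}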
@@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
-		limit = mss_now;
-		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  sk->sk_gso_max_segs),
-						    nonagle);
-
-		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-			break;
-
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
 			break;
 
@@ -2126,7 +2162,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 static bool skb_still_in_host_queue(const struct sock *sk,
 				    const struct sk_buff *skb)
 {
-	if (unlikely(skb_fclone_busy(skb))) {
+	if (unlikely(skb_fclone_busy(sk, skb))) {
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 		return true;
@@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
-	struct sk_buff *syn_data = NULL, *data;
+	int syn_loss = 0, space, err = 0;
 	unsigned long last_syn_loss = 0;
+	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;	/* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
@@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	/* limit to order-0 allocations */
 	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
 
-	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
-				   sk->sk_allocation);
-	if (syn_data == NULL)
+	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+	if (!syn_data)
+		goto fallback;
+	syn_data->ip_summed = CHECKSUM_PARTIAL;
+	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+					 fo->data->msg_iter.iov, 0, space))) {
+		kfree_skb(syn_data);
 		goto fallback;
+	}
 
-	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
-		struct iovec *iov = &fo->data->msg_iov[i];
-		unsigned char __user *from = iov->iov_base;
-		int len = iov->iov_len;
+	/* No more data pending in inet_wait_for_connect() */
+	if (space == fo->size)
+		fo->data = NULL;
+	fo->copied = space;
 
-		if (syn_data->len + len > space)
-			len = space - syn_data->len;
-		else if (i + 1 == iovlen)
-			/* No more data pending in inet_wait_for_connect() */
-			fo->data = NULL;
+	tcp_connect_queue_skb(sk, syn_data);
 
-		if (skb_add_data(syn_data, from, len))
-			goto fallback;
-	}
+	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	/* Queue a data-only packet after the regular SYN for retransmission */
-	data = pskb_copy(syn_data, sk->sk_allocation);
-	if (data == NULL)
-		goto fallback;
-	TCP_SKB_CB(data)->seq++;
-	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
-	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
-	tcp_connect_queue_skb(sk, data);
-	fo->copied = data->len;
-
-	/* syn_data is about to be sent, we need to take current time stamps
-	 * for the packets that are in write queue : SYN packet and DATA
-	 */
-	skb_mstamp_get(&syn->skb_mstamp);
-	data->skb_mstamp = syn->skb_mstamp;
+	syn->skb_mstamp = syn_data->skb_mstamp;
 
-	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+	/* Now full SYN+DATA was cloned and sent (or not),
+	 * remove the SYN from the original skb (syn_data)
+	 * we keep in write queue in case of a retransmit, as we
+	 * also have the SYN packet (with no data) in the same queue.
+	 */
+	TCP_SKB_CB(syn_data)->seq++;
+	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+	if (!err) {
 		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
-	syn_data = NULL;
 
 fallback:
 	/* Send a regular SYN with Fast Open cookie request option */
@@ -3081,7 +3109,6 @@ fallback:
 	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
 	if (err)
 		tp->syn_fastopen = 0;
-	kfree_skb(syn_data);
 done:
 	fo->cookie.len = -1;	/* Exclude Fast Open option for SYN retries */
 	return err;
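
Note: the rewritten tcp_send_syn_data() no longer builds two skbs (a SYN+data clone plus a separate data-only copy). It allocates one skb, copies the payload from the msg_iter iovec, queues it, transmits it once as SYN+data, then relabels the queued skb in place: the sequence number is advanced past the one the SYN consumed and the SYN flag is dropped, so a retransmit uses the pure SYN skb plus this data-only segment. A toy illustration of that bookkeeping, with made-up flag bits and sequence numbers:

#include <assert.h>
#include <stdio.h>

#define F_SYN 0x1
#define F_ACK 0x2
#define F_PSH 0x4

struct seg { unsigned int seq, len, flags; };

int main(void)
{
        /* queued once: the first transmission carries the SYN flag plus data */
        struct seg syn_data = { .seq = 1000, .len = 100,
                                .flags = F_SYN | F_PSH };

        /* ... transmit happens here ... */

        /* relabel for retransmit: the SYN used up sequence number 1000,
         * so the queued copy becomes a data-only segment at 1001 */
        syn_data.seq++;
        syn_data.flags = F_ACK | F_PSH;

        assert(syn_data.seq == 1001);
        printf("retransmit unit: seq=%u len=%u flags=%#x\n",
               syn_data.seq, syn_data.len, syn_data.flags);
        return 0;
}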
@@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk)
 		return 0;
 	}
 
-	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
-	if (unlikely(buff == NULL))
+	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+	if (unlikely(!buff))
 		return -ENOBUFS;
 
-	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_TCP_HEADER);
-
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	tp->retrans_stamp = tcp_time_stamp;
 	tcp_connect_queue_skb(sk, buff);