Diffstat (limited to 'net/ipv4/tcp_output.c')
 net/ipv4/tcp_output.c | 156 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 90 insertions(+), 66 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3af21296d967..65caf8b95e17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+		       tcp_ca_needs_ecn(sk);
+
+	if (!use_ecn) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+			use_ecn = true;
+	}
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-	    tcp_ca_needs_ecn(sk)) {
+
+	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk))
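
Read as plain logic, the hunk above makes the SYN request ECN when the sysctl is set to 1, when the congestion-control module needs ECN, or when the destination route carries the RTAX_FEATURE_ECN metric. A minimal userspace sketch of that decision, with the kernel lookups replaced by plain parameters (syn_wants_ecn and its arguments are illustrative names, not kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the use_ecn computation added to tcp_ecn_send_syn():
 * request ECN on the SYN if the sysctl enables it globally, the
 * congestion-control module needs it, or the route opts in.
 */
static bool syn_wants_ecn(int sysctl_tcp_ecn, bool ca_needs_ecn,
			  bool route_feature_ecn)
{
	bool use_ecn = sysctl_tcp_ecn == 1 || ca_needs_ecn;

	if (!use_ecn && route_feature_ecn)
		use_ecn = true;

	return use_ecn;
}

int main(void)
{
	printf("%d\n", syn_wants_ecn(0, false, true));	/* route opts in -> 1 */
	printf("%d\n", syn_wants_ecn(0, false, false));	/* nothing set  -> 0 */
	return 0;
}
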
@@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+	u32 bytes, segs;
+
+	bytes = min(sk->sk_pacing_rate >> 10,
+		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+	/* Goal is to send at least one packet per ms,
+	 * not one big TSO packet every 100 ms.
+	 * This preserves ACK clocking and is consistent
+	 * with tcp_tso_should_defer() heuristic.
+	 */
+	segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+	return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
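
A rough worked reading of the sizing above: sk_pacing_rate >> 10 is approximately the number of bytes the flow may send in one millisecond, so the burst is sized to about 1 ms of data, floored at sysctl_tcp_min_tso_segs and capped by the device GSO limits. The standalone sketch below redoes the same arithmetic in userspace; MAX_TCP_HEADER_GUESS and the sample rates are illustrative assumptions, not values taken from this tree:

#include <stdint.h>
#include <stdio.h>

#define MAX_TCP_HEADER_GUESS 320	/* illustrative; real MAX_TCP_HEADER is config dependent */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Same arithmetic as tcp_tso_autosize(): size TSO bursts to roughly
 * one packet per ms at the current pacing rate, floored at
 * sysctl_tcp_min_tso_segs and capped at the device gso_max_segs.
 */
static uint32_t tso_autosize(uint64_t pacing_rate, uint32_t gso_max_size,
			     uint32_t gso_max_segs, uint32_t min_tso_segs,
			     uint32_t mss_now)
{
	uint32_t bytes = min_u32((uint32_t)(pacing_rate >> 10),
				 gso_max_size - 1 - MAX_TCP_HEADER_GUESS);
	uint32_t segs = max_u32(bytes / mss_now, min_tso_segs);

	return min_u32(segs, gso_max_segs);
}

int main(void)
{
	/* ~10 Gbit/s pacing: burst capped near gso_max_size (45 segs of 1448 bytes) */
	printf("%u\n", tso_autosize(1250000000ULL, 65536, 65535, 2, 1448));
	/* ~10 Mbit/s pacing: less than one MSS per ms, so the 2-seg floor wins */
	printf("%u\n", tso_autosize(1250000ULL, 65536, 65535, 2, 1448));
	return 0;
}
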
@@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
 					 const struct sk_buff *skb)
 {
-	u32 in_flight, cwnd;
+	u32 in_flight, cwnd, halfcwnd;
 
 	/* Don't be strict about the congestion window for the final FIN. */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
@@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
 
 	in_flight = tcp_packets_in_flight(tp);
 	cwnd = tp->snd_cwnd;
-	if (in_flight < cwnd)
-		return (cwnd - in_flight);
+	if (in_flight >= cwnd)
+		return 0;
 
-	return 0;
+	/* For better scheduling, ensure we have at least
+	 * 2 GSO packets in flight.
+	 */
+	halfcwnd = max(cwnd >> 1, 1U);
+	return min(halfcwnd, cwnd - in_flight);
 }
 
 /* Initialize TSO state of a skb.
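
The halfcwnd clamp above changes how much of the congestion window a single send may consume: at most half of cwnd (but at least one segment), so that at least two GSO bursts can be in flight. A small standalone rendering of the new quota arithmetic, with the FIN special case dropped (cwnd_quota here is an illustrative userspace helper, not the kernel function):

#include <stdint.h>
#include <stdio.h>

/* New congestion-window quota: never hand out more than half the
 * current cwnd at once, so at least two GSO packets stay in flight.
 */
static uint32_t cwnd_quota(uint32_t cwnd, uint32_t in_flight)
{
	uint32_t halfcwnd;

	if (in_flight >= cwnd)
		return 0;

	halfcwnd = cwnd >> 1 ? cwnd >> 1 : 1;	/* max(cwnd >> 1, 1U) */
	return halfcwnd < cwnd - in_flight ? halfcwnd : cwnd - in_flight;
}

int main(void)
{
	printf("%u\n", cwnd_quota(10, 0));	/* old code: 10, new code: 5 */
	printf("%u\n", cwnd_quota(10, 8));	/* 2 either way */
	printf("%u\n", cwnd_quota(1, 0));	/* 1: halfcwnd floored at 1 */
	return 0;
}
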
@@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited)
+				 bool *is_cwnd_limited, u32 max_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   tp->xmit_size_goal_segs * tp->mss_cache))
+	if (limit >= max_segs * tp->mss_cache)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	int cwnd_quota;
 	int result;
 	bool is_cwnd_limited = false;
+	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		}
 	}
 
+	max_segs = tcp_tso_autosize(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
@@ -1984,17 +2019,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
 
-		if (tso_segs == 1) {
+		if (tso_segs == 1 || !max_segs) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
 						     (tcp_skb_is_last(sk, skb) ?
 						      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
 			if (!push_one &&
-			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 max_segs))
 				break;
 		}
 
+		limit = mss_now;
+		if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
 		/* TCP Small Queues :
 		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 		 * This allows for :
@@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * of queued bytes to ensure line rate.
 		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-			      sk->sk_pacing_rate >> 10);
+		limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
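
The reworked small-queue limit reads as: allow roughly 1 ms of data at the current pacing rate, but never less than two skb truesizes (so slow flows can still keep two packets queued) and never more than the tcp_limit_output_bytes sysctl. A hedged userspace sketch of that arithmetic; the truesize, rate, and sysctl values below are sample inputs, not defaults asserted from this tree:

#include <stdint.h>
#include <stdio.h>

/* New TSQ limit: ~1 ms of data at the pacing rate, floored at two
 * skb truesizes, and capped by the tcp_limit_output_bytes sysctl.
 */
static uint32_t tsq_limit(uint32_t skb_truesize, uint64_t pacing_rate,
			  uint32_t limit_output_bytes)
{
	uint32_t limit = 2 * skb_truesize;
	uint32_t ms_worth = (uint32_t)(pacing_rate >> 10);

	if (ms_worth > limit)
		limit = ms_worth;
	if (limit > limit_output_bytes)
		limit = limit_output_bytes;
	return limit;
}

int main(void)
{
	/* fast flow: the sysctl cap wins (sample cap of 128 KB) */
	printf("%u\n", tsq_limit(2304, 1250000000ULL, 131072));
	/* slow flow: the 2 * truesize floor wins */
	printf("%u\n", tsq_limit(2304, 100000ULL, 131072));
	return 0;
}
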
@@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
-		limit = mss_now;
-		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  sk->sk_gso_max_segs),
-						    nonagle);
-
-		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-			break;
-
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
 			break;
 
@@ -2126,7 +2162,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 static bool skb_still_in_host_queue(const struct sock *sk,
 				    const struct sk_buff *skb)
 {
-	if (unlikely(skb_fclone_busy(skb))) {
+	if (unlikely(skb_fclone_busy(sk, skb))) {
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 		return true;
@@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
-	struct sk_buff *syn_data = NULL, *data;
+	int syn_loss = 0, space, err = 0;
 	unsigned long last_syn_loss = 0;
+	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;	/* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
@@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	/* limit to order-0 allocations */
 	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
 
-	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
-				   sk->sk_allocation);
-	if (syn_data == NULL)
+	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+	if (!syn_data)
+		goto fallback;
+	syn_data->ip_summed = CHECKSUM_PARTIAL;
+	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+					 fo->data->msg_iter.iov, 0, space))) {
+		kfree_skb(syn_data);
 		goto fallback;
+	}
 
-	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
-		struct iovec *iov = &fo->data->msg_iov[i];
-		unsigned char __user *from = iov->iov_base;
-		int len = iov->iov_len;
+	/* No more data pending in inet_wait_for_connect() */
+	if (space == fo->size)
+		fo->data = NULL;
+	fo->copied = space;
 
-		if (syn_data->len + len > space)
-			len = space - syn_data->len;
-		else if (i + 1 == iovlen)
-			/* No more data pending in inet_wait_for_connect() */
-			fo->data = NULL;
+	tcp_connect_queue_skb(sk, syn_data);
 
-		if (skb_add_data(syn_data, from, len))
-			goto fallback;
-	}
+	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	/* Queue a data-only packet after the regular SYN for retransmission */
-	data = pskb_copy(syn_data, sk->sk_allocation);
-	if (data == NULL)
-		goto fallback;
-	TCP_SKB_CB(data)->seq++;
-	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
-	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
-	tcp_connect_queue_skb(sk, data);
-	fo->copied = data->len;
-
-	/* syn_data is about to be sent, we need to take current time stamps
-	 * for the packets that are in write queue : SYN packet and DATA
-	 */
-	skb_mstamp_get(&syn->skb_mstamp);
-	data->skb_mstamp = syn->skb_mstamp;
+	syn->skb_mstamp = syn_data->skb_mstamp;
 
-	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+	/* Now full SYN+DATA was cloned and sent (or not),
+	 * remove the SYN from the original skb (syn_data)
+	 * we keep in write queue in case of a retransmit, as we
+	 * also have the SYN packet (with no data) in the same queue.
+	 */
+	TCP_SKB_CB(syn_data)->seq++;
+	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+	if (!err) {
 		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
-	syn_data = NULL;
 
 fallback:
 	/* Send a regular SYN with Fast Open cookie request option */
@@ -3081,7 +3109,6 @@ fallback:
 	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
 	if (err)
 		tp->syn_fastopen = 0;
-	kfree_skb(syn_data);
 done:
 	fo->cookie.len = -1;	/* Exclude Fast Open option for SYN retries */
 	return err;
@@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk)
 		return 0;
 	}
 
-	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
-	if (unlikely(buff == NULL))
+	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+	if (unlikely(!buff))
 		return -ENOBUFS;
 
-	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_TCP_HEADER);
-
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	tp->retrans_stamp = tcp_time_stamp;
 	tcp_connect_queue_skb(sk, buff);