Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	126
1 file changed, 80 insertions, 46 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-	if (foc != NULL) {
+	if (foc != NULL && foc->len >= 0) {
 		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 		need = (need + 3) & ~3U;  /* Align to 32 bits */
 		if (remaining >= need) {
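
The hunk above only emits the experimental Fast Open option when the cookie length is valid (foc->len >= 0), and keeps charging it against the remaining SYN-ACK option space after rounding up to a 4-byte boundary. A minimal user-space sketch of that accounting follows; the 40-byte option-space cap and the 4-byte experimental-option header match the kernel constants, while the cookie length and the space already consumed by other options are invented example values.

/* Toy model of the SYN-ACK option-space check for the experimental
 * Fast Open option: a 4-byte header (kind, length, 16-bit magic) plus
 * the cookie, rounded up to a 32-bit boundary.  cookie_len and the
 * space already used by other options are example values only.
 */
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE	40	/* matches the kernel limit */
#define EXP_FASTOPEN_BASE	 4	/* kind + len + 16-bit magic */

int main(void)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - 20;	/* example */
	int cookie_len = 8;					/* example */

	if (cookie_len >= 0) {
		unsigned int need = EXP_FASTOPEN_BASE + cookie_len;

		need = (need + 3) & ~3U;	/* align to 32 bits */
		if (remaining >= need)
			remaining -= need;
		else
			printf("option does not fit, skipped\n");
	}
	printf("%u bytes of option space left\n", remaining);
	return 0;
}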
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 
 	if (clone_it) {
-		const struct sk_buff *fclone = skb + 1;
-
 		skb_mstamp_get(&skb->skb_mstamp);
 
-		if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-			     fclone->fclone == SKB_FCLONE_CLONE))
-			NET_INC_STATS(sock_net(sk),
-				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
 		else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-		 unsigned int mss_now)
+		 unsigned int mss_now, gfp_t gfp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_unclone(skb, GFP_ATOMIC))
+	if (skb_unclone(skb, gfp))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
-	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, nsize, gfp);
 	if (buff == NULL)
 		return -ENOMEM; /* We'll just try again later. */
 
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
 	return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Limited by application or receiver window. */
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
+		if (win_used < tp->snd_cwnd) {
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+		}
+		tp->snd_cwnd_used = 0;
+	}
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out >= tp->snd_cwnd) {
+	/* Track the maximum number of outstanding packets in each
+	 * window, and remember whether we were cwnd-limited then.
+	 */
+	if (!before(tp->snd_una, tp->max_packets_seq) ||
+	    tp->packets_out > tp->max_packets_out) {
+		tp->max_packets_out = tp->packets_out;
+		tp->max_packets_seq = tp->snd_nxt;
+		tp->is_cwnd_limited = is_cwnd_limited;
+	}
+
+	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
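
Two things happen in the hunk above: tcp_cwnd_validate() now records, per window, the peak number of packets in flight and whether the flow was cwnd-limited at that point (later consumed through tcp_is_cwnd_limited()), and the RFC 2861 "application limited" adjustment lives here as a static helper that pulls cwnd halfway toward what was actually used once the window went unfilled. A rough user-space sketch of that shrink rule, with hypothetical values:

/* Sketch of the RFC 2861 "application limited" cwnd adjustment:
 * when less of the window was used than allowed, pull cwnd halfway
 * toward the used amount (never below the initial window).
 * All values below are hypothetical.
 */
#include <stdio.h>

static unsigned int app_limited_cwnd(unsigned int snd_cwnd,
				     unsigned int snd_cwnd_used,
				     unsigned int init_win)
{
	unsigned int win_used = snd_cwnd_used > init_win ? snd_cwnd_used
							 : init_win;

	if (win_used < snd_cwnd)
		snd_cwnd = (snd_cwnd + win_used) >> 1;
	return snd_cwnd;
}

int main(void)
{
	/* e.g. cwnd of 40 packets, only 12 ever in flight, initial window 10 */
	printf("new cwnd = %u\n", app_limited_cwnd(40, 12, 10));
	return 0;
}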
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data. */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now);
+		return tcp_fragment(sk, skb, len, mss_now, gfp);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if (!tp->tso_deferred)
 		tp->tso_deferred = 1 | (jiffies << 1);
 
+	if (cong_win < send_win && cong_win < skb->len)
+		*is_cwnd_limited = true;
+
 	return true;
 
 send_now:
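
With the extra out-parameter, a deferred TSO send also reports whether the congestion window, rather than the peer's receive window or the amount of queued data, was the binding constraint; tcp_write_xmit() then feeds that into tcp_cwnd_validate(). A simplified sketch of the comparison: cong_win follows the kernel's (snd_cwnd - in_flight) * mss, while the remaining numbers are invented.

/* Sketch: decide whether a deferred TSO send was congestion-window
 * limited.  cwnd is the limiting factor if it is smaller than both the
 * peer's window and the burst we wanted to send.  Inputs are invented.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int mss = 1448;
	unsigned int send_win = 64 * 1024;		/* receiver window left */
	unsigned int in_flight = 8, snd_cwnd = 10;
	unsigned int cong_win = (snd_cwnd - in_flight) * mss;
	unsigned int skb_len = 16 * mss;		/* queued data */
	bool is_cwnd_limited = false;

	if (cong_win < send_win && cong_win < skb_len)
		is_cwnd_limited = true;

	printf("cong_win=%u send_win=%u -> cwnd limited: %s\n",
	       cong_win, send_win, is_cwnd_limited ? "yes" : "no");
	return 0;
}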
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
+	bool is_cwnd_limited = false;
 
 	sent_pkts = 0;
 
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
+			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
 				break;
 		}
 
@@ -1973,7 +2004,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+				    const struct sk_buff *skb)
+{
+	const struct sk_buff *fclone = skb + 1;
+
+	if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+		     fclone->fclone == SKB_FCLONE_CLONE)) {
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		return true;
+	}
+	return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
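
The new helper exploits the fast-clone layout: an skb allocated from the fclone cache and its clone come from one allocation, with the clone sitting directly behind the original, so skb + 1 reaches the companion and its fclone state reveals whether an earlier transmit is still sitting in a qdisc or driver queue. A toy user-space model of that adjacency check; the struct, the states and the two-element array stand in for the kernel's fclone cache and are purely illustrative.

/* Toy model of the fast-clone check: the "original" and its clone are
 * adjacent in memory (index 0 and 1 of one allocation), so original + 1
 * reaches the clone, whose state says whether it is still in flight.
 * Everything here is invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

enum toy_fclone_state { TOY_FCLONE_UNAVAILABLE, TOY_FCLONE_ORIG, TOY_FCLONE_CLONE };

struct toy_skb {
	enum toy_fclone_state fclone;
	/* headers, payload, ... */
};

static bool toy_still_in_host_queue(const struct toy_skb *skb)
{
	const struct toy_skb *fclone = skb + 1;	/* companion clone */

	return skb->fclone == TOY_FCLONE_ORIG &&
	       fclone->fclone == TOY_FCLONE_CLONE;
}

int main(void)
{
	/* one allocation holding the original and its (still queued) clone */
	struct toy_skb pair[2] = {
		{ .fclone = TOY_FCLONE_ORIG  },
		{ .fclone = TOY_FCLONE_CLONE },
	};

	printf("retransmit now would be pointless: %s\n",
	       toy_still_in_host_queue(&pair[0]) ? "yes" : "no");
	return 0;
}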
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb))
 		goto rearm_timer;
 
+	if (skb_still_in_host_queue(sk, skb))
+		goto rearm_timer;
+
 	pcount = tcp_skb_pcount(skb);
 	if (WARN_ON(!pcount))
 		goto rearm_timer;
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+					  GFP_ATOMIC)))
 			goto rearm_timer;
 		skb = tcp_write_queue_tail(sk);
 	}
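
Besides skipping the probe when the previous copy is still queued locally, the loss-probe path now passes GFP_ATOMIC explicitly since tcp_fragment() takes a gfp argument. The fragmentation cut keeps (pcount - 1) * mss bytes in the first part so only the final segment is retransmitted as the probe; a small sketch of that arithmetic with made-up numbers:

/* Sketch of the TLP split: a tail skb covering several segments is
 * cut at (pcount - 1) * mss so the probe retransmits only the last
 * segment.  Numbers are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned int mss = 1448;
	unsigned int skb_len = 3 * mss + 200;			/* tail skb */
	unsigned int pcount = (skb_len + mss - 1) / mss;	/* segments */

	if (pcount > 1 && skb_len > (pcount - 1) * mss) {
		unsigned int split = (pcount - 1) * mss;

		printf("keep %u bytes, probe carries %u bytes\n",
		       split, skb_len - split);
	}
	return 0;
}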
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	/* Probe with zero data doesn't trigger fast recovery. */
-	if (skb->len > 0)
-		err = __tcp_retransmit_skb(sk, skb);
+	err = __tcp_retransmit_skb(sk, skb);
 
 	/* Record snd_nxt for loss detection. */
 	if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
 		return -EAGAIN;
 
+	if (skb_still_in_host_queue(sk, skb))
+		return -EBUSY;
+
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
 		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
 			BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		return -EAGAIN;
 
 	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		 * see tcp_input.c tcp_sacktag_write_queue().
 		 */
 		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-	} else {
+	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 	return err;
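
Because __tcp_retransmit_skb() now returns -EBUSY when an earlier copy is still in a host queue, the caller stops counting that case toward LINUX_MIB_TCPRETRANSFAIL, keeping the counter meaningful. A tiny sketch of that caller-side distinction; the function and the counter below are invented.

/* Sketch of the caller-side handling: -EBUSY ("prior copy still in a
 * local queue") is not a genuine retransmit failure and is not counted.
 * The helper name and the counter are invented.
 */
#include <errno.h>
#include <stdio.h>

static unsigned long retrans_fail_count;

static void account_retransmit(int err)
{
	/* success needs no failure accounting; -EBUSY means a prior copy
	 * is still queued locally, so it is not a genuine failure either */
	if (err && err != -EBUSY)
		retrans_fail_count++;
}

int main(void)
{
	account_retransmit(0);		/* success */
	account_retransmit(-EBUSY);	/* skipped: copy still queued */
	account_retransmit(-ENOMEM);	/* counted */

	printf("retransmit failures: %lu\n", retrans_fail_count);
	return 0;
}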
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
 		mss = tp->rx_opt.user_mss;
 
-	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-		__u8 rcv_wscale;
-		/* Set this up on the first call only */
-		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-		/* limit the window selection if the user enforce a smaller rx buffer */
-		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-			req->window_clamp = tcp_full_space(sk);
-
-		/* tcp_full_space because it is guaranteed to be the first packet */
-		tcp_select_initial_window(tcp_full_space(sk),
-			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-			&req->rcv_wnd,
-			&req->window_clamp,
-			ireq->wscale_ok,
-			&rcv_wscale,
-			dst_metric(dst, RTAX_INITRWND));
-		ireq->rcv_wscale = rcv_wscale;
-	}
-
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss))
+			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb, mss);