Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 126
1 file changed, 80 insertions(+), 46 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
                if (unlikely(!ireq->tstamp_ok))
                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }
-       if (foc != NULL) {
+       if (foc != NULL && foc->len >= 0) {
                u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
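
Note: the added foc->len >= 0 test lets the SYN-ACK carry the experimental Fast Open option even when the cookie is zero length (a bare cookie request from the client); a negative length is presumably still the "no cookie to send" case and keeps the option out. A stand-alone user-space sketch of the option-space rounding used above, assuming TCPOLEN_EXP_FASTOPEN_BASE is the 4-byte kind/length/magic prefix of the experimental option:

/* Sketch only, not kernel code: rounding the Fast Open option up to a
 * 32-bit boundary, as in tcp_synack_options() above.
 */
#include <stdio.h>

#define TCPOLEN_EXP_FASTOPEN_BASE 4     /* assumed: kind + len + 2 magic bytes */

static unsigned int fastopen_opt_len(unsigned int cookie_len)
{
        unsigned int need = TCPOLEN_EXP_FASTOPEN_BASE + cookie_len;

        return (need + 3) & ~3U;        /* align to 32 bits */
}

int main(void)
{
        /* 0: bare cookie request; 8: a typical generated cookie */
        printf("%u %u\n", fastopen_opt_len(0), fastopen_opt_len(8));
        return 0;
}

Under those assumptions a zero-length cookie costs 4 bytes of option space and an 8-byte cookie costs 12, which is what the remaining >= need check budgets for.
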
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        BUG_ON(!skb || !tcp_skb_pcount(skb));
 
        if (clone_it) {
-               const struct sk_buff *fclone = skb + 1;
-
                skb_mstamp_get(&skb->skb_mstamp);
 
-               if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-                            fclone->fclone == SKB_FCLONE_CLONE))
-                       NET_INC_STATS(sock_net(sk),
-                                     LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-
                if (unlikely(skb_cloned(skb)))
                        skb = pskb_copy(skb, gfp_mask);
                else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-                unsigned int mss_now)
+                unsigned int mss_now, gfp_t gfp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
        if (nsize < 0)
                nsize = 0;
 
-       if (skb_unclone(skb, GFP_ATOMIC))
+       if (skb_unclone(skb, gfp))
                return -ENOMEM;
 
        /* Get a new skb... force flag on. */
-       buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+       buff = sk_stream_alloc_skb(sk, nsize, gfp);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
 
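
Note: this hunk and the signature change above thread a gfp_t argument through tcp_fragment() so each caller picks the allocation context instead of the function always forcing GFP_ATOMIC. A rough user-space analog of that pattern; the names and types here are illustrative only, not kernel API:

/* Sketch: the caller, which knows whether it may sleep, passes the
 * allocation context down instead of the helper hard-coding it.
 */
#include <stdio.h>
#include <stdlib.h>

enum alloc_ctx { ALLOC_ATOMIC, ALLOC_MAY_SLEEP };

static void *frag_alloc(size_t size, enum alloc_ctx ctx)
{
        /* In the kernel this choice would select GFP_ATOMIC vs. a
         * sleepable mask; here both branches just call malloc(). */
        (void)ctx;
        return malloc(size);
}

static int fragment(size_t len, enum alloc_ctx ctx)
{
        void *buff = frag_alloc(len, ctx);

        if (!buff)
                return -1;      /* like -ENOMEM: caller retries later */
        free(buff);
        return 0;
}

int main(void)
{
        fragment(1460, ALLOC_ATOMIC);           /* e.g. a retransmit path */
        fragment(1460, ALLOC_MAY_SLEEP);        /* e.g. a caller that may sleep */
        return 0;
}

In the hunks below the existing callers simply pass GFP_ATOMIC, so their behaviour is unchanged; the flexibility only matters for callers such as tso_fragment() that forward their own gfp mask.
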
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
        return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+           sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+               /* Limited by application or receiver window. */
+               u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+               u32 win_used = max(tp->snd_cwnd_used, init_win);
+               if (win_used < tp->snd_cwnd) {
+                       tp->snd_ssthresh = tcp_current_ssthresh(sk);
+                       tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+               }
+               tp->snd_cwnd_used = 0;
+       }
+       tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
-       if (tp->packets_out >= tp->snd_cwnd) {
+       /* Track the maximum number of outstanding packets in each
+        * window, and remember whether we were cwnd-limited then.
+        */
+       if (!before(tp->snd_una, tp->max_packets_seq) ||
+           tp->packets_out > tp->max_packets_out) {
+               tp->max_packets_out = tp->packets_out;
+               tp->max_packets_seq = tp->snd_nxt;
+               tp->is_cwnd_limited = is_cwnd_limited;
+       }
+
+       if (tcp_is_cwnd_limited(sk)) {
                /* Network is feed fully. */
                tp->snd_cwnd_used = 0;
                tp->snd_cwnd_stamp = tcp_time_stamp;
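
Note: for reference, the application-limited branch above shrinks cwnd halfway toward the window that was actually used, never below the initial window returned by tcp_init_cwnd(). A stand-alone sketch of just that arithmetic; the initial window is hard-coded to an assumed 10 segments, whereas in the kernel it depends on the route:

/* Sketch of the adjustment in tcp_cwnd_application_limited() above. */
#include <stdio.h>

#define ASSUMED_INIT_CWND 10U   /* stand-in for tcp_init_cwnd() */

static unsigned int app_limited_cwnd(unsigned int snd_cwnd, unsigned int cwnd_used)
{
        unsigned int win_used = cwnd_used > ASSUMED_INIT_CWND ?
                                cwnd_used : ASSUMED_INIT_CWND;

        if (win_used < snd_cwnd)
                snd_cwnd = (snd_cwnd + win_used) >> 1;  /* shrink halfway */
        return snd_cwnd;
}

int main(void)
{
        /* cwnd of 100 segments, but the app never had more than 20 in flight */
        printf("%u\n", app_limited_cwnd(100, 20));      /* prints 60 */
        return 0;
}

Each further application-limited interval halves the remaining gap again, so an idle or receiver-limited connection gradually gives back the unused part of its window.
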
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* All of a TSO frame must be composed of paged data. */
        if (skb->len != skb->data_len)
-               return tcp_fragment(sk, skb, len, mss_now);
+               return tcp_fragment(sk, skb, len, mss_now, gfp);
 
        buff = sk_stream_alloc_skb(sk, 0, gfp);
        if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+                                bool *is_cwnd_limited)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
        if (!tp->tso_deferred)
                tp->tso_deferred = 1 | (jiffies << 1);
 
+       if (cong_win < send_win && cong_win < skb->len)
+               *is_cwnd_limited = true;
+
        return true;
 
 send_now:
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
        unsigned int tso_segs, sent_pkts;
        int cwnd_quota;
        int result;
+       bool is_cwnd_limited = false;
 
        sent_pkts = 0;
 
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
                cwnd_quota = tcp_cwnd_test(tp, skb);
                if (!cwnd_quota) {
+                       is_cwnd_limited = true;
                        if (push_one == 2)
                                /* Force out a loss probe pkt. */
                                cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                      nonagle : TCP_NAGLE_PUSH))))
                                break;
                } else {
-                       if (!push_one && tcp_tso_should_defer(sk, skb))
+                       if (!push_one &&
+                           tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
                                break;
                }
 
@@ -1973,7 +2004,7 @@ repair:
                /* Send one loss probe per tail loss episode. */
                if (push_one != 2)
                        tcp_schedule_loss_probe(sk);
-               tcp_cwnd_validate(sk);
+               tcp_cwnd_validate(sk, is_cwnd_limited);
                return false;
        }
        return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
        return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+                                    const struct sk_buff *skb)
+{
+       const struct sk_buff *fclone = skb + 1;
+
+       if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+                    fclone->fclone == SKB_FCLONE_CLONE)) {
+               NET_INC_STATS_BH(sock_net(sk),
+                                LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+               return true;
+       }
+       return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
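
Note: the helper above relies on how fast clones are laid out: when skb_clone() reuses the fastclone slot, the clone lives immediately after the original in the same allocation, so skb + 1 reaches it without any lookup, and its fclone state reverts once the qdisc or driver frees it, which is what makes the SKB_FCLONE_CLONE test a cheap "still queued" probe. A simplified user-space sketch of that paired-allocation idea, with made-up type and field names:

/* Toy model (not kernel code) of the fastclone layout. */
#include <stdio.h>
#include <stdlib.h>

enum toy_fclone { TOY_FCLONE_UNAVAILABLE, TOY_FCLONE_ORIG, TOY_FCLONE_CLONE };

struct toy_skb {
        enum toy_fclone fclone;
        /* payload metadata would live here */
};

static int still_in_host_queue(const struct toy_skb *skb)
{
        const struct toy_skb *fclone = skb + 1;         /* companion slot */

        return skb->fclone == TOY_FCLONE_ORIG &&
               fclone->fclone == TOY_FCLONE_CLONE;
}

int main(void)
{
        /* One allocation holds the original and its clone slot side by side. */
        struct toy_skb *pair = calloc(2, sizeof(*pair));

        if (!pair)
                return 1;
        pair[0].fclone = TOY_FCLONE_ORIG;
        pair[1].fclone = TOY_FCLONE_CLONE;              /* clone sits in a queue */
        printf("%d\n", still_in_host_queue(&pair[0]));  /* 1: skip the retransmit */

        pair[1].fclone = TOY_FCLONE_UNAVAILABLE;        /* clone was freed */
        printf("%d\n", still_in_host_queue(&pair[0]));  /* 0: safe to retransmit */
        free(pair);
        return 0;
}
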
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
        if (WARN_ON(!skb))
                goto rearm_timer;
 
+       if (skb_still_in_host_queue(sk, skb))
+               goto rearm_timer;
+
        pcount = tcp_skb_pcount(skb);
        if (WARN_ON(!pcount))
                goto rearm_timer;
 
        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+                                         GFP_ATOMIC)))
                        goto rearm_timer;
                skb = tcp_write_queue_tail(sk);
        }
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                goto rearm_timer;
 
-       /* Probe with zero data doesn't trigger fast recovery. */
-       if (skb->len > 0)
-               err = __tcp_retransmit_skb(sk, skb);
+       err = __tcp_retransmit_skb(sk, skb);
 
        /* Record snd_nxt for loss detection. */
        if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
                return -EAGAIN;
 
+       if (skb_still_in_host_queue(sk, skb))
+               return -EBUSY;
+
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                return -EAGAIN;
 
        if (skb->len > cur_mss) {
-               if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+               if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
                        return -ENOMEM; /* We'll try again later. */
        } else {
                int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                 * see tcp_input.c tcp_sacktag_write_queue().
                 */
                TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-       } else {
+       } else if (err != -EBUSY) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
        }
        return err;
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
                mss = tp->rx_opt.user_mss;
 
-       if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-               __u8 rcv_wscale;
-               /* Set this up on the first call only */
-               req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-               /* limit the window selection if the user enforce a smaller rx buffer */
-               if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-                   (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-                       req->window_clamp = tcp_full_space(sk);
-
-               /* tcp_full_space because it is guaranteed to be the first packet */
-               tcp_select_initial_window(tcp_full_space(sk),
-                       mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-                       &req->rcv_wnd,
-                       &req->window_clamp,
-                       ireq->wscale_ok,
-                       &rcv_wscale,
-                       dst_metric(dst, RTAX_INITRWND));
-               ireq->rcv_wscale = rcv_wscale;
-       }
-
        memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
        if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
                    skb->len > mss) {
                        seg_size = min(seg_size, mss);
                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                       if (tcp_fragment(sk, skb, seg_size, mss))
+                       if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
                                return -1;
                } else if (!tcp_skb_pcount(skb))
                        tcp_set_skb_tso_segs(sk, skb, mss);