Diffstat (limited to 'net/ipv4/tcp_output.c')
 -rw-r--r--  net/ipv4/tcp_output.c | 126
 1 file changed, 80 insertions, 46 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
                 if (unlikely(!ireq->tstamp_ok))
                         remaining -= TCPOLEN_SACKPERM_ALIGNED;
         }
-        if (foc != NULL) {
+        if (foc != NULL && foc->len >= 0) {
                 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
                 need = (need + 3) & ~3U;  /* Align to 32 bits */
                 if (remaining >= need) {
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         BUG_ON(!skb || !tcp_skb_pcount(skb));
 
         if (clone_it) {
-                const struct sk_buff *fclone = skb + 1;
-
                 skb_mstamp_get(&skb->skb_mstamp);
 
-                if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-                             fclone->fclone == SKB_FCLONE_CLONE))
-                        NET_INC_STATS(sock_net(sk),
-                                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-
                 if (unlikely(skb_cloned(skb)))
                         skb = pskb_copy(skb, gfp_mask);
                 else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-                 unsigned int mss_now)
+                 unsigned int mss_now, gfp_t gfp)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         if (nsize < 0)
                 nsize = 0;
 
-        if (skb_unclone(skb, GFP_ATOMIC))
+        if (skb_unclone(skb, gfp))
                 return -ENOMEM;
 
         /* Get a new skb... force flag on. */
-        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+        buff = sk_stream_alloc_skb(sk, nsize, gfp);
         if (buff == NULL)
                 return -ENOMEM; /* We'll just try again later. */
 
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
         return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+                /* Limited by application or receiver window. */
+                u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+                u32 win_used = max(tp->snd_cwnd_used, init_win);
+                if (win_used < tp->snd_cwnd) {
+                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
+                        tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+                }
+                tp->snd_cwnd_used = 0;
+        }
+        tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
         struct tcp_sock *tp = tcp_sk(sk);
 
-        if (tp->packets_out >= tp->snd_cwnd) {
+        /* Track the maximum number of outstanding packets in each
+         * window, and remember whether we were cwnd-limited then.
+         */
+        if (!before(tp->snd_una, tp->max_packets_seq) ||
+            tp->packets_out > tp->max_packets_out) {
+                tp->max_packets_out = tp->packets_out;
+                tp->max_packets_seq = tp->snd_nxt;
+                tp->is_cwnd_limited = is_cwnd_limited;
+        }
+
+        if (tcp_is_cwnd_limited(sk)) {
                 /* Network is feed fully. */
                 tp->snd_cwnd_used = 0;
                 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
         /* All of a TSO frame must be composed of paged data. */
         if (skb->len != skb->data_len)
-                return tcp_fragment(sk, skb, len, mss_now);
+                return tcp_fragment(sk, skb, len, mss_now, gfp);
 
         buff = sk_stream_alloc_skb(sk, 0, gfp);
         if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+                                 bool *is_cwnd_limited)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
         if (!tp->tso_deferred)
                 tp->tso_deferred = 1 | (jiffies << 1);
 
+        if (cong_win < send_win && cong_win < skb->len)
+                *is_cwnd_limited = true;
+
         return true;
 
 send_now:
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
         unsigned int tso_segs, sent_pkts;
         int cwnd_quota;
         int result;
+        bool is_cwnd_limited = false;
 
         sent_pkts = 0;
 
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
                 cwnd_quota = tcp_cwnd_test(tp, skb);
                 if (!cwnd_quota) {
+                        is_cwnd_limited = true;
                         if (push_one == 2)
                                 /* Force out a loss probe pkt. */
                                 cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                       nonagle : TCP_NAGLE_PUSH))))
                                 break;
                 } else {
-                        if (!push_one && tcp_tso_should_defer(sk, skb))
+                        if (!push_one &&
+                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
                                 break;
                 }
 
@@ -1973,7 +2004,7 @@ repair:
                 /* Send one loss probe per tail loss episode. */
                 if (push_one != 2)
                         tcp_schedule_loss_probe(sk);
-                tcp_cwnd_validate(sk);
+                tcp_cwnd_validate(sk, is_cwnd_limited);
                 return false;
         }
         return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+                                    const struct sk_buff *skb)
+{
+        const struct sk_buff *fclone = skb + 1;
+
+        if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+                     fclone->fclone == SKB_FCLONE_CLONE)) {
+                NET_INC_STATS_BH(sock_net(sk),
+                                 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                return true;
+        }
+        return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb))
                 goto rearm_timer;
 
+        if (skb_still_in_host_queue(sk, skb))
+                goto rearm_timer;
+
         pcount = tcp_skb_pcount(skb);
         if (WARN_ON(!pcount))
                 goto rearm_timer;
 
         if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+                                          GFP_ATOMIC)))
                         goto rearm_timer;
                 skb = tcp_write_queue_tail(sk);
         }
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                 goto rearm_timer;
 
-        /* Probe with zero data doesn't trigger fast recovery. */
-        if (skb->len > 0)
-                err = __tcp_retransmit_skb(sk, skb);
+        err = __tcp_retransmit_skb(sk, skb);
 
         /* Record snd_nxt for loss detection. */
         if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
             min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
                 return -EAGAIN;
 
+        if (skb_still_in_host_queue(sk, skb))
+                return -EBUSY;
+
         if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                         BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                 return -EAGAIN;
 
         if (skb->len > cur_mss) {
-                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+                if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
                 int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                  * see tcp_input.c tcp_sacktag_write_queue().
                  */
                 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-        } else {
+        } else if (err != -EBUSY) {
                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
         }
         return err;
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
                 mss = tp->rx_opt.user_mss;
 
-        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-                __u8 rcv_wscale;
-                /* Set this up on the first call only */
-                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-                /* limit the window selection if the user enforce a smaller rx buffer */
-                if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-                    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-                        req->window_clamp = tcp_full_space(sk);
-
-                /* tcp_full_space because it is guaranteed to be the first packet */
-                tcp_select_initial_window(tcp_full_space(sk),
-                        mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-                        &req->rcv_wnd,
-                        &req->window_clamp,
-                        ireq->wscale_ok,
-                        &rcv_wscale,
-                        dst_metric(dst, RTAX_INITRWND));
-                ireq->rcv_wscale = rcv_wscale;
-        }
-
         memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
         if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
                     skb->len > mss) {
                         seg_size = min(seg_size, mss);
                         TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                        if (tcp_fragment(sk, skb, seg_size, mss))
+                        if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
                                 return -1;
                 } else if (!tcp_skb_pcount(skb))
                         tcp_set_skb_tso_segs(sk, skb, mss);
