path: root/net/ipv4/tcp_output.c
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-12 17:27:40 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-06-12 17:27:40 -0400
commit	f9da455b93f6ba076935b4ef4589f61e529ae046 (patch)
tree	3c4e69ce1ba1d6bf65915b97a76ca2172105b278	/net/ipv4/tcp_output.c
parent	0e04c641b199435f3779454055f6a7de258ecdfc (diff)
parent	e5eca6d41f53db48edd8cf88a3f59d2c30227f8e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) Seccomp BPF filters can now be JIT'd, from Alexei Starovoitov.
 2) Multiqueue support in xen-netback and xen-netfront, from Andrew J Benniston.
 3) Allow tweaking of aggregation settings in cdc_ncm driver, from Bjørn Mork.
 4) BPF now has a "random" opcode, from Chema Gonzalez.
 5) Add more BPF documentation and improve test framework, from Daniel Borkmann.
 6) Support TCP fastopen over ipv6, from Daniel Lee.
 7) Add software TSO helper functions and use them to support software TSO in mvneta and mv643xx_eth drivers. From Ezequiel Garcia.
 8) Support software TSO in fec driver too, from Nimrod Andy.
 9) Add Broadcom SYSTEMPORT driver, from Florian Fainelli.
10) Handle broadcasts more gracefully over macvlan when there are large numbers of interfaces configured, from Herbert Xu.
11) Allow more control over fwmark used for non-socket based responses, from Lorenzo Colitti.
12) Do TCP congestion window limiting based upon measurements, from Neal Cardwell.
13) Support busy polling in SCTP, from Neal Horman.
14) Allow RSS key to be configured via ethtool, from Venkata Duvvuru.
15) Bridge promisc mode handling improvements from Vlad Yasevich.
16) Don't use inetpeer entries to implement ID generation any more, it performs poorly, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1522 commits)
  rtnetlink: fix userspace API breakage for iproute2 < v3.9.0
  tcp: fixing TLP's FIN recovery
  net: fec: Add software TSO support
  net: fec: Add Scatter/gather support
  net: fec: Increase buffer descriptor entry number
  net: fec: Factorize feature setting
  net: fec: Enable IP header hardware checksum
  net: fec: Factorize the .xmit transmit function
  bridge: fix compile error when compiling without IPv6 support
  bridge: fix smatch warning / potential null pointer dereference
  via-rhine: fix full-duplex with autoneg disable
  bnx2x: Enlarge the dorq threshold for VFs
  bnx2x: Check for UNDI in uncommon branch
  bnx2x: Fix 1G-baseT link
  bnx2x: Fix link for KR with swapped polarity lane
  sctp: Fix sk_ack_backlog wrap-around problem
  net/core: Add VF link state control policy
  net/fsl: xgmac_mdio is dependent on OF_MDIO
  net/fsl: Make xgmac_mdio read error message useful
  net_sched: drr: warn when qdisc is not work conserving
  ...
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c | 126
 1 file changed, 80 insertions(+), 46 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-	if (foc != NULL) {
+	if (foc != NULL && foc->len >= 0) {
 		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 		need = (need + 3) & ~3U;  /* Align to 32 bits */
 		if (remaining >= need) {
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 
 	if (clone_it) {
-		const struct sk_buff *fclone = skb + 1;
-
 		skb_mstamp_get(&skb->skb_mstamp);
 
-		if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-			     fclone->fclone == SKB_FCLONE_CLONE))
-			NET_INC_STATS(sock_net(sk),
-				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
 		else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-		 unsigned int mss_now)
+		 unsigned int mss_now, gfp_t gfp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_unclone(skb, GFP_ATOMIC))
+	if (skb_unclone(skb, gfp))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
-	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, nsize, gfp);
 	if (buff == NULL)
 		return -ENOMEM; /* We'll just try again later. */
 
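
[Editor's note] The tcp_fragment() change above threads the caller's allocation context through instead of hard-coding GFP_ATOMIC. Below is a minimal standalone sketch of that pattern, not kernel code: the alloc_ctx enum and the buf_alloc()/fragment() names are illustrative stand-ins for gfp_t and the kernel helpers.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for gfp_t: the caller says whether it may block. */
enum alloc_ctx { ALLOC_ATOMIC, ALLOC_MAY_SLEEP };

static void *buf_alloc(size_t len, enum alloc_ctx ctx)
{
	/* Both paths call malloc() here; in the kernel the flag selects
	 * between atomic and sleeping allocation behaviour.
	 */
	(void)ctx;
	return malloc(len);
}

/* Split "len" bytes at offset "at"; the allocation policy comes from the
 * call site, mirroring the new gfp argument of tcp_fragment().
 */
static int fragment(const char *data, size_t len, size_t at, enum alloc_ctx ctx)
{
	char *tail;

	if (at >= len)
		return -1;
	tail = buf_alloc(len - at, ctx);
	if (!tail)
		return -1;	/* caller will simply try again later */
	memcpy(tail, data + at, len - at);
	free(tail);
	return 0;
}

int main(void)
{
	const char payload[] = "0123456789";

	/* Timer/softirq-like caller: must not sleep. */
	printf("%d\n", fragment(payload, sizeof(payload) - 1, 4, ALLOC_ATOMIC));
	/* Process-context caller: may sleep while allocating. */
	printf("%d\n", fragment(payload, sizeof(payload) - 1, 4, ALLOC_MAY_SLEEP));
	return 0;
}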
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
 	return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Limited by application or receiver window. */
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
+		if (win_used < tp->snd_cwnd) {
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+		}
+		tp->snd_cwnd_used = 0;
+	}
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out >= tp->snd_cwnd) {
+	/* Track the maximum number of outstanding packets in each
+	 * window, and remember whether we were cwnd-limited then.
+	 */
+	if (!before(tp->snd_una, tp->max_packets_seq) ||
+	    tp->packets_out > tp->max_packets_out) {
+		tp->max_packets_out = tp->packets_out;
+		tp->max_packets_seq = tp->snd_nxt;
+		tp->is_cwnd_limited = is_cwnd_limited;
+	}
+
+	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
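
[Editor's note] The rewritten tcp_cwnd_validate() above records, per window of data, the largest packets_out observed and whether the sender was cwnd-limited at that moment. Here is a minimal userspace sketch of that update rule under the same wrap-safe sequence comparison; the field names mirror the kernel's, but the cwnd_tracker struct itself is hypothetical and this is not kernel code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cwnd_tracker {
	uint32_t snd_una;          /* oldest unacknowledged sequence      */
	uint32_t snd_nxt;          /* next sequence to be sent            */
	uint32_t max_packets_seq;  /* snd_nxt when the max was recorded   */
	uint32_t max_packets_out;  /* largest packets_out in this window  */
	bool     is_cwnd_limited;
};

/* Wrap-safe "before" on 32-bit sequence numbers, as TCP uses. */
static bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static void track_cwnd_limited(struct cwnd_tracker *t, uint32_t packets_out,
			       bool is_cwnd_limited)
{
	/* Start a new measurement once the previous window is fully acked,
	 * or update it when packets_out reaches a new maximum.
	 */
	if (!seq_before(t->snd_una, t->max_packets_seq) ||
	    packets_out > t->max_packets_out) {
		t->max_packets_out = packets_out;
		t->max_packets_seq = t->snd_nxt;
		t->is_cwnd_limited = is_cwnd_limited;
	}
}

int main(void)
{
	struct cwnd_tracker t = { .snd_una = 1000, .snd_nxt = 16000 };

	track_cwnd_limited(&t, 10, true);   /* records max = 10, limited   */
	t.snd_nxt = 20000;
	track_cwnd_limited(&t, 4, false);   /* window not done: max stays  */
	printf("max_packets_out=%u limited=%d\n",
	       t.max_packets_out, t.is_cwnd_limited);
	return 0;
}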
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data.  */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now);
+		return tcp_fragment(sk, skb, len, mss_now, gfp);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if (!tp->tso_deferred)
 		tp->tso_deferred = 1 | (jiffies << 1);
 
+	if (cong_win < send_win && cong_win < skb->len)
+		*is_cwnd_limited = true;
+
 	return true;
 
 send_now:
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
+	bool is_cwnd_limited = false;
 
 	sent_pkts = 0;
 
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
+			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						  nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
 				break;
 		}
 
@@ -1973,7 +2004,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+				    const struct sk_buff *skb)
+{
+	const struct sk_buff *fclone = skb + 1;
+
+	if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+		     fclone->fclone == SKB_FCLONE_CLONE)) {
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		return true;
+	}
+	return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
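
[Editor's note] skb_still_in_host_queue() above leans on the layout of fast clones: the clone is carved out of the same allocation, immediately after its parent, so "skb + 1" reaches it and its state reveals whether an earlier transmit is still sitting in a qdisc or driver queue. Below is a simplified standalone model of that check, assuming a fake two-element layout rather than real sk_buffs; struct fake_skb and still_in_host_queue() are illustrative names, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

enum fclone_state { FCLONE_UNAVAILABLE, FCLONE_ORIG, FCLONE_CLONE };

struct fake_skb {
	enum fclone_state fclone;
	/* payload fields elided */
};

static bool still_in_host_queue(const struct fake_skb *skb)
{
	const struct fake_skb *fclone = skb + 1;  /* companion clone slot */

	return skb->fclone == FCLONE_ORIG && fclone->fclone == FCLONE_CLONE;
}

int main(void)
{
	/* Pair allocated back-to-back: [original][clone], as fast clones are. */
	struct fake_skb pair[2] = {
		{ .fclone = FCLONE_ORIG },
		{ .fclone = FCLONE_CLONE },   /* clone still live in a queue */
	};

	printf("in host queue: %d\n", still_in_host_queue(&pair[0]));

	pair[1].fclone = FCLONE_UNAVAILABLE;  /* clone freed after transmit */
	printf("in host queue: %d\n", still_in_host_queue(&pair[0]));
	return 0;
}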
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb))
 		goto rearm_timer;
 
+	if (skb_still_in_host_queue(sk, skb))
+		goto rearm_timer;
+
 	pcount = tcp_skb_pcount(skb);
 	if (WARN_ON(!pcount))
 		goto rearm_timer;
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+					  GFP_ATOMIC)))
 			goto rearm_timer;
 		skb = tcp_write_queue_tail(sk);
 	}
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	/* Probe with zero data doesn't trigger fast recovery. */
-	if (skb->len > 0)
-		err = __tcp_retransmit_skb(sk, skb);
+	err = __tcp_retransmit_skb(sk, skb);
 
 	/* Record snd_nxt for loss detection. */
 	if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
 		return -EAGAIN;
 
+	if (skb_still_in_host_queue(sk, skb))
+		return -EBUSY;
+
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
 		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
 			BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		return -EAGAIN;
 
 	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		 * see tcp_input.c tcp_sacktag_write_queue().
 		 */
 		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-	} else {
+	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 	return err;
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
 		mss = tp->rx_opt.user_mss;
 
-	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-		__u8 rcv_wscale;
-		/* Set this up on the first call only */
-		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-		/* limit the window selection if the user enforce a smaller rx buffer */
-		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-			req->window_clamp = tcp_full_space(sk);
-
-		/* tcp_full_space because it is guaranteed to be the first packet */
-		tcp_select_initial_window(tcp_full_space(sk),
-			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-			&req->rcv_wnd,
-			&req->window_clamp,
-			ireq->wscale_ok,
-			&rcv_wscale,
-			dst_metric(dst, RTAX_INITRWND));
-		ireq->rcv_wscale = rcv_wscale;
-	}
-
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss))
+			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb, mss);