Diffstat (limited to 'net/ipv4/tcp_output.c'):

 net/ipv4/tcp_output.c | 261 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 181 insertions(+), 80 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 17a11e65e57f..5a7c41fbc6d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                 tcp_rearm_rto(sk);
         }
+
+        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+                      tcp_skb_pcount(skb));
 }
 
 /* SND.NXT, if window was not shrunk.
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
 static u16 tcp_select_window(struct sock *sk)
 {
         struct tcp_sock *tp = tcp_sk(sk);
+        u32 old_win = tp->rcv_wnd;
         u32 cur_win = tcp_receive_window(tp);
         u32 new_win = __tcp_select_window(sk);
 
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
                  *
                  * Relax Will Robinson.
                  */
+                if (new_win == 0)
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPWANTZEROWINDOWADV);
                 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
         }
         tp->rcv_wnd = new_win;
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)
         new_win >>= tp->rx_opt.rcv_wscale;
 
         /* If we advertise zero window, disable fast path. */
-        if (new_win == 0)
+        if (new_win == 0) {
                 tp->pred_flags = 0;
+                if (old_win)
+                        NET_INC_STATS(sock_net(sk),
+                                      LINUX_MIB_TCPTOZEROWINDOWADV);
+        } else if (old_win == 0) {
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+        }
 
         return new_win;
 }
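
The two tcp_select_window() hunks above add zero-window accounting: LINUX_MIB_TCPWANTZEROWINDOWADV counts the cases where __tcp_select_window() computed zero but the already-advertised window could not be shrunk, while LINUX_MIB_TCPTOZEROWINDOWADV and LINUX_MIB_TCPFROMZEROWINDOWADV count actual transitions of the advertised window to and from zero (old_win is kept for exactly that comparison). A minimal user-space sketch of the transition accounting, with illustrative names rather than kernel types:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for the MIB counters touched in the hunk above. */
struct zero_win_stats {
        unsigned long to_zero;          /* advertised window became zero */
        unsigned long from_zero;        /* advertised window reopened    */
};

/* Mirrors the accounting added to tcp_select_window(): old_win is the
 * previously advertised window, new_win the one about to go on the wire.
 */
static void account_window(struct zero_win_stats *st,
                           uint32_t old_win, uint32_t new_win)
{
        if (new_win == 0) {
                if (old_win)
                        st->to_zero++;          /* open -> zero transition */
        } else if (old_win == 0) {
                st->from_zero++;                /* zero -> open transition */
        }
}

int main(void)
{
        struct zero_win_stats st = {0};

        account_window(&st, 65535, 0);          /* closes the window   */
        account_window(&st, 0, 0);              /* stays closed: no-op */
        account_window(&st, 0, 29200);          /* reopens             */
        printf("to_zero=%lu from_zero=%lu\n", st.to_zero, st.from_zero);
        return 0;
}
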
@@ -614,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
                 if (unlikely(!ireq->tstamp_ok))
                         remaining -= TCPOLEN_SACKPERM_ALIGNED;
         }
-        if (foc != NULL) {
+        if (foc != NULL && foc->len >= 0) {
                 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
                 need = (need + 3) & ~3U;  /* Align to 32 bits */
                 if (remaining >= need) {
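
The Fast Open condition now treats a negative foc->len as "no cookie option at all" and accepts a zero-length cookie, so the option can be emitted even with an empty cookie body; either way it still has to fit into the remaining option space after rounding up to a 32-bit boundary. A small sketch of that sizing arithmetic (TCPOLEN_EXP_FASTOPEN_BASE is assumed here to be the 4-byte kind/length/magic header):

#include <stdio.h>

/* Assumed size of the experimental Fast Open option header. */
#define EXP_FASTOPEN_BASE 4

/* Returns the option space consumed for a cookie of cookie_len bytes,
 * or 0 if it does not fit in the remaining option space.
 */
static unsigned int fastopen_option_size(int cookie_len, unsigned int remaining)
{
        unsigned int need;

        if (cookie_len < 0)                     /* no cookie at all */
                return 0;
        need = EXP_FASTOPEN_BASE + cookie_len;
        need = (need + 3) & ~3U;                /* align to 32 bits, as in the hunk */
        return remaining >= need ? need : 0;
}

int main(void)
{
        printf("empty cookie, 12 bytes left: %u\n", fastopen_option_size(0, 12));
        printf("8-byte cookie, 12 bytes left: %u\n", fastopen_option_size(8, 12));
        printf("8-byte cookie, 8 bytes left:  %u\n", fastopen_option_size(8, 8));
        return 0;
}
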
@@ -787,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
                 __sock_put(sk);
         }
         if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
-                sk->sk_prot->mtu_reduced(sk);
+                inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
                 __sock_put(sk);
         }
 }
@@ -865,18 +878,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         BUG_ON(!skb || !tcp_skb_pcount(skb));
 
         if (clone_it) {
-                const struct sk_buff *fclone = skb + 1;
-
-                /* If congestion control is doing timestamping, we must
-                 * take such a timestamp before we potentially clone/copy.
-                 */
-                if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
-                        __net_timestamp(skb);
-
-                if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-                             fclone->fclone == SKB_FCLONE_CLONE))
-                        NET_INC_STATS(sock_net(sk),
-                                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                skb_mstamp_get(&skb->skb_mstamp);
 
                 if (unlikely(skb_cloned(skb)))
                         skb = pskb_copy(skb, gfp_mask);
@@ -884,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                         skb = skb_clone(skb, gfp_mask);
                 if (unlikely(!skb))
                         return -ENOBUFS;
+                /* Our usage of tstamp should remain private */
+                skb->tstamp.tv64 = 0;
         }
 
         inet = inet_sk(sk);
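
With the old fclone peeking gone from tcp_transmit_skb() (it comes back further down as skb_still_in_host_queue()), the clone_it branch now just records a microsecond departure stamp in skb->skb_mstamp before any clone or copy, and clears skb->tstamp on the clone so the socket's private use of that field never reaches lower layers. A user-space analogue of taking such a monotonic microsecond stamp, purely illustrative and not the kernel helper:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Rough analogue of the skb_mstamp idea: one monotonic timestamp,
 * kept in microseconds, taken right before the packet is handed on.
 */
struct pkt_mstamp {
        uint64_t us;
};

static void pkt_mstamp_get(struct pkt_mstamp *m)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        m->us = (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

int main(void)
{
        struct pkt_mstamp sent, acked;

        pkt_mstamp_get(&sent);
        /* ... transmit, wait for the ACK ... */
        pkt_mstamp_get(&acked);
        printf("RTT sample: %llu us\n",
               (unsigned long long)(acked.us - sent.us));
        return 0;
}
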
@@ -912,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         skb_orphan(skb);
         skb->sk = sk;
         skb->destructor = tcp_wfree;
+        skb_set_hash_from_sk(skb, sk);
         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
         /* Build TCP header and checksum it. */
@@ -970,11 +975,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
                               tcp_skb_pcount(skb));
 
-        err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
+        err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
         if (likely(err <= 0))
                 return err;
 
-        tcp_enter_cwr(sk, 1);
+        tcp_enter_cwr(sk);
 
         return net_xmit_eval(err);
 }
@@ -1064,13 +1069,28 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
         tcp_verify_left_out(tp);
 }
 
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+        if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+            !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+                struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+                u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+                shinfo->tx_flags &= ~tsflags;
+                shinfo2->tx_flags |= tsflags;
+                swap(shinfo->tskey, shinfo2->tskey);
+        }
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-                 unsigned int mss_now)
+                 unsigned int mss_now, gfp_t gfp)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *buff;
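
tcp_fragment_tstamp(), added above and called from both tcp_fragment() and tso_fragment(), keeps a pending SO_TIMESTAMPING request attached to whichever fragment ends up carrying the tracked byte: when the tracked sequence number (tskey) is not before the start of the second fragment, the tx_flags and tskey move over to it. A small sketch of that decision using plain wrap-safe sequence arithmetic; the struct below is illustrative, not the kernel's skb_shared_info:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative fragment descriptor: starting sequence plus a pending
 * timestamp request (flags + the sequence number being tracked).
 */
struct frag {
        uint32_t seq;           /* first byte carried by this fragment  */
        uint32_t ts_flags;      /* pending timestamp request, 0 if none */
        uint32_t tskey;         /* sequence number the request tracks   */
};

static bool seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;    /* wrap-safe, like before() */
}

/* Move the pending request to frag2 when the tracked byte now lives there. */
static void split_tstamp(struct frag *frag1, struct frag *frag2)
{
        if (frag1->ts_flags && !seq_before(frag1->tskey, frag2->seq)) {
                frag2->ts_flags = frag1->ts_flags;
                frag2->tskey = frag1->tskey;
                frag1->ts_flags = 0;
        }
}

int main(void)
{
        struct frag a = { .seq = 1000, .ts_flags = 1, .tskey = 1800 };
        struct frag b = { .seq = 1460 };        /* split at 1460 bytes */

        split_tstamp(&a, &b);
        printf("request now on fragment starting at %u\n",
               a.ts_flags ? a.seq : b.seq);
        return 0;
}
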
@@ -1085,11 +1105,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         if (nsize < 0)
                 nsize = 0;
 
-        if (skb_unclone(skb, GFP_ATOMIC))
+        if (skb_unclone(skb, gfp))
                 return -ENOMEM;
 
         /* Get a new skb... force flag on. */
-        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+        buff = sk_stream_alloc_skb(sk, nsize, gfp);
         if (buff == NULL)
                 return -ENOMEM; /* We'll just try again later. */
 
@@ -1131,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
          */
         TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
         buff->tstamp = skb->tstamp;
+        tcp_fragment_tstamp(skb, buff);
 
         old_factor = tcp_skb_pcount(skb);
 
@@ -1376,12 +1397,43 @@ unsigned int tcp_current_mss(struct sock *sk)
         return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
 {
         struct tcp_sock *tp = tcp_sk(sk);
 
-        if (tp->packets_out >= tp->snd_cwnd) {
+        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+                /* Limited by application or receiver window. */
+                u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+                u32 win_used = max(tp->snd_cwnd_used, init_win);
+                if (win_used < tp->snd_cwnd) {
+                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
+                        tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+                }
+                tp->snd_cwnd_used = 0;
+        }
+        tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        /* Track the maximum number of outstanding packets in each
+         * window, and remember whether we were cwnd-limited then.
+         */
+        if (!before(tp->snd_una, tp->max_packets_seq) ||
+            tp->packets_out > tp->max_packets_out) {
+                tp->max_packets_out = tp->packets_out;
+                tp->max_packets_seq = tp->snd_nxt;
+                tp->is_cwnd_limited = is_cwnd_limited;
+        }
+
+        if (tcp_is_cwnd_limited(sk)) {
                 /* Network is feed fully. */
                 tp->snd_cwnd_used = 0;
                 tp->snd_cwnd_stamp = tcp_time_stamp;
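
The rewritten congestion-window validation above splits RFC 2861 handling in two. tcp_cwnd_validate() now records, per window of data, the largest packets_out seen and whether the flow was cwnd-limited at that point; only a genuinely network-limited flow keeps the old "cwnd fully used" bookkeeping, while a flow that was held back by the application or receiver gets its cwnd pulled halfway toward what was actually used in tcp_cwnd_application_limited(). A sketch of that halving rule with invented numbers:

#include <stdio.h>
#include <stdint.h>

/* Application-limited shrink from the hunk above: move cwnd halfway
 * toward the amount that was actually used, never below the initial
 * window. Values are in packets.
 */
static uint32_t app_limited_cwnd(uint32_t snd_cwnd, uint32_t cwnd_used,
                                 uint32_t init_win)
{
        uint32_t win_used = cwnd_used > init_win ? cwnd_used : init_win;

        if (win_used < snd_cwnd)
                snd_cwnd = (snd_cwnd + win_used) >> 1;
        return snd_cwnd;
}

int main(void)
{
        /* cwnd grew to 100 packets but the sender never had more than 12
         * in flight; after an idle RTO cwnd drops to (100 + 12) / 2 = 56.
         */
        printf("new cwnd: %u\n", app_limited_cwnd(100, 12, 10));
        return 0;
}
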
@@ -1426,7 +1478,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  * With Minshall's modification: all sent small packets are ACKed.
  */
 static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
-                            unsigned int mss_now, int nonagle)
+                            int nonagle)
 {
         return partial &&
                 ((nonagle & TCP_NAGLE_CORK) ||
@@ -1458,7 +1510,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
          * to include this last segment in this skb.
          * Otherwise, we'll split the skb at last MSS boundary
          */
-        if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+        if (tcp_nagle_check(partial != 0, tp, nonagle))
                 return needed - partial;
 
         return needed;
@@ -1521,7 +1573,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
         if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                 return true;
 
-        if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
+        if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
                 return true;
 
         return false;
@@ -1590,7 +1642,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
         /* All of a TSO frame must be composed of paged data. */
         if (skb->len != skb->data_len)
-                return tcp_fragment(sk, skb, len, mss_now);
+                return tcp_fragment(sk, skb, len, mss_now, gfp);
 
         buff = sk_stream_alloc_skb(sk, 0, gfp);
         if (unlikely(buff == NULL))
@@ -1616,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
         buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
         skb_split(skb, buff, len);
+        tcp_fragment_tstamp(skb, buff);
 
         /* Fix up tso_factor for both original and new SKB. */
         tcp_set_skb_tso_segs(sk, skb, mss_now);
@@ -1633,7 +1686,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+                                 bool *is_cwnd_limited)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1697,6 +1751,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
         if (!tp->tso_deferred)
                 tp->tso_deferred = 1 | (jiffies << 1);
 
+        if (cong_win < send_win && cong_win < skb->len)
+                *is_cwnd_limited = true;
+
         return true;
 
 send_now:
@@ -1857,6 +1914,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
         unsigned int tso_segs, sent_pkts;
         int cwnd_quota;
         int result;
+        bool is_cwnd_limited = false;
 
         sent_pkts = 0;
 
@@ -1876,11 +1934,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
                 BUG_ON(!tso_segs);
 
-                if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+                if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+                        /* "when" is used as a start point for the retransmit timer */
+                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                         goto repair; /* Skip network transmission */
+                }
 
                 cwnd_quota = tcp_cwnd_test(tp, skb);
                 if (!cwnd_quota) {
+                        is_cwnd_limited = true;
                         if (push_one == 2)
                                 /* Force out a loss probe pkt. */
                                 cwnd_quota = 1;
@@ -1897,7 +1959,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                       nonagle : TCP_NAGLE_PUSH))))
                                 break;
                 } else {
-                        if (!push_one && tcp_tso_should_defer(sk, skb))
+                        if (!push_one &&
+                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
                                 break;
                 }
 
@@ -1919,10 +1982,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                         /* It is possible TX completion already happened
                          * before we set TSQ_THROTTLED, so we must
                          * test again the condition.
-                         * We abuse smp_mb__after_clear_bit() because
-                         * there is no smp_mb__after_set_bit() yet
                          */
-                        smp_mb__after_clear_bit();
+                        smp_mb__after_atomic();
                        if (atomic_read(&sk->sk_wmem_alloc) > limit)
                                 break;
                 }
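
smp_mb__after_clear_bit() was folded into the more general smp_mb__after_atomic() around this time; the pattern in the hunk is unchanged: set TSQ_THROTTLED, issue a full barrier, then re-read sk_wmem_alloc so a TX completion that raced with the first check is not missed. A C11 sketch of the same "set flag, fence, re-check" idiom with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong wmem_alloc;         /* bytes queued below the socket */
static atomic_bool  throttled;          /* analogue of TSQ_THROTTLED     */

/* Returns true if transmission should stop until a completion wakes us. */
static bool should_throttle(unsigned long limit)
{
        if (atomic_load(&wmem_alloc) <= limit)
                return false;

        atomic_store(&throttled, true);
        /* Full fence: publish the flag before re-reading the counter,
         * mirroring smp_mb__after_atomic() after set_bit() in the hunk.
         */
        atomic_thread_fence(memory_order_seq_cst);

        /* A completion may have drained the queue without seeing the flag,
         * so test the condition again before giving up.
         */
        return atomic_load(&wmem_alloc) > limit;
}

int main(void)
{
        atomic_store(&wmem_alloc, 1 << 20);
        printf("throttle at 128KB limit: %d\n", should_throttle(128 * 1024));
        return 0;
}
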
@@ -1964,7 +2025,7 @@ repair:
                 /* Send one loss probe per tail loss episode. */
                 if (push_one != 2)
                         tcp_schedule_loss_probe(sk);
-                tcp_cwnd_validate(sk);
+                tcp_cwnd_validate(sk, is_cwnd_limited);
                 return false;
         }
         return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -1975,7 +2036,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         u32 timeout, tlp_time_stamp, rto_time_stamp;
-        u32 rtt = tp->srtt >> 3;
+        u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
         if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
                 return false;
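
Several hunks in this range switch from tp->srtt to tp->srtt_us: the smoothed RTT is now kept in microseconds, still scaled by 8 (hence the >> 3 at every converted call site), so callers convert with usecs_to_jiffies(srtt_us >> 3) instead of srtt >> 3. A tiny sketch of that conversion, assuming HZ=1000 purely for the example:

#include <stdio.h>
#include <stdint.h>

#define HZ 1000                 /* assumed tick rate for the example */

/* srtt_us holds 8 * smoothed_rtt in microseconds, as in the hunk. */
static unsigned long srtt_us_to_jiffies(uint32_t srtt_us)
{
        uint32_t rtt_us = srtt_us >> 3;                 /* undo the 8x scaling */

        return (rtt_us + (1000000 / HZ) - 1) / (1000000 / HZ);  /* round up */
}

int main(void)
{
        /* A 25 ms smoothed RTT is stored as 8 * 25000 us. */
        printf("%lu jiffies\n", srtt_us_to_jiffies(8 * 25000));
        return 0;
}
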
@@ -1997,7 +2058,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         /* Schedule a loss probe in 2*RTT for SACK capable connections
          * in Open state, that are either limited by cwnd or application.
          */
-        if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
+        if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
             !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
                 return false;
 
@@ -2028,6 +2089,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+                                    const struct sk_buff *skb)
+{
+        const struct sk_buff *fclone = skb + 1;
+
+        if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+                     fclone->fclone == SKB_FCLONE_CLONE)) {
+                NET_INC_STATS_BH(sock_net(sk),
+                                 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                return true;
+        }
+        return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
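
skb_still_in_host_queue() leans on the fast-clone layout: the original skb and its clone come out of one allocation, placed back to back, so skb + 1 reaches the companion, and finding the ORIG/CLONE state pair means an earlier transmit of this packet is still sitting in a qdisc or driver queue, making another retransmit pointless. A generic sketch of the "companion in the same allocation" idea; types and states below are illustrative, not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

enum clone_state { UNAVAILABLE, ORIG, CLONE };

struct buf {
        enum clone_state fclone;
        /* ... payload descriptors would live here ... */
};

/* Allocate an original/clone pair in one block, like skb fast clones. */
static struct buf *fclone_alloc(void)
{
        struct buf *pair = calloc(2, sizeof(*pair));

        if (pair)
                pair[0].fclone = ORIG;  /* pair[1] stays UNAVAILABLE */
        return pair;
}

/* Hand out the companion as the "clone" of a previous transmit. */
static struct buf *fclone_take(struct buf *orig)
{
        struct buf *clone = orig + 1;   /* companion lives right after */

        clone->fclone = CLONE;
        return clone;
}

/* The check the hunk adds: is a prior transmit still alive out there? */
static bool still_in_host_queue(const struct buf *orig)
{
        const struct buf *clone = orig + 1;

        return orig->fclone == ORIG && clone->fclone == CLONE;
}

int main(void)
{
        struct buf *pkt = fclone_alloc();

        printf("before clone: %d\n", still_in_host_queue(pkt));
        fclone_take(pkt);
        printf("after clone:  %d\n", still_in_host_queue(pkt));
        free(pkt);
        return 0;
}
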
@@ -2053,12 +2133,16 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb))
                 goto rearm_timer;
 
+        if (skb_still_in_host_queue(sk, skb))
+                goto rearm_timer;
+
         pcount = tcp_skb_pcount(skb);
         if (WARN_ON(!pcount))
                 goto rearm_timer;
 
         if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+                                          GFP_ATOMIC)))
                         goto rearm_timer;
                 skb = tcp_write_queue_tail(sk);
         }
@@ -2066,9 +2150,7 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                 goto rearm_timer;
 
-        /* Probe with zero data doesn't trigger fast recovery. */
-        if (skb->len > 0)
-                err = __tcp_retransmit_skb(sk, skb);
+        err = __tcp_retransmit_skb(sk, skb);
 
         /* Record snd_nxt for loss detection. */
         if (likely(!err))
@@ -2082,7 +2164,6 @@ rearm_timer:
         if (likely(!err))
                 NET_INC_STATS_BH(sock_net(sk),
                                  LINUX_MIB_TCPLOSSPROBES);
-        return;
 }
 
 /* Push out any pending frames which were held back due to
@@ -2180,7 +2261,8 @@ u32 __tcp_select_window(struct sock *sk)
          */
         int mss = icsk->icsk_ack.rcv_mss;
         int free_space = tcp_space(sk);
-        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+        int allowed_space = tcp_full_space(sk);
+        int full_space = min_t(int, tp->window_clamp, allowed_space);
         int window;
 
         if (mss > full_space)
@@ -2193,7 +2275,19 @@ u32 __tcp_select_window(struct sock *sk)
                         tp->rcv_ssthresh = min(tp->rcv_ssthresh,
                                                4U * tp->advmss);
 
-                if (free_space < mss)
+                /* free_space might become our new window, make sure we don't
+                 * increase it due to wscale.
+                 */
+                free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+                /* if free space is less than mss estimate, or is below 1/16th
+                 * of the maximum allowed, try to move to zero-window, else
+                 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+                 * new incoming data is dropped due to memory limits.
+                 * With large window, mss test triggers way too late in order
+                 * to announce zero window in time before rmem limit kicks in.
+                 */
+                if (free_space < (allowed_space >> 4) || free_space < mss)
                         return 0;
         }
 
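
Two receive-window fixes land in __tcp_select_window() above: free_space is rounded down to the granularity the negotiated window scale can express, so the shifted value on the wire can never promise more than is really free, and the zero-window test becomes "below one MSS or below 1/16 of the maximum allowed window", so very large buffers still reach a zero-window advertisement before the rmem limit starts dropping data. A sketch of both tests with invented values:

#include <stdio.h>
#include <stdint.h>

/* Round down to the granularity that a window scale of 'wscale' can
 * express, mirroring round_down(free_space, 1 << rcv_wscale).
 */
static uint32_t scale_round_down(uint32_t free_space, unsigned int wscale)
{
        return free_space & ~((1u << wscale) - 1);
}

/* The zero-window decision from the hunk: announce zero when free space
 * drops under one MSS or under 1/16th of the largest possible window.
 */
static int should_announce_zero(uint32_t free_space, uint32_t allowed_space,
                                uint32_t mss)
{
        return free_space < (allowed_space >> 4) || free_space < mss;
}

int main(void)
{
        uint32_t free_space = scale_round_down(15000, 7);       /* -> 14976 */
        uint32_t allowed_space = 4 * 1024 * 1024;               /* 4 MB rcvbuf */

        printf("rounded free space: %u\n", free_space);
        printf("zero window? %d\n",
               should_announce_zero(free_space, allowed_space, 1460));
        return 0;
}
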
@@ -2362,6 +2456,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
             min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
                 return -EAGAIN;
 
+        if (skb_still_in_host_queue(sk, skb))
+                return -EBUSY;
+
         if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                         BUG();
@@ -2384,7 +2481,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                 return -EAGAIN;
 
         if (skb->len > cur_mss) {
-                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+                if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
                 int oldpcount = tcp_skb_pcount(skb);
@@ -2418,8 +2515,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
         }
 
-        if (likely(!err))
+        if (likely(!err)) {
                 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+                /* Update global TCP statistics. */
+                TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+                tp->total_retrans++;
+        }
         return err;
 }
 
@@ -2429,11 +2532,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         int err = __tcp_retransmit_skb(sk, skb);
 
         if (err == 0) {
-                /* Update global TCP statistics. */
-                TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
-
-                tp->total_retrans++;
-
 #if FASTRETRANS_DEBUG > 0
                 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                         net_dbg_ratelimited("retrans_out leaked\n");
@@ -2448,15 +2546,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                 if (!tp->retrans_stamp)
                         tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 
-                tp->undo_retrans += tcp_skb_pcount(skb);
-
                 /* snd_nxt is stored to detect loss of retransmitted segment,
                  * see tcp_input.c tcp_sacktag_write_queue().
                  */
                 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-        } else {
+        } else if (err != -EBUSY) {
                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
         }
+
+        if (tp->undo_retrans < 0)
+                tp->undo_retrans = 0;
+        tp->undo_retrans += tcp_skb_pcount(skb);
         return err;
 }
 
@@ -2717,7 +2817,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         int tcp_header_size;
         int mss;
 
-        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+        skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
         if (unlikely(!skb)) {
                 dst_release(dst);
                 return NULL;
@@ -2732,27 +2832,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
                 mss = tp->rx_opt.user_mss;
 
-        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-                __u8 rcv_wscale;
-                /* Set this up on the first call only */
-                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-                /* limit the window selection if the user enforce a smaller rx buffer */
-                if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-                    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-                        req->window_clamp = tcp_full_space(sk);
-
-                /* tcp_full_space because it is guaranteed to be the first packet */
-                tcp_select_initial_window(tcp_full_space(sk),
-                        mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-                        &req->rcv_wnd,
-                        &req->window_clamp,
-                        ireq->wscale_ok,
-                        &rcv_wscale,
-                        dst_metric(dst, RTAX_INITRWND));
-                ireq->rcv_wscale = rcv_wscale;
-        }
-
         memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
         if (unlikely(req->cookie_ts))
@@ -2787,7 +2866,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         th->window = htons(min(req->rcv_wnd, 65535U));
         tcp_options_write((__be32 *)(th + 1), tp, &opts);
         th->doff = (tcp_header_size >> 2);
-        TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
+        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
         /* Okay, we have all we need - do the md5 hash if needed */
@@ -2959,9 +3038,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         tcp_connect_queue_skb(sk, data);
         fo->copied = data->len;
 
+        /* syn_data is about to be sent, we need to take current time stamps
+         * for the packets that are in write queue : SYN packet and DATA
+         */
+        skb_mstamp_get(&syn->skb_mstamp);
+        data->skb_mstamp = syn->skb_mstamp;
+
         if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
                 tp->syn_data = (fo->copied > 0);
-                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                 goto done;
         }
         syn_data = NULL;
@@ -3049,8 +3134,9 @@ void tcp_send_delayed_ack(struct sock *sk)
                  * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
                  * directly.
                  */
-                if (tp->srtt) {
-                        int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+                if (tp->srtt_us) {
+                        int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+                                        TCP_DELACK_MIN);
 
                         if (rtt < max_ato)
                                 max_ato = rtt;
@@ -3178,7 +3264,7 @@ int tcp_write_wakeup(struct sock *sk)
                     skb->len > mss) {
                         seg_size = min(seg_size, mss);
                         TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                        if (tcp_fragment(sk, skb, seg_size, mss))
+                        if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
                                 return -1;
                 } else if (!tcp_skb_pcount(skb))
                         tcp_set_skb_tso_segs(sk, skb, mss);
@@ -3236,3 +3322,18 @@ void tcp_send_probe0(struct sock *sk)
                                           TCP_RTO_MAX);
         }
 }
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+        const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+        struct flowi fl;
+        int res;
+
+        res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+        if (!res) {
+                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+        }
+        return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
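
tcp_rtx_synack() gives IPv4 and IPv6 a single entry point for SYN-ACK retransmits: the request socket carries a per-family ops table, its send_synack() does the address-family-specific work, and TCP_MIB_RETRANSSEGS plus LINUX_MIB_TCPSYNRETRANS are bumped in one place. A hedged sketch of that ops-table dispatch pattern in plain C; the structures and names are illustrative, not the kernel's tcp_request_sock_ops:

#include <stdio.h>

/* Illustrative per-address-family operations table. */
struct synack_ops {
        const char *name;
        int (*send_synack)(void *req);  /* 0 on success */
};

struct request {
        const struct synack_ops *af_specific;   /* picked at request creation */
        int retrans;
};

static int v4_send_synack(void *req) { (void)req; return 0; }
static int v6_send_synack(void *req) { (void)req; return 0; }

static const struct synack_ops v4_ops = { "IPv4", v4_send_synack };
static const struct synack_ops v6_ops = { "IPv6", v6_send_synack };

/* One retransmit path for both families, as tcp_rtx_synack() now is:
 * dispatch through the ops table and account in a single place.
 */
static int rtx_synack(struct request *req)
{
        int res = req->af_specific->send_synack(req);

        if (!res)
                req->retrans++;         /* stand-in for the MIB counters */
        return res;
}

int main(void)
{
        struct request r4 = { &v4_ops, 0 };
        struct request r6 = { &v6_ops, 0 };

        rtx_synack(&r4);
        rtx_synack(&r6);
        printf("%s retrans=%d, %s retrans=%d\n",
               v4_ops.name, r4.retrans, v6_ops.name, r6.retrans);
        return 0;
}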