Diffstat (limited to 'net/ipv4/tcp_output.c')
 -rw-r--r--  net/ipv4/tcp_output.c  | 261
 1 file changed, 181 insertions, 80 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 17a11e65e57f..5a7c41fbc6d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	     icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 		tcp_rearm_rto(sk);
 	}
+
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+		      tcp_skb_pcount(skb));
 }
 
 /* SND.NXT, if window was not shrunk.
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
 static u16 tcp_select_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 old_win = tp->rcv_wnd;
 	u32 cur_win = tcp_receive_window(tp);
 	u32 new_win = __tcp_select_window(sk);
 
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
 		 *
 		 * Relax Will Robinson.
 		 */
+		if (new_win == 0)
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPWANTZEROWINDOWADV);
 		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
 	}
 	tp->rcv_wnd = new_win;
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)
 	new_win >>= tp->rx_opt.rcv_wscale;
 
 	/* If we advertise zero window, disable fast path. */
-	if (new_win == 0)
+	if (new_win == 0) {
 		tp->pred_flags = 0;
+		if (old_win)
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPTOZEROWINDOWADV);
+	} else if (old_win == 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+	}
 
 	return new_win;
 }
@@ -614,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-	if (foc != NULL) {
+	if (foc != NULL && foc->len >= 0) {
 		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 		need = (need + 3) & ~3U; /* Align to 32 bits */
 		if (remaining >= need) {
@@ -787,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
 		__sock_put(sk);
 	}
 	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
-		sk->sk_prot->mtu_reduced(sk);
+		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
 }
@@ -865,18 +878,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	BUG_ON(!skb || !tcp_skb_pcount(skb));
 
 	if (clone_it) {
-		const struct sk_buff *fclone = skb + 1;
-
-		/* If congestion control is doing timestamping, we must
-		 * take such a timestamp before we potentially clone/copy.
-		 */
-		if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
-			__net_timestamp(skb);
-
-		if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-			     fclone->fclone == SKB_FCLONE_CLONE))
-			NET_INC_STATS(sock_net(sk),
-				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		skb_mstamp_get(&skb->skb_mstamp);
 
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -884,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			skb = skb_clone(skb, gfp_mask);
 		if (unlikely(!skb))
 			return -ENOBUFS;
+		/* Our usage of tstamp should remain private */
+		skb->tstamp.tv64 = 0;
 	}
 
 	inet = inet_sk(sk);
@@ -912,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = tcp_wfree;
+	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
@@ -970,11 +975,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
 			      tcp_skb_pcount(skb));
 
-	err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
+	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
 	if (likely(err <= 0))
 		return err;
 
-	tcp_enter_cwr(sk, 1);
+	tcp_enter_cwr(sk);
 
 	return net_xmit_eval(err);
 }
@@ -1064,13 +1069,28 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+		shinfo->tx_flags &= ~tsflags;
+		shinfo2->tx_flags |= tsflags;
+		swap(shinfo->tskey, shinfo2->tskey);
+	}
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-		 unsigned int mss_now)
+		 unsigned int mss_now, gfp_t gfp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -1085,11 +1105,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (skb_unclone(skb, GFP_ATOMIC))
+	if (skb_unclone(skb, gfp))
 		return -ENOMEM;
 
 	/* Get a new skb... force flag on. */
-	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, nsize, gfp);
 	if (buff == NULL)
 		return -ENOMEM; /* We'll just try again later. */
 
@@ -1131,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 	buff->tstamp = skb->tstamp;
+	tcp_fragment_tstamp(skb, buff);
 
 	old_factor = tcp_skb_pcount(skb);
 
@@ -1376,12 +1397,43 @@ unsigned int tcp_current_mss(struct sock *sk)
 	return mss_now;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out >= tp->snd_cwnd) {
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Limited by application or receiver window. */
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
+		if (win_used < tp->snd_cwnd) {
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+		}
+		tp->snd_cwnd_used = 0;
+	}
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Track the maximum number of outstanding packets in each
+	 * window, and remember whether we were cwnd-limited then.
+	 */
+	if (!before(tp->snd_una, tp->max_packets_seq) ||
+	    tp->packets_out > tp->max_packets_out) {
+		tp->max_packets_out = tp->packets_out;
+		tp->max_packets_seq = tp->snd_nxt;
+		tp->is_cwnd_limited = is_cwnd_limited;
+	}
+
+	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1426,7 +1478,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  * With Minshall's modification: all sent small packets are ACKed.
  */
 static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
-			    unsigned int mss_now, int nonagle)
+			    int nonagle)
 {
 	return partial &&
 		((nonagle & TCP_NAGLE_CORK) ||
@@ -1458,7 +1510,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 	 * to include this last segment in this skb.
 	 * Otherwise, we'll split the skb at last MSS boundary
 	 */
-	if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+	if (tcp_nagle_check(partial != 0, tp, nonagle))
 		return needed - partial;
 
 	return needed;
@@ -1521,7 +1573,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
 		return true;
 
-	if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
+	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
 		return true;
 
 	return false;
@@ -1590,7 +1642,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data. */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now);
+		return tcp_fragment(sk, skb, len, mss_now, gfp);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
@@ -1616,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
+	tcp_fragment_tstamp(skb, buff);
 
 	/* Fix up tso_factor for both original and new SKB. */
 	tcp_set_skb_tso_segs(sk, skb, mss_now);
@@ -1633,7 +1686,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1697,6 +1751,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if (!tp->tso_deferred)
 		tp->tso_deferred = 1 | (jiffies << 1);
 
+	if (cong_win < send_win && cong_win < skb->len)
+		*is_cwnd_limited = true;
+
 	return true;
 
 send_now:
@@ -1857,6 +1914,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
+	bool is_cwnd_limited = false;
 
 	sent_pkts = 0;
 
@@ -1876,11 +1934,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "when" is used as a start point for the retransmit timer */
+			TCP_SKB_CB(skb)->when = tcp_time_stamp;
 			goto repair; /* Skip network transmission */
+		}
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
+			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -1897,7 +1959,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
 				break;
 		}
 
@@ -1919,10 +1982,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			/* It is possible TX completion already happened
 			 * before we set TSQ_THROTTLED, so we must
 			 * test again the condition.
-			 * We abuse smp_mb__after_clear_bit() because
-			 * there is no smp_mb__after_set_bit() yet
 			 */
-			smp_mb__after_clear_bit();
+			smp_mb__after_atomic();
 			if (atomic_read(&sk->sk_wmem_alloc) > limit)
 				break;
 		}
@@ -1964,7 +2025,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -1975,7 +2036,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 timeout, tlp_time_stamp, rto_time_stamp;
-	u32 rtt = tp->srtt >> 3;
+	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
 	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
 		return false;
@@ -1997,7 +2058,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
+	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
 	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 		return false;
 
@@ -2028,6 +2089,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+				    const struct sk_buff *skb)
+{
+	const struct sk_buff *fclone = skb + 1;
+
+	if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+		     fclone->fclone == SKB_FCLONE_CLONE)) {
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		return true;
+	}
+	return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
@@ -2053,12 +2133,16 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb))
 		goto rearm_timer;
 
+	if (skb_still_in_host_queue(sk, skb))
+		goto rearm_timer;
+
 	pcount = tcp_skb_pcount(skb);
 	if (WARN_ON(!pcount))
 		goto rearm_timer;
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+					  GFP_ATOMIC)))
 			goto rearm_timer;
 		skb = tcp_write_queue_tail(sk);
 	}
@@ -2066,9 +2150,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	/* Probe with zero data doesn't trigger fast recovery. */
-	if (skb->len > 0)
-		err = __tcp_retransmit_skb(sk, skb);
+	err = __tcp_retransmit_skb(sk, skb);
 
 	/* Record snd_nxt for loss detection. */
 	if (likely(!err))
@@ -2082,7 +2164,6 @@ rearm_timer:
 	if (likely(!err))
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPLOSSPROBES);
-	return;
 }
 
 /* Push out any pending frames which were held back due to
@@ -2180,7 +2261,8 @@ u32 __tcp_select_window(struct sock *sk)
 	 */
 	int mss = icsk->icsk_ack.rcv_mss;
 	int free_space = tcp_space(sk);
-	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+	int allowed_space = tcp_full_space(sk);
+	int full_space = min_t(int, tp->window_clamp, allowed_space);
 	int window;
 
 	if (mss > full_space)
@@ -2193,7 +2275,19 @@ u32 __tcp_select_window(struct sock *sk)
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
-		if (free_space < mss)
+		/* free_space might become our new window, make sure we don't
+		 * increase it due to wscale.
+		 */
+		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+		/* if free space is less than mss estimate, or is below 1/16th
+		 * of the maximum allowed, try to move to zero-window, else
+		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+		 * new incoming data is dropped due to memory limits.
+		 * With large window, mss test triggers way too late in order
+		 * to announce zero window in time before rmem limit kicks in.
+		 */
+		if (free_space < (allowed_space >> 4) || free_space < mss)
 			return 0;
 	}
 
@@ -2362,6 +2456,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
 		return -EAGAIN;
 
+	if (skb_still_in_host_queue(sk, skb))
+		return -EBUSY;
+
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
 		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
 			BUG();
@@ -2384,7 +2481,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		return -EAGAIN;
 
 	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		int oldpcount = tcp_skb_pcount(skb);
@@ -2418,8 +2515,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
 
-	if (likely(!err))
+	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+		/* Update global TCP statistics. */
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		tp->total_retrans++;
+	}
 	return err;
 }
 
@@ -2429,11 +2532,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	int err = __tcp_retransmit_skb(sk, skb);
 
 	if (err == 0) {
-		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
-
-		tp->total_retrans++;
-
 #if FASTRETRANS_DEBUG > 0
 		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
 			net_dbg_ratelimited("retrans_out leaked\n");
@@ -2448,15 +2546,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 
-		tp->undo_retrans += tcp_skb_pcount(skb);
-
 		/* snd_nxt is stored to detect loss of retransmitted segment,
 		 * see tcp_input.c tcp_sacktag_write_queue().
 		 */
 		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-	} else {
+	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
+
+	if (tp->undo_retrans < 0)
+		tp->undo_retrans = 0;
+	tp->undo_retrans += tcp_skb_pcount(skb);
 	return err;
 }
 
@@ -2717,7 +2817,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	int tcp_header_size;
 	int mss;
 
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
 	if (unlikely(!skb)) {
 		dst_release(dst);
 		return NULL;
@@ -2732,27 +2832,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
 		mss = tp->rx_opt.user_mss;
 
-	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-		__u8 rcv_wscale;
-		/* Set this up on the first call only */
-		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
-		/* limit the window selection if the user enforce a smaller rx buffer */
-		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-			req->window_clamp = tcp_full_space(sk);
-
-		/* tcp_full_space because it is guaranteed to be the first packet */
-		tcp_select_initial_window(tcp_full_space(sk),
-			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
-			&req->rcv_wnd,
-			&req->window_clamp,
-			ireq->wscale_ok,
-			&rcv_wscale,
-			dst_metric(dst, RTAX_INITRWND));
-		ireq->rcv_wscale = rcv_wscale;
-	}
-
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
@@ -2787,7 +2866,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->window = htons(min(req->rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	th->doff = (tcp_header_size >> 2);
-	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
@@ -2959,9 +3038,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	tcp_connect_queue_skb(sk, data);
 	fo->copied = data->len;
 
+	/* syn_data is about to be sent, we need to take current time stamps
+	 * for the packets that are in write queue : SYN packet and DATA
+	 */
+	skb_mstamp_get(&syn->skb_mstamp);
+	data->skb_mstamp = syn->skb_mstamp;
+
 	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
 		tp->syn_data = (fo->copied > 0);
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
 	syn_data = NULL;
@@ -3049,8 +3134,9 @@ void tcp_send_delayed_ack(struct sock *sk)
 	 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
 	 * directly.
 	 */
-	if (tp->srtt) {
-		int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+	if (tp->srtt_us) {
+		int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+				TCP_DELACK_MIN);
 
 		if (rtt < max_ato)
 			max_ato = rtt;
@@ -3178,7 +3264,7 @@ int tcp_write_wakeup(struct sock *sk)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss))
+			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb, mss);
@@ -3236,3 +3322,18 @@ void tcp_send_probe0(struct sock *sk)
 					  TCP_RTO_MAX);
 	}
 }
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	struct flowi fl;
+	int res;
+
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+	if (!res) {
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	}
+	return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);