Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	339
1 file changed, 179 insertions(+), 160 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 896e9dfbdb5c..c3c082ed3879 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
-	}
 
 	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
 		      tcp_skb_pcount(skb));
 }
 
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
+ * window scaling factor due to loss of precision.
  * If window has been shrunk, what should we make? It is not clear at all.
  * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
  * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
-	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+	    (tp->rx_opt.wscale_ok &&
+	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
 		return tp->snd_nxt;
 	else
 		return tcp_wnd_end(tp);
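Note on the new condition above: a window advertised under window scaling is only precise to (1 << wscale) bytes, so a "shrink" smaller than that granularity can be rounding error rather than a real shrink, and SND.NXT stays acceptable. A minimal standalone C sketch of that precision argument (the helper name and values are illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Illustrative model: a window expressed with scale "wscale" can only
 * represent multiples of (1 << wscale) bytes, so being past the window
 * end by less than that amount is within the advertisement's precision.
 */
static int shrink_within_wscale_precision(uint32_t snd_nxt, uint32_t wnd_end,
					  uint8_t wscale)
{
	uint32_t shrink = snd_nxt - wnd_end;	/* how far past the window end */

	return shrink < (1u << wscale);
}

int main(void)
{
	/* With wscale 7 the window is only precise to 128 bytes, so being
	 * 100 bytes "past" the window end is within rounding error.
	 */
	printf("%d\n", shrink_within_wscale_precision(1000100, 1000000, 7)); /* 1 */
	printf("%d\n", shrink_within_wscale_precision(1000200, 1000000, 7)); /* 0 */
	return 0;
}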
@@ -640,7 +641,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb);
+		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
 		opts->tsecr = req->ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -769,25 +770,27 @@ static void tcp_tasklet_func(unsigned long data)
 		list_del(&tp->tsq_node);
 
 		sk = (struct sock *)tp;
-		bh_lock_sock(sk);
-
-		if (!sock_owned_by_user(sk)) {
-			tcp_tsq_handler(sk);
-		} else {
-			/* defer the work to tcp_release_cb() */
-			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+		smp_mb__before_atomic();
+		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+		if (!sk->sk_lock.owned &&
+		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+			bh_lock_sock(sk);
+			if (!sock_owned_by_user(sk)) {
+				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+				tcp_tsq_handler(sk);
+			}
+			bh_unlock_sock(sk);
 		}
-		bh_unlock_sock(sk);
 
-		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
 		sk_free(sk);
 	}
 }
 
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
-			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
-			  (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
+			  TCPF_WRITE_TIMER_DEFERRED |	\
+			  TCPF_DELACK_TIMER_DEFERRED |	\
+			  TCPF_MTU_REDUCED_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -797,18 +800,17 @@ static void tcp_tasklet_func(unsigned long data)
  */
 void tcp_release_cb(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long flags, nflags;
 
 	/* perform an atomic operation only if at least one flag is set */
 	do {
-		flags = tp->tsq_flags;
+		flags = sk->sk_tsq_flags;
 		if (!(flags & TCP_DEFERRED_ALL))
 			return;
 		nflags = flags & ~TCP_DEFERRED_ALL;
-	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
-	if (flags & (1UL << TCP_TSQ_DEFERRED))
+	if (flags & TCPF_TSQ_DEFERRED)
 		tcp_tsq_handler(sk);
 
 	/* Here begins the tricky part :
@@ -822,15 +824,15 @@ void tcp_release_cb(struct sock *sk)
 	 */
 	sock_release_ownership(sk);
 
-	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
 		tcp_delack_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
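The pattern in tcp_release_cb() is: atomically claim every deferred-work bit in one cmpxchg() pass, then run the corresponding handlers outside the loop so each piece of work runs exactly once. A minimal userspace model of that claim-then-dispatch pattern, using C11 atomics instead of the kernel's cmpxchg() (the flag names and work bodies are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define DEFERRED_A   (1UL << 0)
#define DEFERRED_B   (1UL << 1)
#define DEFERRED_ALL (DEFERRED_A | DEFERRED_B)

static _Atomic unsigned long tsq_flags;

static void release_deferred(void)
{
	unsigned long flags, nflags;

	/* Atomically take ownership of every deferred bit that is set. */
	do {
		flags = atomic_load(&tsq_flags);
		if (!(flags & DEFERRED_ALL))
			return;
		nflags = flags & ~DEFERRED_ALL;
	} while (!atomic_compare_exchange_weak(&tsq_flags, &flags, nflags));

	/* Dispatch outside the loop, based on the snapshot we claimed. */
	if (flags & DEFERRED_A)
		puts("run deferred work A");
	if (flags & DEFERRED_B)
		puts("run deferred work B");
}

int main(void)
{
	atomic_fetch_or(&tsq_flags, DEFERRED_A | DEFERRED_B);
	release_deferred();	/* runs A and B once */
	release_deferred();	/* nothing left to do */
	return 0;
}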
@@ -860,6 +862,7 @@ void tcp_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nval, oval;
 	int wmem;
 
 	/* Keep one reference on sk_wmem_alloc.
@@ -877,16 +880,25 @@ void tcp_wfree(struct sk_buff *skb)
 	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 		goto out;
 
-	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
-	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
-		unsigned long flags;
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
 		struct tsq_tasklet *tsq;
+		bool empty;
+
+		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+			goto out;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
 
 		/* queue this socket to tasklet queue */
 		local_irq_save(flags);
 		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
 		list_add(&tp->tsq_node, &tsq->head);
-		tasklet_schedule(&tsq->tasklet);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
 		local_irq_restore(flags);
 		return;
 	}
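The rewritten tcp_wfree() replaces two atomic bit operations with one cmpxchg() loop: the THROTTLED to QUEUED transition (plus setting the deferred bit) happens as a single state change, retried if the flags word moved underneath it, and only the winner queues the socket. A small standalone C11 sketch of the same compare-exchange loop (the flag names and queue_for_tasklet() are stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>

#define F_THROTTLED (1UL << 0)
#define F_QUEUED    (1UL << 1)
#define F_DEFERRED  (1UL << 2)

static _Atomic unsigned long sk_flags = F_THROTTLED;

static void queue_for_tasklet(void)	/* stand-in for the tasklet list add */
{
	puts("socket queued for deferred transmit");
}

static void wfree_like_transition(void)
{
	unsigned long oval = atomic_load(&sk_flags);
	unsigned long nval;

	for (;;) {
		/* Only a caller that sees THROTTLED set and QUEUED clear is
		 * allowed to queue the socket; everyone else backs off.
		 */
		if (!(oval & F_THROTTLED) || (oval & F_QUEUED))
			return;

		nval = (oval & ~F_THROTTLED) | F_QUEUED | F_DEFERRED;
		if (atomic_compare_exchange_weak(&sk_flags, &oval, nval)) {
			queue_for_tasklet();
			return;
		}
		/* oval was reloaded by the failed compare-exchange; retry. */
	}
}

int main(void)
{
	wfree_like_transition();	/* queues once */
	wfree_like_transition();	/* sees QUEUED already set and bails */
	return 0;
}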
@@ -955,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	 */
 	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
 
+	/* If we had to use memory reserve to allocate this skb,
+	 * this might cause drops if packet is looped back :
+	 * Other socket might not have SOCK_MEMALLOC.
+	 * Packets not looped back do not care about pfmemalloc.
+	 */
+	skb->pfmemalloc = 0;
+
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
 
@@ -964,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
+	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
@@ -1027,7 +1048,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
 	/* Our usage of tstamp should remain private */
-	skb->tstamp.tv64 = 0;
+	skb->tstamp = 0;
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1514,6 +1535,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 		if (sysctl_tcp_slow_start_after_idle &&
 		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
 			tcp_cwnd_application_limited(sk);
+
+		/* The following conditions together indicate the starvation
+		 * is caused by insufficient sender buffer:
+		 * 1) just sent some data (see tcp_write_xmit)
+		 * 2) not cwnd limited (this else condition)
+		 * 3) no more data to send (null tcp_send_head )
+		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
+		 */
+		if (!tcp_send_head(sk) && sk->sk_socket &&
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
 }
 
@@ -1910,26 +1943,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
  */
 static int tcp_mtu_probe(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb, *nskb, *next;
 	struct net *net = sock_net(sk);
-	int len;
 	int probe_size;
 	int size_needed;
-	int copy;
+	int copy, len;
 	int mss_now;
 	int interval;
 
 	/* Not currently probing/verifying,
 	 * not in recovery,
 	 * have enough cwnd, and
-	 * not SACKing (the variable headers throw things off) */
-	if (!icsk->icsk_mtup.enabled ||
-	    icsk->icsk_mtup.probe_size ||
-	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
-	    tp->snd_cwnd < 11 ||
-	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+	 * not SACKing (the variable headers throw things off)
+	 */
+	if (likely(!icsk->icsk_mtup.enabled ||
+		   icsk->icsk_mtup.probe_size ||
+		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+		   tp->snd_cwnd < 11 ||
+		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
 		return -1;
 
 	/* Use binary search for probe_size between tcp_mss_base,
@@ -2069,7 +2102,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit <<= factor;
 
 	if (atomic_read(&sk->sk_wmem_alloc) > limit) {
-		set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+		/* Always send the 1st or 2nd skb in write queue.
+		 * No need to wait for TX completion to call us back,
+		 * after softirq/tasklet schedule.
+		 * This helps when TX completions are delayed too much.
+		 */
+		if (skb == sk->sk_write_queue.next ||
+		    skb->prev == sk->sk_write_queue.next)
+			return false;
+
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED, so we must
 		 * test again the condition.
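The new early return above lets the first or second skb in the write queue bypass the TSQ byte limit, so a flow whose TX completions are arriving late still keeps a couple of packets moving. A toy C sketch of that decision, under the stated assumption of a simple singly linked packet queue (struct and field names are made up for illustration, not the kernel's sk_buff list):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct pkt {
	struct pkt *next;
	size_t len;
};

struct queue {
	struct pkt *head;	/* oldest unsent packet */
	size_t inflight_bytes;	/* bytes already handed to the NIC */
};

/* Return true when transmission should be deferred until completions free
 * up space; the first two queued packets are always allowed out.
 */
static bool small_queue_throttle(const struct queue *q, const struct pkt *pkt,
				 size_t limit)
{
	if (q->inflight_bytes <= limit)
		return false;
	if (pkt == q->head || (q->head && pkt == q->head->next))
		return false;	/* 1st or 2nd packet: send anyway */
	return true;
}

int main(void)
{
	struct pkt p3 = { NULL, 1500 }, p2 = { &p3, 1500 }, p1 = { &p2, 1500 };
	struct queue q = { &p1, 1 << 20 };	/* far above a 128KB limit */

	printf("throttle p1: %d\n", small_queue_throttle(&q, &p1, 128 * 1024)); /* 0 */
	printf("throttle p2: %d\n", small_queue_throttle(&q, &p2, 128 * 1024)); /* 0 */
	printf("throttle p3: %d\n", small_queue_throttle(&q, &p3, 128 * 1024)); /* 1 */
	return 0;
}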
@@ -2081,6 +2123,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	return false;
 }
 
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+	const u32 now = tcp_time_stamp;
+
+	if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+		tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+	tp->chrono_start = now;
+	tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If there are multiple conditions worthy of tracking in a
+	 * chronograph then the highest priority enum takes precedence
+	 * over the other conditions. So that if something "more interesting"
+	 * starts happening, stop the previous chrono and start a new one.
+	 */
+	if (type > tp->chrono_type)
+		tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+
+	/* There are multiple conditions worthy of tracking in a
+	 * chronograph, so that the highest priority enum takes
+	 * precedence over the other conditions (see tcp_chrono_start).
+	 * If a condition stops, we only stop chrono tracking if
+	 * it's the "most interesting" or current chrono we are
+	 * tracking and starts busy chrono if we have pending data.
+	 */
+	if (tcp_write_queue_empty(sk))
+		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+	else if (type == tp->chrono_type)
+		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
 /* This routine writes packets to the network. It advances the
  * send_head. This happens as incoming acks open up the remote
  * window for us.
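The chronograph helpers added in this hunk accumulate, per socket, how long sending was blocked on each cause, with a higher-priority cause preempting the one currently being timed and tcp_chrono_stop() falling back to the busy chrono while data is still queued. A minimal userspace model of that bookkeeping (the enum names, millisecond clock, and *_cause() wrappers are illustrative, not the kernel API):

#include <stdio.h>
#include <time.h>

enum chrono { CHRONO_UNSPEC, CHRONO_BUSY, CHRONO_RWND_LIMITED,
	      CHRONO_SNDBUF_LIMITED, CHRONO_MAX };

static unsigned long chrono_stat[CHRONO_MAX - 1];
static unsigned long chrono_start;
static enum chrono chrono_type;

static unsigned long now_ms(void)	/* stand-in for tcp_time_stamp */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

static void chrono_set(enum chrono new)
{
	unsigned long now = now_ms();

	/* Close out the previous period before opening the new one. */
	if (chrono_type > CHRONO_UNSPEC)
		chrono_stat[chrono_type - 1] += now - chrono_start;
	chrono_start = now;
	chrono_type = new;
}

static void chrono_start_cause(enum chrono type)
{
	/* A higher-priority cause preempts whatever is being timed now. */
	if (type > chrono_type)
		chrono_set(type);
}

static void chrono_stop_cause(enum chrono type, int queue_empty)
{
	/* Only the currently tracked cause can be stopped; fall back to
	 * BUSY while data is still queued, else stop timing entirely.
	 */
	if (queue_empty)
		chrono_set(CHRONO_UNSPEC);
	else if (type == chrono_type)
		chrono_set(CHRONO_BUSY);
}

int main(void)
{
	chrono_start_cause(CHRONO_BUSY);
	chrono_start_cause(CHRONO_RWND_LIMITED);
	chrono_stop_cause(CHRONO_RWND_LIMITED, 0);	/* back to BUSY */
	chrono_stop_cause(CHRONO_BUSY, 1);		/* queue drained */
	printf("busy=%lums rwnd=%lums sndbuf=%lums\n",
	       chrono_stat[0], chrono_stat[1], chrono_stat[2]);
	return 0;
}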
@@ -2103,7 +2186,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
-	bool is_cwnd_limited = false;
+	bool is_cwnd_limited = false, is_rwnd_limited = false;
 	u32 max_segs;
 
 	sent_pkts = 0;
@@ -2140,8 +2223,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+			is_rwnd_limited = true;
 			break;
+		}
 
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2167,6 +2252,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
+		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
@@ -2186,6 +2273,11 @@ repair:
 			break;
 	}
 
+	if (is_rwnd_limited)
+		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+	else
+		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
 	if (likely(sent_pkts)) {
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += sent_pkts;
@@ -2207,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
-		return false;
 	/* No consecutive loss probes. */
 	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
 		tcp_rearm_rto(sk);
@@ -2227,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
-	if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
-	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+	    !tp->packets_out || !tcp_is_sack(tp) ||
+	    icsk->icsk_ca_state != TCP_CA_Open)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2436,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk)
 	int full_space = min_t(int, tp->window_clamp, allowed_space);
 	int window;
 
-	if (mss > full_space)
+	if (unlikely(mss > full_space)) {
 		mss = full_space;
-
+		if (mss <= 0)
+			return 0;
+	}
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
@@ -2514,7 +2607,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 }
 
 /* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2525,13 +2618,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
 	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
 
+	if (next_skb_size) {
+		if (next_skb_size <= skb_availroom(skb))
+			skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+				      next_skb_size);
+		else if (!skb_shift(skb, next_skb, next_skb_size))
+			return false;
+	}
 	tcp_highest_sack_combine(sk, next_skb, skb);
 
 	tcp_unlink_write_queue(next_skb, sk);
 
-	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
-				  next_skb_size);
-
 	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
 		skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2560,6 +2657,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	tcp_skb_collapse_tstamp(skb, next_skb);
 
 	sk_wmem_free_skb(sk, next_skb);
+	return true;
 }
 
 /* Check if coalescing SKBs is legal. */
@@ -2567,14 +2665,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
 {
 	if (tcp_skb_pcount(skb) > 1)
 		return false;
-	/* TODO: SACK collapsing could be used to remove this condition */
-	if (skb_shinfo(skb)->nr_frags != 0)
-		return false;
 	if (skb_cloned(skb))
 		return false;
 	if (skb == tcp_send_head(sk))
 		return false;
-	/* Some heurestics for collapsing over SACK'd could be invented */
+	/* Some heuristics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 		return false;
 
@@ -2612,16 +2707,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 
 		if (space < 0)
 			break;
-		/* Punt if not enough space exists in the first SKB for
-		 * the data in the second
-		 */
-		if (skb->len > skb_availroom(to))
-			break;
 
 		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
 			break;
 
-		tcp_collapse_retrans(sk, to);
+		if (!tcp_collapse_retrans(sk, to))
+			break;
 	}
 }
 
@@ -2694,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
+	/* Update global and local TCP statistics. */
+	segs = tcp_skb_pcount(skb);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	tp->total_retrans += segs;
+
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2711,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	}
 
 	if (likely(!err)) {
-		segs = tcp_skb_pcount(skb);
-
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-		/* Update global TCP statistics. */
-		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
-		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans += segs;
+	} else if (err != -EBUSY) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 	return err;
 }
@@ -2741,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
-	} else if (err != -EBUSY) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 
 	if (tp->undo_retrans < 0)
@@ -2751,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	return err;
 }
 
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* Forward retransmissions are possible only during Recovery. */
-	if (icsk->icsk_ca_state != TCP_CA_Recovery)
-		return false;
-
-	/* No forward retransmissions in Reno are possible. */
-	if (tcp_is_reno(tp))
-		return false;
-
-	/* Yeah, we have to make difficult choice between forward transmission
-	 * and retransmission... Both ways have their merits...
-	 *
-	 * For now we do not retransmit anything, while we have some new
-	 * segments to send. In the other cases, follow rule 3 for
-	 * NextSeg() specified in RFC3517.
-	 */
-
-	if (tcp_may_send_now(sk))
-		return false;
-
-	return true;
-}
-
 /* This gets called after a retransmit timeout, and the initially
  * retransmitted data is acknowledged. It tries to continue
  * resending the rest of the retransmit queue, until either
@@ -2795,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	struct sk_buff *hole = NULL;
-	u32 max_segs, last_lost;
+	u32 max_segs;
 	int mib_idx;
-	int fwd_rexmitting = 0;
 
 	if (!tp->packets_out)
 		return;
 
-	if (!tp->lost_out)
-		tp->retransmit_high = tp->snd_una;
-
 	if (tp->retransmit_skb_hint) {
 		skb = tp->retransmit_skb_hint;
-		last_lost = TCP_SKB_CB(skb)->end_seq;
-		if (after(last_lost, tp->retransmit_high))
-			last_lost = tp->retransmit_high;
 	} else {
 		skb = tcp_write_queue_head(sk);
-		last_lost = tp->snd_una;
 	}
 
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2835,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		 */
 		segs = min_t(int, segs, max_segs);
 
-		if (fwd_rexmitting) {
-begin_fwd:
-			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-				break;
-			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
-		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
-			tp->retransmit_high = last_lost;
-			if (!tcp_can_forward_retransmit(sk))
-				break;
-			/* Backtrack if necessary to non-L'ed skb */
-			if (hole) {
-				skb = hole;
-				hole = NULL;
-			}
-			fwd_rexmitting = 1;
-			goto begin_fwd;
-
+		if (tp->retrans_out >= tp->lost_out) {
+			break;
 		} else if (!(sacked & TCPCB_LOST)) {
 			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
 				hole = skb;
 			continue;
 
 		} else {
-			last_lost = TCP_SKB_CB(skb)->end_seq;
 			if (icsk->icsk_ca_state != TCP_CA_Loss)
 				mib_idx = LINUX_MIB_TCPFASTRETRANS;
 			else
@@ -2880,7 +2916,8 @@ begin_fwd:
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
-		if (skb == tcp_write_queue_head(sk))
+		if (skb == tcp_write_queue_head(sk) &&
+		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
 						  TCP_RTO_MAX);
@@ -2962,6 +2999,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
 	struct sk_buff *skb;
 
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
 	/* NOTE: No TCP options attached and we never retransmit this. */
 	skb = alloc_skb(MAX_TCP_HEADER, priority);
 	if (!skb) {
@@ -2977,8 +3016,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	/* Send it off. */
 	if (tcp_transmit_skb(sk, skb, 0, priority))
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
-
-	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
@@ -3037,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	struct sk_buff *skb;
 	int tcp_header_size;
 	struct tcphdr *th;
-	u16 user_mss;
 	int mss;
 
 	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3067,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	}
 	skb_dst_set(skb, dst);
 
-	mss = dst_metric_advmss(dst);
-	user_mss = READ_ONCE(tp->rx_opt.user_mss);
-	if (user_mss && user_mss < mss)
-		mss = user_mss;
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
@@ -3123,7 +3156,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 #endif
 
 	/* Do not fool tcpdump (if any), clean our debris */
-	skb->tstamp.tv64 = 0;
+	skb->tstamp = 0;
 	return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
@@ -3176,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
-	tp->advmss = dst_metric_advmss(dst);
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
-		tp->advmss = tp->rx_opt.user_mss;
+	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
 
@@ -3244,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
-	unsigned long last_syn_loss = 0;
+	int space, err = 0;
 	struct sk_buff *syn_data;
 
 	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
-			       &syn_loss, &last_syn_loss);
-	/* Recurring FO SYN losses: revert to regular handshake temporarily */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		fo->cookie.len = -1;
-		goto fallback;
-	}
-
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
-		fo->cookie.len = -1;
-	else if (fo->cookie.len <= 0)
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
 	 * user-MSS. Reserve maximum option space for middleboxes that add
 	 * private TCP options. The cost is reduced data space in SYN :(
 	 */
-	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
-		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
 	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
 		MAX_TCP_OPTION_SPACE;
 
@@ -3300,6 +3319,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	fo->copied = space;
 
 	tcp_connect_queue_skb(sk, syn_data);
+	if (syn_data->len)
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
@@ -3464,8 +3485,6 @@ void tcp_send_ack(struct sock *sk)
 	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
 	 * too much.
 	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
-	 * We also avoid tcp_wfree() overhead (cache line miss accessing
-	 * tp->tsq_flags) by using regular sock_wfree()
 	 */
 	skb_set_tcp_pure_ack(buff);
 