author		Eric Dumazet <edumazet@google.com>	2017-10-06 01:21:27 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-06 19:28:54 -0400
commit		75c119afe14f74b4dd967d75ed9f57ab6c0ef045
tree		a9e03880b4f700a0f45026f06262a916d42f7e5e	/net/ipv4/tcp_output.c
parent		f33198163a0fbb03766444253edf6ea50685d725
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is the order of 100,000 : Modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

40 ns per cache line miss means a full scan can use 4 ms,
blowing away CPU caches.

SACK processing often can use various hints to avoid parsing
whole retransmit queue. But with high packet losses and/or
high reordering, hints no longer work.

Sender has to process thousands of unfriendly SACK, accumulating
a huge socket backlog, burning a cpu and massively dropping packets.

Using an rb-tree for retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time
to be more resistant and say no to quadratic behavior.

1) RTX queue is no longer part of the write queue : already sent skbs
are stored in one rb-tree.

2) Since reaching the head of write queue no longer needs
sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
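Note that this page is limited to net/ipv4/tcp_output.c; the rb-tree plumbing the
hunks below rely on (the sk_send_head/tcp_rtx_queue union in struct sock,
enum tcp_queue, tcp_rbtree_insert(), tcp_rtx_queue_head() and friends) is added
by the same patch in include/net/sock.h, include/net/tcp.h and
net/ipv4/tcp_input.c. A minimal sketch of that layout, keyed by each skb's
starting sequence number (simplified reconstruction for reference, not the
exact patch text):

/* Sketch only -- the real definitions live in include/net/sock.h,
 * include/net/tcp.h and net/ipv4/tcp_input.c of this series.
 */
#include <linux/rbtree.h>
#include <net/tcp.h>

/* struct sock now carries, in place of a lone send-head pointer:
 *
 *	union {
 *		struct sk_buff	*sk_send_head;
 *		struct rb_root	tcp_rtx_queue;
 *	};
 *
 * and tcp_fragment()/tso_fragment() callers pass one of:
 *
 *	enum tcp_queue {
 *		TCP_FRAG_IN_WRITE_QUEUE,
 *		TCP_FRAG_IN_RTX_QUEUE,
 *	};
 */

/* Mirror of what tcp_rbtree_insert() does: link an skb into the rtx
 * rb-tree ordered by its starting sequence number, so a lookup costs
 * O(log N) instead of a linear list scan.
 */
static void tcp_rtx_insert_sketch(struct rb_root *root, struct sk_buff *skb)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct sk_buff *skb1;

        while (*p) {
                parent = *p;
                skb1 = rb_entry(parent, struct sk_buff, rbnode);
                if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;
        }
        rb_link_node(&skb->rbnode, parent, p);
        rb_insert_color(&skb->rbnode, root);
}

With this ordering, SACK processing and retransmission can seek to an arbitrary
sequence number in O(log N) rather than scanning a list, which is what makes the
100,000-skb case above tractable.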
Diffstat (limited to 'net/ipv4/tcp_output.c')
 net/ipv4/tcp_output.c | 137 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 74 insertions(+), 63 deletions(-)
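To make the hunks easier to read: the rb-tree walk helpers that replace the old
tcp_for_write_queue_*() macros boil down to plain rbtree iteration over
skb->rbnode. Approximately (reconstructed from the include/net/tcp.h side of the
series; treat the exact bodies as illustrative):

#define rb_to_skb(__node)	rb_entry_safe(__node, struct sk_buff, rbnode)
#define skb_rb_first(root)	rb_to_skb(rb_first(root))
#define skb_rb_last(root)	rb_to_skb(rb_last(root))
#define skb_rb_next(skb)	rb_to_skb(rb_next(&(skb)->rbnode))

/* walk from 'skb' toward higher sequence numbers */
#define skb_rbtree_walk_from(skb)	\
	for (; (skb) != NULL; (skb) = skb_rb_next(skb))

/* same, but safe if the loop body unlinks or frees 'skb' */
#define skb_rbtree_walk_from_safe(skb, tmp)				\
	for (; tmp = (skb) ? skb_rb_next(skb) : NULL, ((skb) != NULL);	\
	     (skb) = (tmp))

/* lowest-sequence skb still in flight: the new "head" of the rtx queue */
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
	return skb_rb_first(&sk->tcp_rtx_queue);
}

Retransmit code that used to stop at tcp_send_head(sk) no longer needs that
check: the walk is confined to the rtx tree, and the write queue keeps only
never-sent skbs.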
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8162e2880178..696b0a168f16 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                            int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         unsigned int prior_packets = tp->packets_out;
 
-        tcp_advance_send_head(sk, skb);
         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
+        __skb_unlink(skb, &sk->sk_write_queue);
+        tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
         tp->packets_out += tcp_skb_pcount(skb);
         if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
         TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+                                         struct sk_buff *buff,
+                                         struct sock *sk,
+                                         enum tcp_queue tcp_queue)
+{
+        if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+                __skb_queue_after(&sk->sk_write_queue, skb, buff);
+        else
+                tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                 struct sk_buff *skb, u32 len,
                  unsigned int mss_now, gfp_t gfp)
 {
         struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
         /* Link BUFF into the send queue. */
         __skb_header_release(buff);
-        tcp_insert_write_queue_after(skb, buff, sk);
+        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
         list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
         return 0;
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
          * is caused by insufficient sender buffer:
          * 1) just sent some data (see tcp_write_xmit)
          * 2) not cwnd limited (this else condition)
-         * 3) no more data to send (null tcp_send_head )
+         * 3) no more data to send (tcp_write_queue_empty())
          * 4) application is hitting buffer limit (SOCK_NOSPACE)
          */
-        if (!tcp_send_head(sk) && sk->sk_socket &&
+        if (tcp_write_queue_empty(sk) && sk->sk_socket &&
             test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
             (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                        struct sk_buff *skb, unsigned int len,
                         unsigned int mss_now, gfp_t gfp)
 {
         struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
         /* All of a TSO frame must be composed of paged data. */
         if (skb->len != skb->data_len)
-                return tcp_fragment(sk, skb, len, mss_now, gfp);
+                return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
         buff = sk_stream_alloc_skb(sk, 0, gfp, true);
         if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
         /* Link BUFF into the send queue. */
         __skb_header_release(buff);
-        tcp_insert_write_queue_after(skb, buff, sk);
+        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
         return 0;
 }
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                 goto send_now;
         }
 
-        head = tcp_write_queue_head(sk);
-
+        /* TODO : use tsorted_sent_queue ? */
+        head = tcp_rtx_queue_head(sk);
+        if (!head)
+                goto send_now;
         age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
         /* If next ACK is likely to come too late (half srtt), do not defer */
         if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
         limit <<= factor;
 
         if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-                /* Always send the 1st or 2nd skb in write queue.
+                /* Always send skb if rtx queue is empty.
                  * No need to wait for TX completion to call us back,
                  * after softirq/tasklet schedule.
                  * This helps when TX completions are delayed too much.
                  */
-                if (skb == sk->sk_write_queue.next ||
-                    skb->prev == sk->sk_write_queue.next)
+                if (tcp_rtx_queue_empty(sk))
                         return false;
 
                 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
          * it's the "most interesting" or current chrono we are
          * tracking and starts busy chrono if we have pending data.
          */
-        if (tcp_write_queue_empty(sk))
+        if (tcp_rtx_and_write_queues_empty(sk))
                 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
         else if (type == tp->chrono_type)
                 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                           nonagle);
 
                 if (skb->len > limit &&
-                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                    unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                          skb, limit, mss_now, gfp)))
                         break;
 
                 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ repair:
                 tcp_cwnd_validate(sk, is_cwnd_limited);
                 return false;
         }
-        return !tp->packets_out && tcp_send_head(sk);
+        return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
                 return false;
 
         if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-             tcp_send_head(sk))
+             !tcp_write_queue_empty(sk))
                 return false;
 
         /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
         int mss = tcp_current_mss(sk);
 
         skb = tcp_send_head(sk);
-        if (skb) {
-                if (tcp_snd_wnd_test(tp, skb, mss)) {
-                        pcount = tp->packets_out;
-                        tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-                        if (tp->packets_out > pcount)
-                                goto probe_sent;
-                        goto rearm_timer;
-                }
-                skb = tcp_write_queue_prev(sk, skb);
-        } else {
-                skb = tcp_write_queue_tail(sk);
+        if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+                pcount = tp->packets_out;
+                tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+                if (tp->packets_out > pcount)
+                        goto probe_sent;
+                goto rearm_timer;
         }
+        skb = skb_rb_last(&sk->tcp_rtx_queue);
 
         /* At most one outstanding TLP retransmission. */
         if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
                 goto rearm_timer;
 
         if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-                if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+                if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                          (pcount - 1) * mss, mss,
                                           GFP_ATOMIC)))
                         goto rearm_timer;
-                skb = tcp_write_queue_next(sk, skb);
+                skb = skb_rb_next(skb);
         }
 
         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+        struct sk_buff *next_skb = skb_rb_next(skb);
         int skb_size, next_skb_size;
 
         skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
         }
         tcp_highest_sack_combine(sk, next_skb, skb);
 
-        tcp_unlink_write_queue(next_skb, sk);
-
         if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                 skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
         tcp_skb_collapse_tstamp(skb, next_skb);
 
-        sk_wmem_free_skb(sk, next_skb);
+        tcp_rtx_queue_unlink_and_free(next_skb, sk);
         return true;
 }
 
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
                 return false;
         if (skb_cloned(skb))
                 return false;
-        if (skb == tcp_send_head(sk))
-                return false;
         /* Some heuristics for collapsing over SACK'd could be invented */
         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                 return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                 return;
 
-        tcp_for_write_queue_from_safe(skb, tmp, sk) {
+        skb_rbtree_walk_from_safe(skb, tmp) {
                 if (!tcp_can_collapse(sk, skb))
                         break;
 
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
         len = cur_mss * segs;
         if (skb->len > len) {
-                if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+                if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+                                 cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
                 if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
         const struct inet_connection_sock *icsk = inet_csk(sk);
+        struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb;
-        struct sk_buff *hole = NULL;
         u32 max_segs;
         int mib_idx;
 
         if (!tp->packets_out)
                 return;
 
-        if (tp->retransmit_skb_hint) {
-                skb = tp->retransmit_skb_hint;
-        } else {
-                skb = tcp_write_queue_head(sk);
+        skb = tp->retransmit_skb_hint;
+        if (!skb) {
+                rtx_head = tcp_rtx_queue_head(sk);
+                skb = rtx_head;
         }
-
         max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-        tcp_for_write_queue_from(skb, sk) {
+        skb_rbtree_walk_from(skb) {
                 __u8 sacked;
                 int segs;
 
-                if (skb == tcp_send_head(sk))
-                        break;
-
                 if (tcp_pacing_check(sk))
                         break;
 
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (tcp_in_cwnd_reduction(sk))
                         tp->prr_out += tcp_skb_pcount(skb);
 
-                if (skb == tcp_write_queue_head(sk) &&
+                if (skb == rtx_head &&
                     icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                   inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
          * Note: in the latter case, FIN packet will be sent after a timeout,
          * as TCP stack thinks it has already been transmitted.
          */
-        if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+        if (!tskb && tcp_under_memory_pressure(sk))
+                tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+        if (tskb) {
 coalesce:
                 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
                 TCP_SKB_CB(tskb)->end_seq++;
                 tp->write_seq++;
-                if (!tcp_send_head(sk)) {
+                if (tcp_write_queue_empty(sk)) {
                         /* This means tskb was already sent.
                          * Pretend we included the FIN on previous transmit.
                          * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
 {
         struct sk_buff *skb;
 
-        skb = tcp_write_queue_head(sk);
+        skb = tcp_rtx_queue_head(sk);
         if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-                pr_debug("%s: wrong queue state\n", __func__);
+                pr_err("%s: wrong queue state\n", __func__);
                 return -EFAULT;
         }
         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
                 if (!nskb)
                         return -ENOMEM;
                 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-                tcp_unlink_write_queue(skb, sk);
+                tcp_rtx_queue_unlink_and_free(skb, sk);
                 __skb_header_release(nskb);
-                __tcp_add_write_queue_head(sk, nskb);
-                sk_wmem_free_skb(sk, skb);
+                tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                 sk->sk_wmem_queued += nskb->truesize;
                 sk_mem_charge(sk, nskb->truesize);
                 skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 
         tcb->end_seq += skb->len;
         __skb_header_release(skb);
-        __tcp_add_write_queue_tail(sk, skb);
         sk->sk_wmem_queued += skb->truesize;
         sk_mem_charge(sk, skb->truesize);
         tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
         if (!err) {
                 tp->syn_data = (fo->copied > 0);
+                tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                 goto done;
         }
 
-        /* data was not sent, this is our new send_head */
-        sk->sk_send_head = syn_data;
+        /* data was not sent, put it in write_queue */
+        __skb_queue_tail(&sk->sk_write_queue, syn_data);
         tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
         tp->retrans_stamp = tcp_time_stamp(tp);
         tcp_connect_queue_skb(sk, buff);
         tcp_ecn_send_syn(sk, buff);
+        tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
         /* Send off SYN; include data in Fast Open. */
         err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
                     skb->len > mss) {
                         seg_size = min(seg_size, mss);
                         TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                        if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+                        if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                         skb, seg_size, mss, GFP_ATOMIC))
                                 return -1;
                 } else if (!tcp_skb_pcount(skb))
                         tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
 
         err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-        if (tp->packets_out || !tcp_send_head(sk)) {
+        if (tp->packets_out || tcp_write_queue_empty(sk)) {
                 /* Cancel probe timer, if it is not required. */
                 icsk->icsk_probes_out = 0;
                 icsk->icsk_backoff = 0;