author      Eric Dumazet <edumazet@google.com>       2017-10-06 01:21:27 -0400
committer   David S. Miller <davem@davemloft.net>    2017-10-06 19:28:54 -0400
commit      75c119afe14f74b4dd967d75ed9f57ab6c0ef045 (patch)
tree        a9e03880b4f700a0f45026f06262a916d42f7e5e
parent      f33198163a0fbb03766444253edf6ea50685d725 (diff)
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in the write queue has been okay
for quite a while: O(N) is not too bad when N < 500.

Things get messy when N is on the order of 100,000: modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

At 40 ns per cache line miss, a full scan can take 4 ms, blowing away
CPU caches.

SACK processing can often use various hints to avoid parsing the whole
retransmit queue. But with high packet losses and/or high reordering,
the hints no longer work.

The sender then has to process thousands of unfriendly SACKs,
accumulating a huge socket backlog, burning a cpu and massively
dropping packets.

Using an rb-tree for the retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time to be
more resistant and say no to quadratic behavior.

1) The RTX queue is no longer part of the write queue: already-sent
   skbs are stored in one rb-tree.

2) Since reaching the head of the write queue no longer needs
   sk->sk_send_head, we add a union of sk_send_head and tcp_rtx_queue.

Tested:

On receiver:
  netem on ingress: delay 150ms 200us loss 1
  GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch:
323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88
sum: 2840

After patch:
1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94
sum: 18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
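[Editorial illustration, not part of the patch.] The retransmit queue keeps
skbs ordered by TCP_SKB_CB(skb)->seq, inserted with tcp_rbtree_insert() and
located by sequence number with tcp_sacktag_bsearch() (both visible in the
diff below), so finding the skb that covers a SACKed sequence costs O(log N)
instead of a linear walk. The userspace model below mirrors that ordering and
descent logic; the rtx_skb, rtx_insert and rtx_lookup names are invented for
the example, and a plain unbalanced BST stands in for the kernel's balanced
rb-tree.

/* Minimal userspace sketch of the seq-ordered insert and lookup.
 * Not kernel code: a simple BST replaces the kernel rb-tree; only the
 * ordering and descent logic are the point.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Sequence-space comparison, same semantics as the kernel's before(). */
static bool before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

struct rtx_skb {
	uint32_t seq;       /* first sequence number carried */
	uint32_t end_seq;   /* one past the last byte        */
	struct rtx_skb *left, *right;
};

/* Insert ordered by seq, mirroring what tcp_rbtree_insert() does. */
static void rtx_insert(struct rtx_skb **root, struct rtx_skb *skb)
{
	struct rtx_skb **p = root;

	while (*p) {
		if (before(skb->seq, (*p)->seq))
			p = &(*p)->left;
		else
			p = &(*p)->right;
	}
	skb->left = skb->right = NULL;
	*p = skb;
}

/* Descend to the skb covering @seq, like tcp_sacktag_bsearch():
 * O(log N) instead of walking the whole queue from the head.
 */
static struct rtx_skb *rtx_lookup(struct rtx_skb *root, uint32_t seq)
{
	while (root) {
		if (before(seq, root->seq))
			root = root->left;
		else if (!before(seq, root->end_seq))
			root = root->right;
		else
			return root;            /* seq in [seq, end_seq) */
	}
	return NULL;
}

int main(void)
{
	struct rtx_skb *root = NULL;
	struct rtx_skb segs[4] = {
		{ .seq = 1000, .end_seq = 2000 },
		{ .seq = 2000, .end_seq = 3000 },
		{ .seq = 3000, .end_seq = 4000 },
		{ .seq = 4000, .end_seq = 5000 },
	};

	for (int i = 0; i < 4; i++)
		rtx_insert(&root, &segs[i]);

	struct rtx_skb *hit = rtx_lookup(root, 3500);
	if (hit)
		printf("seq 3500 is covered by segment [%u, %u)\n",
		       (unsigned)hit->seq, (unsigned)hit->end_seq);
	return 0;
}

Built with any C compiler, this prints the segment covering sequence 3500; in
the kernel the same descent is what lets SACK processing jump straight to the
relevant skb instead of scanning thousands of list entries.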
-rw-r--r--   include/net/sock.h        7
-rw-r--r--   include/net/tcp.h        89
-rw-r--r--   net/ipv4/tcp.c           41
-rw-r--r--   net/ipv4/tcp_input.c    133
-rw-r--r--   net/ipv4/tcp_ipv4.c       2
-rw-r--r--   net/ipv4/tcp_output.c   137
-rw-r--r--   net/ipv4/tcp_timer.c     24
7 files changed, 245 insertions(+), 188 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index a6b9a8d1a6df..4827094f1db4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -60,7 +60,7 @@
60#include <linux/sched.h> 60#include <linux/sched.h>
61#include <linux/wait.h> 61#include <linux/wait.h>
62#include <linux/cgroup-defs.h> 62#include <linux/cgroup-defs.h>
63 63#include <linux/rbtree.h>
64#include <linux/filter.h> 64#include <linux/filter.h>
65#include <linux/rculist_nulls.h> 65#include <linux/rculist_nulls.h>
66#include <linux/poll.h> 66#include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
397 int sk_wmem_queued; 397 int sk_wmem_queued;
398 refcount_t sk_wmem_alloc; 398 refcount_t sk_wmem_alloc;
399 unsigned long sk_tsq_flags; 399 unsigned long sk_tsq_flags;
400 struct sk_buff *sk_send_head; 400 union {
401 struct sk_buff *sk_send_head;
402 struct rb_root tcp_rtx_queue;
403 };
401 struct sk_buff_head sk_write_queue; 404 struct sk_buff_head sk_write_queue;
402 __s32 sk_peek_off; 405 __s32 sk_peek_off;
403 int sk_write_pending; 406 int sk_write_pending;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 744559b72784..5a95e5886b55 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
551void tcp_simple_retransmit(struct sock *); 551void tcp_simple_retransmit(struct sock *);
552void tcp_enter_recovery(struct sock *sk, bool ece_ack); 552void tcp_enter_recovery(struct sock *sk, bool ece_ack);
553int tcp_trim_head(struct sock *, struct sk_buff *, u32); 553int tcp_trim_head(struct sock *, struct sk_buff *, u32);
554int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); 554enum tcp_queue {
555 TCP_FRAG_IN_WRITE_QUEUE,
556 TCP_FRAG_IN_RTX_QUEUE,
557};
558int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
559 struct sk_buff *skb, u32 len,
560 unsigned int mss_now, gfp_t gfp);
555 561
556void tcp_send_probe0(struct sock *); 562void tcp_send_probe0(struct sock *);
557void tcp_send_partial(struct sock *); 563void tcp_send_partial(struct sock *);
@@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
1608 1614
1609void tcp_write_queue_purge(struct sock *sk); 1615void tcp_write_queue_purge(struct sock *sk);
1610 1616
1617static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
1618{
1619 return skb_rb_first(&sk->tcp_rtx_queue);
1620}
1621
1611static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) 1622static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
1612{ 1623{
1613 return skb_peek(&sk->sk_write_queue); 1624 return skb_peek(&sk->sk_write_queue);
@@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
1630 return skb_queue_prev(&sk->sk_write_queue, skb); 1641 return skb_queue_prev(&sk->sk_write_queue, skb);
1631} 1642}
1632 1643
1633#define tcp_for_write_queue(skb, sk) \
1634 skb_queue_walk(&(sk)->sk_write_queue, skb)
1635
1636#define tcp_for_write_queue_from(skb, sk) \
1637 skb_queue_walk_from(&(sk)->sk_write_queue, skb)
1638
1639#define tcp_for_write_queue_from_safe(skb, tmp, sk) \ 1644#define tcp_for_write_queue_from_safe(skb, tmp, sk) \
1640 skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) 1645 skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
1641 1646
1642static inline struct sk_buff *tcp_send_head(const struct sock *sk) 1647static inline struct sk_buff *tcp_send_head(const struct sock *sk)
1643{ 1648{
1644 return sk->sk_send_head; 1649 return skb_peek(&sk->sk_write_queue);
1645} 1650}
1646 1651
1647static inline bool tcp_skb_is_last(const struct sock *sk, 1652static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
1650 return skb_queue_is_last(&sk->sk_write_queue, skb); 1655 return skb_queue_is_last(&sk->sk_write_queue, skb);
1651} 1656}
1652 1657
1653static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) 1658static inline bool tcp_write_queue_empty(const struct sock *sk)
1654{ 1659{
1655 if (tcp_skb_is_last(sk, skb)) 1660 return skb_queue_empty(&sk->sk_write_queue);
1656 sk->sk_send_head = NULL; 1661}
1657 else 1662
1658 sk->sk_send_head = tcp_write_queue_next(sk, skb); 1663static inline bool tcp_rtx_queue_empty(const struct sock *sk)
1664{
1665 return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
1666}
1667
1668static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
1669{
1670 return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
1659} 1671}
1660 1672
1661static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) 1673static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
1662{ 1674{
1663 if (sk->sk_send_head == skb_unlinked) { 1675 if (tcp_write_queue_empty(sk))
1664 sk->sk_send_head = NULL;
1665 tcp_chrono_stop(sk, TCP_CHRONO_BUSY); 1676 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
1666 } 1677
1667 if (tcp_sk(sk)->highest_sack == skb_unlinked) 1678 if (tcp_sk(sk)->highest_sack == skb_unlinked)
1668 tcp_sk(sk)->highest_sack = NULL; 1679 tcp_sk(sk)->highest_sack = NULL;
1669} 1680}
1670 1681
1671static inline void tcp_init_send_head(struct sock *sk)
1672{
1673 sk->sk_send_head = NULL;
1674}
1675
1676static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) 1682static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
1677{ 1683{
1678 __skb_queue_tail(&sk->sk_write_queue, skb); 1684 __skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
1683 __tcp_add_write_queue_tail(sk, skb); 1689 __tcp_add_write_queue_tail(sk, skb);
1684 1690
1685 /* Queue it, remembering where we must start sending. */ 1691 /* Queue it, remembering where we must start sending. */
1686 if (sk->sk_send_head == NULL) { 1692 if (sk->sk_write_queue.next == skb) {
1687 sk->sk_send_head = skb;
1688 tcp_chrono_start(sk, TCP_CHRONO_BUSY); 1693 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
1689 1694
1690 if (tcp_sk(sk)->highest_sack == NULL) 1695 if (tcp_sk(sk)->highest_sack == NULL)
@@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
1697 __skb_queue_head(&sk->sk_write_queue, skb); 1702 __skb_queue_head(&sk->sk_write_queue, skb);
1698} 1703}
1699 1704
1700/* Insert buff after skb on the write queue of sk. */
1701static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
1702 struct sk_buff *buff,
1703 struct sock *sk)
1704{
1705 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1706}
1707
1708/* Insert new before skb on the write queue of sk. */ 1705/* Insert new before skb on the write queue of sk. */
1709static inline void tcp_insert_write_queue_before(struct sk_buff *new, 1706static inline void tcp_insert_write_queue_before(struct sk_buff *new,
1710 struct sk_buff *skb, 1707 struct sk_buff *skb,
1711 struct sock *sk) 1708 struct sock *sk)
1712{ 1709{
1713 __skb_queue_before(&sk->sk_write_queue, skb, new); 1710 __skb_queue_before(&sk->sk_write_queue, skb, new);
1714
1715 if (sk->sk_send_head == skb)
1716 sk->sk_send_head = new;
1717} 1711}
1718 1712
1719static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) 1713static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
1720{ 1714{
1721 list_del(&skb->tcp_tsorted_anchor);
1722 tcp_skb_tsorted_anchor_cleanup(skb);
1723 __skb_unlink(skb, &sk->sk_write_queue); 1715 __skb_unlink(skb, &sk->sk_write_queue);
1724} 1716}
1725 1717
1726static inline bool tcp_write_queue_empty(struct sock *sk) 1718void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
1719
1720static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
1727{ 1721{
1728 return skb_queue_empty(&sk->sk_write_queue); 1722 tcp_skb_tsorted_anchor_cleanup(skb);
1723 rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
1724}
1725
1726static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
1727{
1728 list_del(&skb->tcp_tsorted_anchor);
1729 tcp_rtx_queue_unlink(skb, sk);
1730 sk_wmem_free_skb(sk, skb);
1729} 1731}
1730 1732
1731static inline void tcp_push_pending_frames(struct sock *sk) 1733static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
1754 1756
1755static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) 1757static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
1756{ 1758{
1757 tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL : 1759 struct sk_buff *next = skb_rb_next(skb);
1758 tcp_write_queue_next(sk, skb); 1760
1761 tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
1759} 1762}
1760 1763
1761static inline struct sk_buff *tcp_highest_sack(struct sock *sk) 1764static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
1765 1768
1766static inline void tcp_highest_sack_reset(struct sock *sk) 1769static inline void tcp_highest_sack_reset(struct sock *sk)
1767{ 1770{
1768 tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); 1771 struct sk_buff *skb = tcp_rtx_queue_head(sk);
1772
1773 tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
1769} 1774}
1770 1775
1771/* Called when old skb is about to be deleted (to be combined with new skb) */ 1776/* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
1935/* At how many usecs into the future should the RTO fire? */ 1940/* At how many usecs into the future should the RTO fire? */
1936static inline s64 tcp_rto_delta_us(const struct sock *sk) 1941static inline s64 tcp_rto_delta_us(const struct sock *sk)
1937{ 1942{
1938 const struct sk_buff *skb = tcp_write_queue_head(sk); 1943 const struct sk_buff *skb = tcp_rtx_queue_head(sk);
1939 u32 rto = inet_csk(sk)->icsk_rto; 1944 u32 rto = inet_csk(sk)->icsk_rto;
1940 u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); 1945 u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
1941 1946
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8d379c80936..3b34850d361f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
413 struct tcp_sock *tp = tcp_sk(sk); 413 struct tcp_sock *tp = tcp_sk(sk);
414 414
415 tp->out_of_order_queue = RB_ROOT; 415 tp->out_of_order_queue = RB_ROOT;
416 sk->tcp_rtx_queue = RB_ROOT;
416 tcp_init_xmit_timers(sk); 417 tcp_init_xmit_timers(sk);
417 INIT_LIST_HEAD(&tp->tsq_node); 418 INIT_LIST_HEAD(&tp->tsq_node);
418 INIT_LIST_HEAD(&tp->tsorted_sent_queue); 419 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
701 struct tcp_sock *tp = tcp_sk(sk); 702 struct tcp_sock *tp = tcp_sk(sk);
702 struct sk_buff *skb; 703 struct sk_buff *skb;
703 704
704 if (!tcp_send_head(sk))
705 return;
706
707 skb = tcp_write_queue_tail(sk); 705 skb = tcp_write_queue_tail(sk);
706 if (!skb)
707 return;
708 if (!(flags & MSG_MORE) || forced_push(tp)) 708 if (!(flags & MSG_MORE) || forced_push(tp))
709 tcp_mark_push(tp, skb); 709 tcp_mark_push(tp, skb);
710 710
@@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
964 int copy, i; 964 int copy, i;
965 bool can_coalesce; 965 bool can_coalesce;
966 966
967 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || 967 if (!skb || (copy = size_goal - skb->len) <= 0 ||
968 !tcp_skb_can_collapse_to(skb)) { 968 !tcp_skb_can_collapse_to(skb)) {
969new_segment: 969new_segment:
970 if (!sk_stream_memory_free(sk)) 970 if (!sk_stream_memory_free(sk))
971 goto wait_for_sndbuf; 971 goto wait_for_sndbuf;
972 972
973 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, 973 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
974 skb_queue_empty(&sk->sk_write_queue)); 974 tcp_rtx_and_write_queues_empty(sk));
975 if (!skb) 975 if (!skb)
976 goto wait_for_memory; 976 goto wait_for_memory;
977 977
@@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1199 goto out_err; 1199 goto out_err;
1200 } 1200 }
1201 1201
1202 skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; 1202 skb = tcp_write_queue_tail(sk);
1203 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); 1203 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1204 if (!uarg) { 1204 if (!uarg) {
1205 err = -ENOBUFS; 1205 err = -ENOBUFS;
@@ -1275,7 +1275,7 @@ restart:
1275 int max = size_goal; 1275 int max = size_goal;
1276 1276
1277 skb = tcp_write_queue_tail(sk); 1277 skb = tcp_write_queue_tail(sk);
1278 if (tcp_send_head(sk)) { 1278 if (skb) {
1279 if (skb->ip_summed == CHECKSUM_NONE) 1279 if (skb->ip_summed == CHECKSUM_NONE)
1280 max = mss_now; 1280 max = mss_now;
1281 copy = max - skb->len; 1281 copy = max - skb->len;
@@ -1295,7 +1295,7 @@ new_segment:
1295 process_backlog = false; 1295 process_backlog = false;
1296 goto restart; 1296 goto restart;
1297 } 1297 }
1298 first_skb = skb_queue_empty(&sk->sk_write_queue); 1298 first_skb = tcp_rtx_and_write_queues_empty(sk);
1299 skb = sk_stream_alloc_skb(sk, 1299 skb = sk_stream_alloc_skb(sk,
1300 select_size(sk, sg, first_skb), 1300 select_size(sk, sg, first_skb),
1301 sk->sk_allocation, 1301 sk->sk_allocation,
@@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1521 1521
1522 /* XXX -- need to support SO_PEEK_OFF */ 1522 /* XXX -- need to support SO_PEEK_OFF */
1523 1523
1524 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1525 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1526 if (err)
1527 return err;
1528 copied += skb->len;
1529 }
1530
1524 skb_queue_walk(&sk->sk_write_queue, skb) { 1531 skb_queue_walk(&sk->sk_write_queue, skb) {
1525 err = skb_copy_datagram_msg(skb, 0, msg, skb->len); 1532 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1526 if (err) 1533 if (err)
@@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state)
2320 TCPF_FIN_WAIT2 | TCPF_SYN_RECV); 2327 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2321} 2328}
2322 2329
2330static void tcp_rtx_queue_purge(struct sock *sk)
2331{
2332 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2333
2334 while (p) {
2335 struct sk_buff *skb = rb_to_skb(p);
2336
2337 p = rb_next(p);
2338 /* Since we are deleting whole queue, no need to
2339 * list_del(&skb->tcp_tsorted_anchor)
2340 */
2341 tcp_rtx_queue_unlink(skb, sk);
2342 sk_wmem_free_skb(sk, skb);
2343 }
2344}
2345
2323void tcp_write_queue_purge(struct sock *sk) 2346void tcp_write_queue_purge(struct sock *sk)
2324{ 2347{
2325 struct sk_buff *skb; 2348 struct sk_buff *skb;
@@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk)
2329 tcp_skb_tsorted_anchor_cleanup(skb); 2352 tcp_skb_tsorted_anchor_cleanup(skb);
2330 sk_wmem_free_skb(sk, skb); 2353 sk_wmem_free_skb(sk, skb);
2331 } 2354 }
2355 tcp_rtx_queue_purge(sk);
2332 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); 2356 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2333 sk_mem_reclaim(sk); 2357 sk_mem_reclaim(sk);
2334 tcp_clear_all_retrans_hints(tcp_sk(sk)); 2358 tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2392 * issue in __tcp_select_window() 2416 * issue in __tcp_select_window()
2393 */ 2417 */
2394 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; 2418 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2395 tcp_init_send_head(sk);
2396 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2419 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2397 __sk_dst_reset(sk); 2420 __sk_dst_reset(sk);
2398 dst_release(sk->sk_rx_dst); 2421 dst_release(sk->sk_rx_dst);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 72c4732ae2da..d0682ce2a5d6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1142,6 +1142,7 @@ struct tcp_sacktag_state {
1142 u64 last_sackt; 1142 u64 last_sackt;
1143 struct rate_sample *rate; 1143 struct rate_sample *rate;
1144 int flag; 1144 int flag;
1145 unsigned int mss_now;
1145}; 1146};
1146 1147
1147/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1148/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1191 if (pkt_len >= skb->len && !in_sack) 1192 if (pkt_len >= skb->len && !in_sack)
1192 return 0; 1193 return 0;
1193 1194
1194 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); 1195 err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
1196 pkt_len, mss, GFP_ATOMIC);
1195 if (err < 0) 1197 if (err < 0)
1196 return err; 1198 return err;
1197 } 1199 }
@@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1363 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) 1365 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1364 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; 1366 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
1365 1367
1366 tcp_unlink_write_queue(skb, sk); 1368 tcp_rtx_queue_unlink_and_free(skb, sk);
1367 sk_wmem_free_skb(sk, skb);
1368 1369
1369 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); 1370 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1370 1371
@@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1414 goto fallback; 1415 goto fallback;
1415 1416
1416 /* Can only happen with delayed DSACK + discard craziness */ 1417 /* Can only happen with delayed DSACK + discard craziness */
1417 if (unlikely(skb == tcp_write_queue_head(sk))) 1418 prev = skb_rb_prev(skb);
1419 if (!prev)
1418 goto fallback; 1420 goto fallback;
1419 prev = tcp_write_queue_prev(sk, skb);
1420 1421
1421 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) 1422 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1422 goto fallback; 1423 goto fallback;
@@ -1501,12 +1502,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1501 /* Hole filled allows collapsing with the next as well, this is very 1502 /* Hole filled allows collapsing with the next as well, this is very
1502 * useful when hole on every nth skb pattern happens 1503 * useful when hole on every nth skb pattern happens
1503 */ 1504 */
1504 if (prev == tcp_write_queue_tail(sk)) 1505 skb = skb_rb_next(prev);
1506 if (!skb)
1505 goto out; 1507 goto out;
1506 skb = tcp_write_queue_next(sk, prev);
1507 1508
1508 if (!skb_can_shift(skb) || 1509 if (!skb_can_shift(skb) ||
1509 (skb == tcp_send_head(sk)) ||
1510 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || 1510 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1511 (mss != tcp_skb_seglen(skb))) 1511 (mss != tcp_skb_seglen(skb)))
1512 goto out; 1512 goto out;
@@ -1539,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1539 struct tcp_sock *tp = tcp_sk(sk); 1539 struct tcp_sock *tp = tcp_sk(sk);
1540 struct sk_buff *tmp; 1540 struct sk_buff *tmp;
1541 1541
1542 tcp_for_write_queue_from(skb, sk) { 1542 skb_rbtree_walk_from(skb) {
1543 int in_sack = 0; 1543 int in_sack = 0;
1544 bool dup_sack = dup_sack_in; 1544 bool dup_sack = dup_sack_in;
1545 1545
1546 if (skb == tcp_send_head(sk))
1547 break;
1548
1549 /* queue is in-order => we can short-circuit the walk early */ 1546 /* queue is in-order => we can short-circuit the walk early */
1550 if (!before(TCP_SKB_CB(skb)->seq, end_seq)) 1547 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1551 break; 1548 break;
@@ -1607,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1607 return skb; 1604 return skb;
1608} 1605}
1609 1606
1610/* Avoid all extra work that is being done by sacktag while walking in 1607static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1611 * a normal way 1608 struct tcp_sacktag_state *state,
1612 */ 1609 u32 seq)
1610{
1611 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
1612 struct sk_buff *skb;
1613 int unack_bytes;
1614
1615 while (*p) {
1616 parent = *p;
1617 skb = rb_to_skb(parent);
1618 if (before(seq, TCP_SKB_CB(skb)->seq)) {
1619 p = &parent->rb_left;
1620 continue;
1621 }
1622 if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
1623 p = &parent->rb_right;
1624 continue;
1625 }
1626
1627 state->fack_count = 0;
1628 unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
1629 if (state->mss_now && unack_bytes > 0)
1630 state->fack_count = unack_bytes / state->mss_now;
1631
1632 return skb;
1633 }
1634 return NULL;
1635}
1636
1613static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, 1637static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1614 struct tcp_sacktag_state *state, 1638 struct tcp_sacktag_state *state,
1615 u32 skip_to_seq) 1639 u32 skip_to_seq)
1616{ 1640{
1617 tcp_for_write_queue_from(skb, sk) { 1641 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
1618 if (skb == tcp_send_head(sk)) 1642 return skb;
1619 break;
1620
1621 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1622 break;
1623 1643
1624 state->fack_count += tcp_skb_pcount(skb); 1644 return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1625 }
1626 return skb;
1627} 1645}
1628 1646
1629static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, 1647static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1745,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1745 } 1763 }
1746 } 1764 }
1747 1765
1748 skb = tcp_write_queue_head(sk); 1766 state->mss_now = tcp_current_mss(sk);
1749 state->fack_count = 0; 1767 state->fack_count = 0;
1768 skb = NULL;
1750 i = 0; 1769 i = 0;
1751 1770
1752 if (!tp->sacked_out) { 1771 if (!tp->sacked_out) {
@@ -1970,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk)
1970 if (tcp_is_reno(tp)) 1989 if (tcp_is_reno(tp))
1971 tcp_reset_reno_sack(tp); 1990 tcp_reset_reno_sack(tp);
1972 1991
1973 skb = tcp_write_queue_head(sk); 1992 skb = tcp_rtx_queue_head(sk);
1974 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); 1993 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1975 if (is_reneg) { 1994 if (is_reneg) {
1976 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); 1995 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
@@ -1979,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk)
1979 } 1998 }
1980 tcp_clear_all_retrans_hints(tp); 1999 tcp_clear_all_retrans_hints(tp);
1981 2000
1982 tcp_for_write_queue(skb, sk) { 2001 skb_rbtree_walk_from(skb) {
1983 if (skb == tcp_send_head(sk))
1984 break;
1985
1986 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2002 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
1987 is_reneg); 2003 is_reneg);
1988 if (mark_lost) 2004 if (mark_lost)
@@ -2215,13 +2231,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2215 return; 2231 return;
2216 cnt = tp->lost_cnt_hint; 2232 cnt = tp->lost_cnt_hint;
2217 } else { 2233 } else {
2218 skb = tcp_write_queue_head(sk); 2234 skb = tcp_rtx_queue_head(sk);
2219 cnt = 0; 2235 cnt = 0;
2220 } 2236 }
2221 2237
2222 tcp_for_write_queue_from(skb, sk) { 2238 skb_rbtree_walk_from(skb) {
2223 if (skb == tcp_send_head(sk))
2224 break;
2225 /* TODO: do this better */ 2239 /* TODO: do this better */
2226 /* this is not the most efficient way to do this... */ 2240 /* this is not the most efficient way to do this... */
2227 tp->lost_skb_hint = skb; 2241 tp->lost_skb_hint = skb;
@@ -2245,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2245 /* If needed, chop off the prefix to mark as lost. */ 2259 /* If needed, chop off the prefix to mark as lost. */
2246 lost = (packets - oldcnt) * mss; 2260 lost = (packets - oldcnt) * mss;
2247 if (lost < skb->len && 2261 if (lost < skb->len &&
2248 tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) 2262 tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2263 lost, mss, GFP_ATOMIC) < 0)
2249 break; 2264 break;
2250 cnt = packets; 2265 cnt = packets;
2251 } 2266 }
@@ -2329,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk)
2329 if (tp->retrans_out) 2344 if (tp->retrans_out)
2330 return true; 2345 return true;
2331 2346
2332 skb = tcp_write_queue_head(sk); 2347 skb = tcp_rtx_queue_head(sk);
2333 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 2348 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2334 return true; 2349 return true;
2335 2350
@@ -2370,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2370 if (unmark_loss) { 2385 if (unmark_loss) {
2371 struct sk_buff *skb; 2386 struct sk_buff *skb;
2372 2387
2373 tcp_for_write_queue(skb, sk) { 2388 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2374 if (skb == tcp_send_head(sk))
2375 break;
2376 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 2389 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2377 } 2390 }
2378 tp->lost_out = 0; 2391 tp->lost_out = 0;
@@ -2617,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk)
2617 unsigned int mss = tcp_current_mss(sk); 2630 unsigned int mss = tcp_current_mss(sk);
2618 u32 prior_lost = tp->lost_out; 2631 u32 prior_lost = tp->lost_out;
2619 2632
2620 tcp_for_write_queue(skb, sk) { 2633 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2621 if (skb == tcp_send_head(sk))
2622 break;
2623 if (tcp_skb_seglen(skb) > mss && 2634 if (tcp_skb_seglen(skb) > mss &&
2624 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 2635 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2625 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2636 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2713,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2713 * is updated in tcp_ack()). Otherwise fall back to 2724 * is updated in tcp_ack()). Otherwise fall back to
2714 * the conventional recovery. 2725 * the conventional recovery.
2715 */ 2726 */
2716 if (tcp_send_head(sk) && 2727 if (!tcp_write_queue_empty(sk) &&
2717 after(tcp_wnd_end(tp), tp->snd_nxt)) { 2728 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2718 *rexmit = REXMIT_NEW; 2729 *rexmit = REXMIT_NEW;
2719 return; 2730 return;
@@ -3077,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3077 struct tcp_sock *tp = tcp_sk(sk); 3088 struct tcp_sock *tp = tcp_sk(sk);
3078 u32 prior_sacked = tp->sacked_out; 3089 u32 prior_sacked = tp->sacked_out;
3079 u32 reord = tp->packets_out; 3090 u32 reord = tp->packets_out;
3091 struct sk_buff *skb, *next;
3080 bool fully_acked = true; 3092 bool fully_acked = true;
3081 long sack_rtt_us = -1L; 3093 long sack_rtt_us = -1L;
3082 long seq_rtt_us = -1L; 3094 long seq_rtt_us = -1L;
3083 long ca_rtt_us = -1L; 3095 long ca_rtt_us = -1L;
3084 struct sk_buff *skb;
3085 u32 pkts_acked = 0; 3096 u32 pkts_acked = 0;
3086 u32 last_in_flight = 0; 3097 u32 last_in_flight = 0;
3087 bool rtt_update; 3098 bool rtt_update;
@@ -3089,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3089 3100
3090 first_ackt = 0; 3101 first_ackt = 0;
3091 3102
3092 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3103 for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
3093 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3104 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3094 u8 sacked = scb->sacked; 3105 u8 sacked = scb->sacked;
3095 u32 acked_pcount; 3106 u32 acked_pcount;
@@ -3107,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3107 break; 3118 break;
3108 fully_acked = false; 3119 fully_acked = false;
3109 } else { 3120 } else {
3110 /* Speedup tcp_unlink_write_queue() and next loop */
3111 prefetchw(skb->next);
3112 acked_pcount = tcp_skb_pcount(skb); 3121 acked_pcount = tcp_skb_pcount(skb);
3113 } 3122 }
3114 3123
@@ -3160,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3160 if (!fully_acked) 3169 if (!fully_acked)
3161 break; 3170 break;
3162 3171
3163 tcp_unlink_write_queue(skb, sk); 3172 next = skb_rb_next(skb);
3164 sk_wmem_free_skb(sk, skb);
3165 if (unlikely(skb == tp->retransmit_skb_hint)) 3173 if (unlikely(skb == tp->retransmit_skb_hint))
3166 tp->retransmit_skb_hint = NULL; 3174 tp->retransmit_skb_hint = NULL;
3167 if (unlikely(skb == tp->lost_skb_hint)) 3175 if (unlikely(skb == tp->lost_skb_hint))
3168 tp->lost_skb_hint = NULL; 3176 tp->lost_skb_hint = NULL;
3177 tcp_rtx_queue_unlink_and_free(skb, sk);
3169 } 3178 }
3170 3179
3171 if (!skb) 3180 if (!skb)
@@ -3257,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3257 3266
3258static void tcp_ack_probe(struct sock *sk) 3267static void tcp_ack_probe(struct sock *sk)
3259{ 3268{
3260 const struct tcp_sock *tp = tcp_sk(sk);
3261 struct inet_connection_sock *icsk = inet_csk(sk); 3269 struct inet_connection_sock *icsk = inet_csk(sk);
3270 struct sk_buff *head = tcp_send_head(sk);
3271 const struct tcp_sock *tp = tcp_sk(sk);
3262 3272
3263 /* Was it a usable window open? */ 3273 /* Was it a usable window open? */
3264 3274 if (!head)
3265 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { 3275 return;
3276 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
3266 icsk->icsk_backoff = 0; 3277 icsk->icsk_backoff = 0;
3267 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); 3278 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3268 /* Socket must be waked up by subsequent tcp_data_snd_check(). 3279 /* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3382,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3382 tp->pred_flags = 0; 3393 tp->pred_flags = 0;
3383 tcp_fast_path_check(sk); 3394 tcp_fast_path_check(sk);
3384 3395
3385 if (tcp_send_head(sk)) 3396 if (!tcp_write_queue_empty(sk))
3386 tcp_slow_start_after_idle_check(sk); 3397 tcp_slow_start_after_idle_check(sk);
3387 3398
3388 if (nwin > tp->max_window) { 3399 if (nwin > tp->max_window) {
@@ -3567,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3567 sack_state.first_sackt = 0; 3578 sack_state.first_sackt = 0;
3568 sack_state.rate = &rs; 3579 sack_state.rate = &rs;
3569 3580
3570 /* We very likely will need to access write queue head. */ 3581 /* We very likely will need to access rtx queue. */
3571 prefetchw(sk->sk_write_queue.next); 3582 prefetch(sk->tcp_rtx_queue.rb_node);
3572 3583
3573 /* If the ack is older than previous acks 3584 /* If the ack is older than previous acks
3574 * then we can probably ignore it. 3585 * then we can probably ignore it.
@@ -3682,8 +3693,7 @@ no_queue:
3682 * being used to time the probes, and is probably far higher than 3693 * being used to time the probes, and is probably far higher than
3683 * it needs to be for normal retransmission. 3694 * it needs to be for normal retransmission.
3684 */ 3695 */
3685 if (tcp_send_head(sk)) 3696 tcp_ack_probe(sk);
3686 tcp_ack_probe(sk);
3687 3697
3688 if (tp->tlp_high_seq) 3698 if (tp->tlp_high_seq)
3689 tcp_process_tlp_ack(sk, ack, flag); 3699 tcp_process_tlp_ack(sk, ack, flag);
@@ -4726,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4726} 4736}
4727 4737
4728/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ 4738/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
4729static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) 4739void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4730{ 4740{
4731 struct rb_node **p = &root->rb_node; 4741 struct rb_node **p = &root->rb_node;
4732 struct rb_node *parent = NULL; 4742 struct rb_node *parent = NULL;
@@ -5530,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5530 struct tcp_fastopen_cookie *cookie) 5540 struct tcp_fastopen_cookie *cookie)
5531{ 5541{
5532 struct tcp_sock *tp = tcp_sk(sk); 5542 struct tcp_sock *tp = tcp_sk(sk);
5533 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; 5543 struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
5534 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; 5544 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5535 bool syn_drop = false; 5545 bool syn_drop = false;
5536 5546
@@ -5565,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5565 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); 5575 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5566 5576
5567 if (data) { /* Retransmit unacked data in SYN */ 5577 if (data) { /* Retransmit unacked data in SYN */
5568 tcp_for_write_queue_from(data, sk) { 5578 skb_rbtree_walk_from(data) {
5569 if (data == tcp_send_head(sk) || 5579 if (__tcp_retransmit_skb(sk, data, 1))
5570 __tcp_retransmit_skb(sk, data, 1))
5571 break; 5580 break;
5572 } 5581 }
5573 tcp_rearm_rto(sk); 5582 tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c7460fd90884..5418ecf03b78 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
480 TCP_TIMEOUT_INIT; 480 TCP_TIMEOUT_INIT;
481 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 481 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
482 482
483 skb = tcp_write_queue_head(sk); 483 skb = tcp_rtx_queue_head(sk);
484 BUG_ON(!skb); 484 BUG_ON(!skb);
485 485
486 tcp_mstamp_refresh(tp); 486 tcp_mstamp_refresh(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8162e2880178..696b0a168f16 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
66 int push_one, gfp_t gfp); 66 int push_one, gfp_t gfp);
67 67
68/* Account for new data that has been sent to the network. */ 68/* Account for new data that has been sent to the network. */
69static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 69static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
70{ 70{
71 struct inet_connection_sock *icsk = inet_csk(sk); 71 struct inet_connection_sock *icsk = inet_csk(sk);
72 struct tcp_sock *tp = tcp_sk(sk); 72 struct tcp_sock *tp = tcp_sk(sk);
73 unsigned int prior_packets = tp->packets_out; 73 unsigned int prior_packets = tp->packets_out;
74 74
75 tcp_advance_send_head(sk, skb);
76 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 75 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
77 76
77 __skb_unlink(skb, &sk->sk_write_queue);
78 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
79
78 tp->packets_out += tcp_skb_pcount(skb); 80 tp->packets_out += tcp_skb_pcount(skb);
79 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 81 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
80 tcp_rearm_rto(sk); 82 tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1249 TCP_SKB_CB(skb)->eor = 0; 1251 TCP_SKB_CB(skb)->eor = 0;
1250} 1252}
1251 1253
1254/* Insert buff after skb on the write or rtx queue of sk. */
1255static void tcp_insert_write_queue_after(struct sk_buff *skb,
1256 struct sk_buff *buff,
1257 struct sock *sk,
1258 enum tcp_queue tcp_queue)
1259{
1260 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1261 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1262 else
1263 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1264}
1265
1252/* Function to create two new TCP segments. Shrinks the given segment 1266/* Function to create two new TCP segments. Shrinks the given segment
1253 * to the specified size and appends a new segment with the rest of the 1267 * to the specified size and appends a new segment with the rest of the
1254 * packet to the list. This won't be called frequently, I hope. 1268 * packet to the list. This won't be called frequently, I hope.
1255 * Remember, these are still headerless SKBs at this point. 1269 * Remember, these are still headerless SKBs at this point.
1256 */ 1270 */
1257int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, 1271int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1272 struct sk_buff *skb, u32 len,
1258 unsigned int mss_now, gfp_t gfp) 1273 unsigned int mss_now, gfp_t gfp)
1259{ 1274{
1260 struct tcp_sock *tp = tcp_sk(sk); 1275 struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1337 1352
1338 /* Link BUFF into the send queue. */ 1353 /* Link BUFF into the send queue. */
1339 __skb_header_release(buff); 1354 __skb_header_release(buff);
1340 tcp_insert_write_queue_after(skb, buff, sk); 1355 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1341 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); 1356 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1342 1357
1343 return 0; 1358 return 0;
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1625 * is caused by insufficient sender buffer: 1640 * is caused by insufficient sender buffer:
1626 * 1) just sent some data (see tcp_write_xmit) 1641 * 1) just sent some data (see tcp_write_xmit)
1627 * 2) not cwnd limited (this else condition) 1642 * 2) not cwnd limited (this else condition)
1628 * 3) no more data to send (null tcp_send_head ) 1643 * 3) no more data to send (tcp_write_queue_empty())
1629 * 4) application is hitting buffer limit (SOCK_NOSPACE) 1644 * 4) application is hitting buffer limit (SOCK_NOSPACE)
1630 */ 1645 */
1631 if (!tcp_send_head(sk) && sk->sk_socket && 1646 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1632 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && 1647 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1633 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1648 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1634 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); 1649 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1824 * know that all the data is in scatter-gather pages, and that the 1839 * know that all the data is in scatter-gather pages, and that the
1825 * packet has never been sent out before (and thus is not cloned). 1840 * packet has never been sent out before (and thus is not cloned).
1826 */ 1841 */
1827static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, 1842static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1843 struct sk_buff *skb, unsigned int len,
1828 unsigned int mss_now, gfp_t gfp) 1844 unsigned int mss_now, gfp_t gfp)
1829{ 1845{
1830 struct sk_buff *buff; 1846 struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1833 1849
1834 /* All of a TSO frame must be composed of paged data. */ 1850 /* All of a TSO frame must be composed of paged data. */
1835 if (skb->len != skb->data_len) 1851 if (skb->len != skb->data_len)
1836 return tcp_fragment(sk, skb, len, mss_now, gfp); 1852 return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
1837 1853
1838 buff = sk_stream_alloc_skb(sk, 0, gfp, true); 1854 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
1839 if (unlikely(!buff)) 1855 if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1869 1885
1870 /* Link BUFF into the send queue. */ 1886 /* Link BUFF into the send queue. */
1871 __skb_header_release(buff); 1887 __skb_header_release(buff);
1872 tcp_insert_write_queue_after(skb, buff, sk); 1888 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1873 1889
1874 return 0; 1890 return 0;
1875} 1891}
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1939 goto send_now; 1955 goto send_now;
1940 } 1956 }
1941 1957
1942 head = tcp_write_queue_head(sk); 1958 /* TODO : use tsorted_sent_queue ? */
1943 1959 head = tcp_rtx_queue_head(sk);
1960 if (!head)
1961 goto send_now;
1944 age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); 1962 age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
1945 /* If next ACK is likely to come too late (half srtt), do not defer */ 1963 /* If next ACK is likely to come too late (half srtt), do not defer */
1946 if (age < (tp->srtt_us >> 4)) 1964 if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2158 limit <<= factor; 2176 limit <<= factor;
2159 2177
2160 if (refcount_read(&sk->sk_wmem_alloc) > limit) { 2178 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2161 /* Always send the 1st or 2nd skb in write queue. 2179 /* Always send skb if rtx queue is empty.
2162 * No need to wait for TX completion to call us back, 2180 * No need to wait for TX completion to call us back,
2163 * after softirq/tasklet schedule. 2181 * after softirq/tasklet schedule.
2164 * This helps when TX completions are delayed too much. 2182 * This helps when TX completions are delayed too much.
2165 */ 2183 */
2166 if (skb == sk->sk_write_queue.next || 2184 if (tcp_rtx_queue_empty(sk))
2167 skb->prev == sk->sk_write_queue.next)
2168 return false; 2185 return false;
2169 2186
2170 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); 2187 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2215 * it's the "most interesting" or current chrono we are 2232 * it's the "most interesting" or current chrono we are
2216 * tracking and starts busy chrono if we have pending data. 2233 * tracking and starts busy chrono if we have pending data.
2217 */ 2234 */
2218 if (tcp_write_queue_empty(sk)) 2235 if (tcp_rtx_and_write_queues_empty(sk))
2219 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); 2236 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2220 else if (type == tp->chrono_type) 2237 else if (type == tp->chrono_type)
2221 tcp_chrono_set(tp, TCP_CHRONO_BUSY); 2238 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2310 nonagle); 2327 nonagle);
2311 2328
2312 if (skb->len > limit && 2329 if (skb->len > limit &&
2313 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2330 unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2331 skb, limit, mss_now, gfp)))
2314 break; 2332 break;
2315 2333
2316 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) 2334 if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ repair:
2350 tcp_cwnd_validate(sk, is_cwnd_limited); 2368 tcp_cwnd_validate(sk, is_cwnd_limited);
2351 return false; 2369 return false;
2352 } 2370 }
2353 return !tp->packets_out && tcp_send_head(sk); 2371 return !tp->packets_out && !tcp_write_queue_empty(sk);
2354} 2372}
2355 2373
2356bool tcp_schedule_loss_probe(struct sock *sk) 2374bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2374 return false; 2392 return false;
2375 2393
2376 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && 2394 if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
2377 tcp_send_head(sk)) 2395 !tcp_write_queue_empty(sk))
2378 return false; 2396 return false;
2379 2397
2380 /* Probe timeout is 2*rtt. Add minimum RTO to account 2398 /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
2427 int mss = tcp_current_mss(sk); 2445 int mss = tcp_current_mss(sk);
2428 2446
2429 skb = tcp_send_head(sk); 2447 skb = tcp_send_head(sk);
2430 if (skb) { 2448 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2431 if (tcp_snd_wnd_test(tp, skb, mss)) { 2449 pcount = tp->packets_out;
2432 pcount = tp->packets_out; 2450 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2433 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); 2451 if (tp->packets_out > pcount)
2434 if (tp->packets_out > pcount) 2452 goto probe_sent;
2435 goto probe_sent; 2453 goto rearm_timer;
2436 goto rearm_timer;
2437 }
2438 skb = tcp_write_queue_prev(sk, skb);
2439 } else {
2440 skb = tcp_write_queue_tail(sk);
2441 } 2454 }
2455 skb = skb_rb_last(&sk->tcp_rtx_queue);
2442 2456
2443 /* At most one outstanding TLP retransmission. */ 2457 /* At most one outstanding TLP retransmission. */
2444 if (tp->tlp_high_seq) 2458 if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
2456 goto rearm_timer; 2470 goto rearm_timer;
2457 2471
2458 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { 2472 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2459 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, 2473 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2474 (pcount - 1) * mss, mss,
2460 GFP_ATOMIC))) 2475 GFP_ATOMIC)))
2461 goto rearm_timer; 2476 goto rearm_timer;
2462 skb = tcp_write_queue_next(sk, skb); 2477 skb = skb_rb_next(skb);
2463 } 2478 }
2464 2479
2465 if (WARN_ON(!skb || !tcp_skb_pcount(skb))) 2480 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2659static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) 2674static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2660{ 2675{
2661 struct tcp_sock *tp = tcp_sk(sk); 2676 struct tcp_sock *tp = tcp_sk(sk);
2662 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 2677 struct sk_buff *next_skb = skb_rb_next(skb);
2663 int skb_size, next_skb_size; 2678 int skb_size, next_skb_size;
2664 2679
2665 skb_size = skb->len; 2680 skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2676 } 2691 }
2677 tcp_highest_sack_combine(sk, next_skb, skb); 2692 tcp_highest_sack_combine(sk, next_skb, skb);
2678 2693
2679 tcp_unlink_write_queue(next_skb, sk);
2680
2681 if (next_skb->ip_summed == CHECKSUM_PARTIAL) 2694 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2682 skb->ip_summed = CHECKSUM_PARTIAL; 2695 skb->ip_summed = CHECKSUM_PARTIAL;
2683 2696
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2705 2718
2706 tcp_skb_collapse_tstamp(skb, next_skb); 2719 tcp_skb_collapse_tstamp(skb, next_skb);
2707 2720
2708 sk_wmem_free_skb(sk, next_skb); 2721 tcp_rtx_queue_unlink_and_free(next_skb, sk);
2709 return true; 2722 return true;
2710} 2723}
2711 2724
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2716 return false; 2729 return false;
2717 if (skb_cloned(skb)) 2730 if (skb_cloned(skb))
2718 return false; 2731 return false;
2719 if (skb == tcp_send_head(sk))
2720 return false;
2721 /* Some heuristics for collapsing over SACK'd could be invented */ 2732 /* Some heuristics for collapsing over SACK'd could be invented */
2722 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2733 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2723 return false; 2734 return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2740 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2751 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2741 return; 2752 return;
2742 2753
2743 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2754 skb_rbtree_walk_from_safe(skb, tmp) {
2744 if (!tcp_can_collapse(sk, skb)) 2755 if (!tcp_can_collapse(sk, skb))
2745 break; 2756 break;
2746 2757
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2815 2826
2816 len = cur_mss * segs; 2827 len = cur_mss * segs;
2817 if (skb->len > len) { 2828 if (skb->len > len) {
2818 if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) 2829 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
2830 cur_mss, GFP_ATOMIC))
2819 return -ENOMEM; /* We'll try again later. */ 2831 return -ENOMEM; /* We'll try again later. */
2820 } else { 2832 } else {
2821 if (skb_unclone(skb, GFP_ATOMIC)) 2833 if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2906void tcp_xmit_retransmit_queue(struct sock *sk) 2918void tcp_xmit_retransmit_queue(struct sock *sk)
2907{ 2919{
2908 const struct inet_connection_sock *icsk = inet_csk(sk); 2920 const struct inet_connection_sock *icsk = inet_csk(sk);
2921 struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
2909 struct tcp_sock *tp = tcp_sk(sk); 2922 struct tcp_sock *tp = tcp_sk(sk);
2910 struct sk_buff *skb;
2911 struct sk_buff *hole = NULL;
2912 u32 max_segs; 2923 u32 max_segs;
2913 int mib_idx; 2924 int mib_idx;
2914 2925
2915 if (!tp->packets_out) 2926 if (!tp->packets_out)
2916 return; 2927 return;
2917 2928
2918 if (tp->retransmit_skb_hint) { 2929 skb = tp->retransmit_skb_hint;
2919 skb = tp->retransmit_skb_hint; 2930 if (!skb) {
2920 } else { 2931 rtx_head = tcp_rtx_queue_head(sk);
2921 skb = tcp_write_queue_head(sk); 2932 skb = rtx_head;
2922 } 2933 }
2923
2924 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); 2934 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
2925 tcp_for_write_queue_from(skb, sk) { 2935 skb_rbtree_walk_from(skb) {
2926 __u8 sacked; 2936 __u8 sacked;
2927 int segs; 2937 int segs;
2928 2938
2929 if (skb == tcp_send_head(sk))
2930 break;
2931
2932 if (tcp_pacing_check(sk)) 2939 if (tcp_pacing_check(sk))
2933 break; 2940 break;
2934 2941
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2973 if (tcp_in_cwnd_reduction(sk)) 2980 if (tcp_in_cwnd_reduction(sk))
2974 tp->prr_out += tcp_skb_pcount(skb); 2981 tp->prr_out += tcp_skb_pcount(skb);
2975 2982
2976 if (skb == tcp_write_queue_head(sk) && 2983 if (skb == rtx_head &&
2977 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) 2984 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
2978 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2985 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2979 inet_csk(sk)->icsk_rto, 2986 inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
3015 * Note: in the latter case, FIN packet will be sent after a timeout, 3022 * Note: in the latter case, FIN packet will be sent after a timeout,
3016 * as TCP stack thinks it has already been transmitted. 3023 * as TCP stack thinks it has already been transmitted.
3017 */ 3024 */
3018 if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { 3025 if (!tskb && tcp_under_memory_pressure(sk))
3026 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3027
3028 if (tskb) {
3019coalesce: 3029coalesce:
3020 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; 3030 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3021 TCP_SKB_CB(tskb)->end_seq++; 3031 TCP_SKB_CB(tskb)->end_seq++;
3022 tp->write_seq++; 3032 tp->write_seq++;
3023 if (!tcp_send_head(sk)) { 3033 if (tcp_write_queue_empty(sk)) {
3024 /* This means tskb was already sent. 3034 /* This means tskb was already sent.
3025 * Pretend we included the FIN on previous transmit. 3035 * Pretend we included the FIN on previous transmit.
3026 * We need to set tp->snd_nxt to the value it would have 3036 * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
3086{ 3096{
3087 struct sk_buff *skb; 3097 struct sk_buff *skb;
3088 3098
3089 skb = tcp_write_queue_head(sk); 3099 skb = tcp_rtx_queue_head(sk);
3090 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 3100 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3091 pr_debug("%s: wrong queue state\n", __func__); 3101 pr_err("%s: wrong queue state\n", __func__);
3092 return -EFAULT; 3102 return -EFAULT;
3093 } 3103 }
3094 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 3104 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
3101 if (!nskb) 3111 if (!nskb)
3102 return -ENOMEM; 3112 return -ENOMEM;
3103 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); 3113 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3104 tcp_unlink_write_queue(skb, sk); 3114 tcp_rtx_queue_unlink_and_free(skb, sk);
3105 __skb_header_release(nskb); 3115 __skb_header_release(nskb);
3106 __tcp_add_write_queue_head(sk, nskb); 3116 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3107 sk_wmem_free_skb(sk, skb);
3108 sk->sk_wmem_queued += nskb->truesize; 3117 sk->sk_wmem_queued += nskb->truesize;
3109 sk_mem_charge(sk, nskb->truesize); 3118 sk_mem_charge(sk, nskb->truesize);
3110 skb = nskb; 3119 skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3327 3336
3328 tcb->end_seq += skb->len; 3337 tcb->end_seq += skb->len;
3329 __skb_header_release(skb); 3338 __skb_header_release(skb);
3330 __tcp_add_write_queue_tail(sk, skb);
3331 sk->sk_wmem_queued += skb->truesize; 3339 sk->sk_wmem_queued += skb->truesize;
3332 sk_mem_charge(sk, skb->truesize); 3340 sk_mem_charge(sk, skb->truesize);
3333 tp->write_seq = tcb->end_seq; 3341 tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3405 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; 3413 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3406 if (!err) { 3414 if (!err) {
3407 tp->syn_data = (fo->copied > 0); 3415 tp->syn_data = (fo->copied > 0);
3416 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3408 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); 3417 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3409 goto done; 3418 goto done;
3410 } 3419 }
3411 3420
3412 /* data was not sent, this is our new send_head */ 3421 /* data was not sent, put it in write_queue */
3413 sk->sk_send_head = syn_data; 3422 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3414 tp->packets_out -= tcp_skb_pcount(syn_data); 3423 tp->packets_out -= tcp_skb_pcount(syn_data);
3415 3424
3416fallback: 3425fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
3453 tp->retrans_stamp = tcp_time_stamp(tp); 3462 tp->retrans_stamp = tcp_time_stamp(tp);
3454 tcp_connect_queue_skb(sk, buff); 3463 tcp_connect_queue_skb(sk, buff);
3455 tcp_ecn_send_syn(sk, buff); 3464 tcp_ecn_send_syn(sk, buff);
3465 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3456 3466
3457 /* Send off SYN; include data in Fast Open. */ 3467 /* Send off SYN; include data in Fast Open. */
3458 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 3468 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
3647 skb->len > mss) { 3657 skb->len > mss) {
3648 seg_size = min(seg_size, mss); 3658 seg_size = min(seg_size, mss);
3649 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3659 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3650 if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) 3660 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
3661 skb, seg_size, mss, GFP_ATOMIC))
3651 return -1; 3662 return -1;
3652 } else if (!tcp_skb_pcount(skb)) 3663 } else if (!tcp_skb_pcount(skb))
3653 tcp_set_skb_tso_segs(skb, mss); 3664 tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
3677 3688
3678 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); 3689 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
3679 3690
3680 if (tp->packets_out || !tcp_send_head(sk)) { 3691 if (tp->packets_out || tcp_write_queue_empty(sk)) {
3681 /* Cancel probe timer, if it is not required. */ 3692 /* Cancel probe timer, if it is not required. */
3682 icsk->icsk_probes_out = 0; 3693 icsk->icsk_probes_out = 0;
3683 icsk->icsk_backoff = 0; 3694 icsk->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 655dd8d7f064..7014cc00c74c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
156 return false; 156 return false;
157 157
158 start_ts = tcp_sk(sk)->retrans_stamp; 158 start_ts = tcp_sk(sk)->retrans_stamp;
159 if (unlikely(!start_ts)) 159 if (unlikely(!start_ts)) {
160 start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); 160 struct sk_buff *head = tcp_rtx_queue_head(sk);
161
162 if (!head)
163 return false;
164 start_ts = tcp_skb_timestamp(head);
165 }
161 166
162 if (likely(timeout == 0)) { 167 if (likely(timeout == 0)) {
163 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 168 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
304static void tcp_probe_timer(struct sock *sk) 309static void tcp_probe_timer(struct sock *sk)
305{ 310{
306 struct inet_connection_sock *icsk = inet_csk(sk); 311 struct inet_connection_sock *icsk = inet_csk(sk);
312 struct sk_buff *skb = tcp_send_head(sk);
307 struct tcp_sock *tp = tcp_sk(sk); 313 struct tcp_sock *tp = tcp_sk(sk);
308 int max_probes; 314 int max_probes;
309 u32 start_ts; 315 u32 start_ts;
310 316
311 if (tp->packets_out || !tcp_send_head(sk)) { 317 if (tp->packets_out || !skb) {
312 icsk->icsk_probes_out = 0; 318 icsk->icsk_probes_out = 0;
313 return; 319 return;
314 } 320 }
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
321 * corresponding system limit. We also implement similar policy when 327 * corresponding system limit. We also implement similar policy when
322 * we use RTO to probe window in tcp_retransmit_timer(). 328 * we use RTO to probe window in tcp_retransmit_timer().
323 */ 329 */
324 start_ts = tcp_skb_timestamp(tcp_send_head(sk)); 330 start_ts = tcp_skb_timestamp(skb);
325 if (!start_ts) 331 if (!start_ts)
326 tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; 332 skb->skb_mstamp = tp->tcp_mstamp;
327 else if (icsk->icsk_user_timeout && 333 else if (icsk->icsk_user_timeout &&
328 (s32)(tcp_time_stamp(tp) - start_ts) > 334 (s32)(tcp_time_stamp(tp) - start_ts) >
329 jiffies_to_msecs(icsk->icsk_user_timeout)) 335 jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
408 if (!tp->packets_out) 414 if (!tp->packets_out)
409 goto out; 415 goto out;
410 416
411 WARN_ON(tcp_write_queue_empty(sk)); 417 WARN_ON(tcp_rtx_queue_empty(sk));
412 418
413 tp->tlp_high_seq = 0; 419 tp->tlp_high_seq = 0;
414 420
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
441 goto out; 447 goto out;
442 } 448 }
443 tcp_enter_loss(sk); 449 tcp_enter_loss(sk);
444 tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); 450 tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
445 __sk_dst_reset(sk); 451 __sk_dst_reset(sk);
446 goto out_reset_timer; 452 goto out_reset_timer;
447 } 453 }
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
473 479
474 tcp_enter_loss(sk); 480 tcp_enter_loss(sk);
475 481
476 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { 482 if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
477 /* Retransmission failed because of local congestion, 483 /* Retransmission failed because of local congestion,
478 * do not backoff. 484 * do not backoff.
479 */ 485 */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
647 elapsed = keepalive_time_when(tp); 653 elapsed = keepalive_time_when(tp);
648 654
649 /* It is alive without keepalive 8) */ 655 /* It is alive without keepalive 8) */
650 if (tp->packets_out || tcp_send_head(sk)) 656 if (tp->packets_out || !tcp_write_queue_empty(sk))
651 goto resched; 657 goto resched;
652 658
653 elapsed = keepalive_time_elapsed(tp); 659 elapsed = keepalive_time_elapsed(tp);