author		Eric Dumazet <edumazet@google.com>	2017-10-04 15:59:58 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-06 00:24:47 -0400
commit		e2080072ed2d98a55ae69d95dea60ff7a17cddd5 (patch)
tree		6ad479e3db638db9c6469cab047b59685d04c5b1 /net/ipv4/tcp_output.c
parent		b1fb67fa501c4787035317f84db6caf013385581 (diff)
tcp: new list for sent but unacked skbs for RACK recovery
This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb).

This list will be used to optimize TCP RACK recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since the TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and it iterates through SACKed skbs repeatedly.

Special care for rare events:

1. TCP repair fakes skb transmission, so the send queue needs to be adjusted.

2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here.

3. Fast Open: currently for non-TFO, the send queue correctly queues the pure SYN packet. For TFO, which queues a pure SYN and then a data packet, the send queue only queues the data packet but not the pure SYN, due to the structure of the TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. the SYN is never fast-retransmitted by SACK/RACK).

In order to not grow sk_buff, we use a union for the new list and the _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before the skb is cloned/copied at transmit, and before being freed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
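To make the intended win concrete, here is a small userspace toy model of the kind of walk the time-ordered list enables. It is not kernel code: toy_skb, rack_detect_loss_toy and all timestamps are invented for this sketch. The idea it illustrates is that RACK-style loss detection can visit skbs oldest-sent-first, skip SACKed ones, and stop at the first skb sent within the reordering window, which a sequence-ordered write queue cannot offer once retransmissions make sequence order diverge from send-time order.

/*
 * Userspace toy model (not kernel code) of loss detection over a
 * send-time-ordered list.  All names and values below are invented
 * for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_skb {
	unsigned int seq;	/* sequence number (write-queue order)   */
	unsigned long sent_us;	/* send timestamp (tsorted-list order)   */
	bool sacked;		/* already SACKed, nothing to retransmit */
	struct toy_skb *next;	/* next in the time-ordered sent list    */
};

/* Walk the time-ordered list, oldest sent skb first. */
static void rack_detect_loss_toy(struct toy_skb *tsorted_head,
				 unsigned long now_us,
				 unsigned long reo_wnd_us)
{
	for (struct toy_skb *skb = tsorted_head; skb; skb = skb->next) {
		if (skb->sacked)
			continue;	/* already delivered, skip it */
		if (now_us - skb->sent_us < reo_wnd_us)
			break;		/* sent too recently; every later
					 * skb is even newer, so stop */
		printf("mark seq %u lost (sent %lu us ago)\n",
		       skb->seq, now_us - skb->sent_us);
	}
}

int main(void)
{
	/*
	 * Time-ordered sent list (oldest first).  seq 100 was just
	 * retransmitted, so sequence order and send-time order differ.
	 */
	struct toy_skb rtx100 = { .seq = 100, .sent_us = 900, .next = NULL };
	struct toy_skb skb300 = { .seq = 300, .sent_us = 500, .next = &rtx100 };
	struct toy_skb skb200 = { .seq = 200, .sent_us = 400, .sacked = true,
				  .next = &skb300 };

	rack_detect_loss_toy(&skb200, /* now */ 1000, /* reo_wnd */ 200);
	return 0;
}

The tcp_skb_tsorted_save()/tcp_skb_tsorted_restore() pairs visible in the diff below follow from the union noted above: because the list anchor shares space with _skb_refdst/destructor, those fields have to be cleared and then restored around the clone/copy at transmit time, matching the zeroing requirement described in the commit message.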
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	42
1 file changed, 31 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..8162e2880178 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 			      HRTIMER_MODE_ABS_PINNED);
 }
 
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	skb->skb_mstamp = tp->tcp_mstamp;
+	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
 		oskb = skb;
-		if (unlikely(skb_cloned(skb)))
-			skb = pskb_copy(skb, gfp_mask);
-		else
-			skb = skb_clone(skb, gfp_mask);
+
+		tcp_skb_tsorted_save(oskb) {
+			if (unlikely(skb_cloned(oskb)))
+				skb = pskb_copy(oskb, gfp_mask);
+			else
+				skb = skb_clone(oskb, gfp_mask);
+		} tcp_skb_tsorted_restore(oskb);
+
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		oskb->skb_mstamp = tp->tcp_mstamp;
+		tcp_update_skb_after_send(tp, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
+	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
 }
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
 
-		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			     -ENOBUFS;
+		tcp_skb_tsorted_save(skb) {
+			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+			err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+				     -ENOBUFS;
+		} tcp_skb_tsorted_restore(skb);
+
 		if (!err)
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
@@ -3023,6 +3037,7 @@ coalesce:
 			goto coalesce;
 		return;
 	}
+	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 	skb_reserve(skb, MAX_TCP_HEADER);
 	sk_forced_mem_schedule(sk, skb->truesize);
 	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
-			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			struct sk_buff *nskb;
+
+			tcp_skb_tsorted_save(skb) {
+				nskb = skb_copy(skb, GFP_ATOMIC);
+			} tcp_skb_tsorted_restore(skb);
 			if (!nskb)
 				return -ENOMEM;
+			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
 			tcp_unlink_write_queue(skb, sk);
 			__skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);