author     David S. Miller <davem@davemloft.net>  2017-10-06 00:24:48 -0400
committer  David S. Miller <davem@davemloft.net>  2017-10-06 00:24:48 -0400
commit     cec451ce60e50dba6d4136b7d1e62a5900cd264f
tree       cb70c1552a2c58cc5b8f63bd8e85b7fdc3b33497
parent     b1fb67fa501c4787035317f84db6caf013385581
parent     bef06223083b81d2064824afe2bc85be416ab73a
Merge branch 'tcp-improving-RACK-cpu-performance'
Yuchung Cheng says:

====================
tcp: improving RACK cpu performance

This patch set improves the CPU consumption of the RACK TCP loss
recovery algorithm, in particular for high-speed networks. Currently,
for every ACK in recovery RACK can potentially iterate over all sent
packets in the write queue. On large BDP networks with non-trivial
losses the RACK write queue walk CPU usage becomes unreasonably high.

This patch introduces a new queue in TCP that keeps only skbs sent and
not yet (s)acked or marked lost, in time order instead of sequence
order. With that, RACK can examine this time-sorted list and only check
packets that were sent recently, within the reordering window, per ACK.
This is the fastest way without any write queue walks. The number of
skbs examined per ACK is reduced by orders of magnitude.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
 include/linux/skbuff.h   | 11
 include/linux/tcp.h      |  1
 include/net/tcp.h        | 24
 net/ipv4/tcp.c           |  2
 net/ipv4/tcp_input.c     |  9
 net/ipv4/tcp_minisocks.c |  1
 net/ipv4/tcp_output.c    | 42
 net/ipv4/tcp_recovery.c  | 52
 8 files changed, 93 insertions(+), 49 deletions(-)
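The core idea above is easier to see in a small, self-contained sketch (plain C; struct pkt, struct rack_state and rack_detect_loss below are illustrative names invented for this example, not the kernel's types or API): keeping un-(s)acked packets in send-time order lets loss detection stop at the first packet sent after the most recently (s)acked one, so per-ACK work is bounded by the reordering window rather than by the length of the write queue.

#include <stdio.h>
#include <stdint.h>

struct pkt {
	uint64_t sent_us;	/* transmit timestamp; list is kept oldest-first */
	int lost;
	struct pkt *next;
};

/* Simplified stand-in for the RACK state the stack tracks. */
struct rack_state {
	uint64_t mstamp;	/* send time of most recently (s)acked packet */
	uint64_t rtt_us;	/* RTT measured on that packet */
};

/* Mark a packet lost once its elapsed time exceeds rtt_us plus the
 * reordering window.  Because the list is sorted by send time, the walk
 * can stop at the first packet sent after the (s)acked one: everything
 * behind it was sent even more recently and cannot have expired either.
 */
static int rack_detect_loss(struct pkt *tsorted, const struct rack_state *rack,
			    uint64_t now_us, uint64_t reo_wnd_us)
{
	int marked = 0;

	for (struct pkt *p = tsorted; p; p = p->next) {
		if (p->sent_us > rack->mstamp)
			break;			/* rest were sent later: stop early */
		if (now_us - p->sent_us > rack->rtt_us + reo_wnd_us) {
			p->lost = 1;		/* deadline passed: mark lost */
			marked++;
		}
	}
	return marked;
}

int main(void)
{
	/* Un-(s)acked packets sent at 0us, 100us and 250us; a later packet
	 * sent at 200us has just been (s)acked, so rack.mstamp = 200.
	 */
	struct pkt p3 = { .sent_us = 250 };
	struct pkt p2 = { .sent_us = 100, .next = &p3 };
	struct pkt p1 = { .sent_us = 0,   .next = &p2 };
	struct rack_state rack = { .mstamp = 200, .rtt_us = 300 };

	/* At t = 450us with a 100us reordering window, only p1 (elapsed
	 * 450us > 400us) is marked lost; p2 is still within the window and
	 * p3 is never examined because the walk stops early.
	 */
	printf("marked lost: %d\n", rack_detect_loss(&p1, &rack, 450, 100));
	return 0;
}

The kernel patch below implements the same idea by anchoring each sent skb on tp->tsorted_sent_queue via skb->tcp_tsorted_anchor and walking that list in tcp_rack_detect_loss().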
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ada821466e88..01a985937867 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
  * @nf_trace: netfilter packet trace flag
  * @protocol: Packet protocol from driver
  * @destructor: Destruct function
+ * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  * @_nfct: Associated connection, if any (with nfctinfo bits)
  * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  * @skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_refdst;
-	void			(*destructor)(struct sk_buff *skb);
+	union {
+		struct {
+			unsigned long	_skb_refdst;
+			void		(*destructor)(struct sk_buff *skb);
+		};
+		struct list_head	tcp_tsorted_anchor;
+	};
+
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..1d2c44e09e31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 
 	u32	snd_wl1;	/* Sequence for window update */
 	u32	snd_wnd;	/* The window we expect to receive */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 426c2e986016..3b16f353b539 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1589,14 +1589,34 @@ enum tcp_chrono {
 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
 
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+	skb->destructor = NULL;
+	skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) {		\
+	unsigned long _save = skb->_skb_refdst;	\
+	skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb)		\
+	skb->_skb_refdst = _save;		\
+}
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
+	}
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 }
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c115e37ca608..8cf742fd4f99 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
+	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 		 * available to the caller, no more, no less.
 		 */
 		skb->reserved_tailroom = skb->end - skb->tail - size;
+		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 		return skb;
 	}
 	__kfree_skb(skb);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5b8d61846c2..fb0d7ed84b94 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 				tcp_skb_pcount(skb),
 				skb->skb_mstamp);
 		tcp_rate_skb_delivered(sk, skb, state->rate);
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+			list_del_init(&skb->tcp_tsorted_anchor);
 
 		if (!before(TCP_SKB_CB(skb)->seq,
 			    tcp_highest_sack_seq(tp)))
@@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 
 	shinfo = skb_shinfo(skb);
 	if (!before(shinfo->tskey, prior_snd_una) &&
-	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
-		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+		tcp_skb_tsorted_save(skb) {
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		} tcp_skb_tsorted_restore(skb);
+	}
 }
 
 /* Remove acknowledged frames from the retransmission queue. If our packet
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f31356d..2341b9f857b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
 	INIT_LIST_HEAD(&newtp->tsq_node);
+	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
 
 	tcp_init_wl(newtp, treq->rcv_isn);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..8162e2880178 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 			      HRTIMER_MODE_ABS_PINNED);
 }
 
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	skb->skb_mstamp = tp->tcp_mstamp;
+	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg(). This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
 		oskb = skb;
-		if (unlikely(skb_cloned(skb)))
-			skb = pskb_copy(skb, gfp_mask);
-		else
-			skb = skb_clone(skb, gfp_mask);
+
+		tcp_skb_tsorted_save(oskb) {
+			if (unlikely(skb_cloned(oskb)))
+				skb = pskb_copy(oskb, gfp_mask);
+			else
+				skb = skb_clone(oskb, gfp_mask);
+		} tcp_skb_tsorted_restore(oskb);
+
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		oskb->skb_mstamp = tp->tcp_mstamp;
+		tcp_update_skb_after_send(tp, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
+	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
 }
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
 
-		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			     -ENOBUFS;
+		tcp_skb_tsorted_save(skb) {
+			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+			err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+				     -ENOBUFS;
+		} tcp_skb_tsorted_restore(skb);
+
 		if (!err)
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
@@ -3023,6 +3037,7 @@ coalesce:
 			goto coalesce;
 		return;
 	}
+	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 	skb_reserve(skb, MAX_TCP_HEADER);
 	sk_forced_mem_schedule(sk, skb->truesize);
 	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
-			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			struct sk_buff *nskb;
+
+			tcp_skb_tsorted_save(skb) {
+				nskb = skb_copy(skb, GFP_ATOMIC);
+			} tcp_skb_tsorted_restore(skb);
 			if (!nskb)
 				return -ENOMEM;
+			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
 			tcp_unlink_write_queue(skb, sk);
 			__skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 449cd914d58e..cda6074a429a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
@@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
-	tcp_for_write_queue(skb, sk) {
+	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		s32 remaining;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
-		/* Skip ones already (s)acked */
-		if (!after(scb->end_seq, tp->snd_una) ||
-		    scb->sacked & TCPCB_SACKED_ACKED)
+		/* Skip ones marked lost but not yet retransmitted */
+		if ((scb->sacked & TCPCB_LOST) &&
+		    !(scb->sacked & TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
-					tp->rack.end_seq, scb->end_seq)) {
-			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
-			 * A packet is lost if its elapsed time is beyond
-			 * the recent RTT plus the reordering window.
-			 */
-			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
-							 skb->skb_mstamp);
-			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
-			if (remaining < 0) {
-				tcp_rack_mark_skb_lost(sk, skb);
-				continue;
-			}
-
-			/* Skip ones marked lost but not yet retransmitted */
-			if ((scb->sacked & TCPCB_LOST) &&
-			    !(scb->sacked & TCPCB_SACKED_RETRANS))
-				continue;
+		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+					 tp->rack.end_seq, scb->end_seq))
+			break;
 
+		/* A packet is lost if it has not been s/acked beyond
+		 * the recent RTT plus the reordering window.
+		 */
+		remaining = tp->rack.rtt_us + reo_wnd -
+			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		if (remaining < 0) {
+			tcp_rack_mark_skb_lost(sk, skb);
+			list_del_init(&skb->tcp_tsorted_anchor);
+		} else {
 			/* Record maximum wait time (+1 to avoid 0) */
 			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
-		} else if (!(scb->sacked & TCPCB_RETRANS)) {
-			/* Original data are sent sequentially so stop early
-			 * b/c the rest are all sent after rack_sent
-			 */
-			break;
 		}
 	}
 }