-rw-r--r--	include/linux/tcp.h	  2
-rw-r--r--	include/net/tcp.h	 35
-rw-r--r--	net/ipv4/Makefile	  2
-rw-r--r--	net/ipv4/tcp_input.c	 46
-rw-r--r--	net/ipv4/tcp_output.c	  4
-rw-r--r--	net/ipv4/tcp_rate.c	149
6 files changed, 222 insertions, 16 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 38590fbc0ac5..c50e6aec005a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -268,6 +268,8 @@ struct tcp_sock {
 	u32	prr_out;	/* Total number of pkts sent during Recovery. */
 	u32	delivered;	/* Total data packets delivered incl. rexmits */
 	u32	lost;		/* Total data packets lost incl. rexmits */
+	struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
+	struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
 
 	u32	rcv_wnd;	/* Current receiver window */
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
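
Both new fields are skb_mstamp timestamps. For reference, a minimal sketch of the skb_mstamp helpers these fields rely on, assuming the API as it existed in kernels of this vintage (include/linux/skbuff.h); the variable names are illustrative only:

	struct skb_mstamp t0, t1;
	u32 elapsed_us;

	skb_mstamp_get(&t0);		/* snapshot the current time */
	/* ... transmit a window of packets ... */
	skb_mstamp_get(&t1);
	/* microseconds elapsed between the two snapshots: */
	elapsed_us = skb_mstamp_us_delta(&t1, &t0);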
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2f1648af4d12..b261c892605a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,8 +763,14 @@ struct tcp_skb_cb {
 		__u32		ack_seq;	/* Sequence number ACK'd	*/
 		union {
 			struct {
-				/* There is space for up to 20 bytes */
+				/* There is space for up to 24 bytes */
 				__u32 in_flight;/* Bytes in flight when packet sent */
+				/* pkts S/ACKed so far upon tx of skb, incl retrans: */
+				__u32 delivered;
+				/* start of send pipeline phase */
+				struct skb_mstamp first_tx_mstamp;
+				/* when we reached the "delivered" count */
+				struct skb_mstamp delivered_mstamp;
 			} tx;	/* only used for outgoing skbs */
 			union {
 				struct inet_skb_parm	h4;
@@ -860,6 +866,26 @@ struct ack_sample {
 	u32 in_flight;
 };
 
+/* A rate sample measures the number of (original/retransmitted) data
+ * packets delivered "delivered" over an interval of time "interval_us".
+ * The tcp_rate.c code fills in the rate sample, and congestion
+ * control modules that define a cong_control function to run at the end
+ * of ACK processing can optionally choose to consult this sample when
+ * setting cwnd and pacing rate.
+ * A sample is invalid if "delivered" or "interval_us" is negative.
+ */
+struct rate_sample {
+	struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
+	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
+	s32  delivered;		/* number of packets delivered over interval */
+	long interval_us;	/* time for tp->delivered to incr "delivered" */
+	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
+	int  losses;		/* number of packets marked lost upon ACK */
+	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
+	u32  prior_in_flight;	/* in flight before this ACK */
+	bool is_retrans;	/* is sample from retransmission? */
+};
+
 struct tcp_congestion_ops {
 	struct list_head	list;
 	u32 key;
@@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }
 
+/* From tcp_rate.c */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+			    struct rate_sample *rs);
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+		  struct skb_mstamp *now, struct rate_sample *rs);
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
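
To see how these pieces fit together, here is a minimal sketch of how a congestion control module's cong_control-style hook (run at the end of ACK processing, as the rate_sample comment above describes) might turn a sample into a bandwidth estimate. The function name and the use of the result are hypothetical, not part of this patch; it assumes the declarations above are in scope via net/tcp.h and do_div() via asm/div64.h:

	static void example_cong_control(struct sock *sk,
					 const struct rate_sample *rs)
	{
		u64 bw_pps;	/* bandwidth estimate, packets per second */

		/* Skip invalid samples (no timing information available). */
		if (rs->delivered < 0 || rs->interval_us <= 0)
			return;

		/* packets per second = delivered / (interval_us / 1e6) */
		bw_pps = (u64)rs->delivered * USEC_PER_SEC;
		do_div(bw_pps, (u32)rs->interval_us);

		/* A real module would feed bw_pps and rs->rtt_us into its
		 * cwnd and sk->sk_pacing_rate computations here.
		 */
	}
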
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..9cfff1a0bf71 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-	     tcp_recovery.o \
+	     tcp_rate.o tcp_recovery.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9413288c2778..d9ed4bb96f74 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1112,6 +1112,7 @@ struct tcp_sacktag_state {
 	 */
 	struct skb_mstamp first_sackt;
 	struct skb_mstamp last_sackt;
+	struct rate_sample *rate;
 	int	flag;
 };
1117 1118
@@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
 			&skb->skb_mstamp);
+	tcp_rate_skb_delivered(sk, skb, state->rate);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_advance_highest_sack(sk, skb);
 
 	tcp_skb_collapse_tstamp(prev, skb);
+	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+		TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
 	tcp_unlink_write_queue(skb, sk);
 	sk_wmem_free_skb(sk, skb);
 
@@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						dup_sack,
 						tcp_skb_pcount(skb),
 						&skb->skb_mstamp);
+			tcp_rate_skb_delivered(sk, skb, state->rate);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
 					 num_sacks, prior_snd_una);
-	if (found_dup_sack)
+	if (found_dup_sack) {
 		state->flag |= FLAG_DSACKING_ACK;
+		tp->delivered++; /* A spurious retransmission is delivered */
+	}
 
 	/* Eliminate too old ACKs, but take into
 	 * account more or less fresh ones, they can
@@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       u32 prior_snd_una, int *acked,
-			       struct tcp_sacktag_state *sack)
+			       struct tcp_sacktag_state *sack,
+			       struct skb_mstamp *now)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct skb_mstamp first_ackt, last_ackt, now;
+	struct skb_mstamp first_ackt, last_ackt;
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
@@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			acked_pcount = tcp_tso_acked(sk, skb);
 			if (!acked_pcount)
 				break;
-
 			fully_acked = false;
 		} else {
 			/* Speedup tcp_unlink_write_queue() and next loop */
@@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tp->packets_out -= acked_pcount;
 		pkts_acked += acked_pcount;
+		tcp_rate_skb_delivered(sk, skb, sack->rate);
 
 		/* Initial outgoing SYN's get put onto the write_queue
 		 * just like anything else we transmit. It is not
@@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	skb_mstamp_get(&now);
 	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+		seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
 	}
 	if (sack->first_sackt.v64) {
-		sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-		ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+		sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+		ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
 	}
-
+	sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
 	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
 					ca_rtt_us);
 
@@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+		   sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
@@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_sacktag_state sack_state;
+	struct rate_sample rs = { .prior_delivered = 0 };
 	u32 prior_snd_una = tp->snd_una;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
-	u32 prior_delivered = tp->delivered;
+	u32 delivered = tp->delivered;
+	u32 lost = tp->lost;
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+	struct skb_mstamp now;
 
 	sack_state.first_sackt.v64 = 0;
+	sack_state.rate = &rs;
 
 	/* We very likely will need to access write queue head. */
 	prefetchw(sk->sk_write_queue.next);
@@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
+	skb_mstamp_get(&now);
+
 	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
@@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	}
 
 	prior_fackets = tp->fackets_out;
+	rs.prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-				    &sack_state);
+				    &sack_state, &now);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
-	tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+	lost = tp->lost - lost;			/* freshly marked lost */
+	tcp_rate_gen(sk, delivered, lost, &now, &rs);
+	tcp_cong_control(sk, ack, delivered, flag);
 	tcp_xmit_recovery(sk, rexmit);
 	return 1;
 
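
With made-up numbers, the delta bookkeeping added to tcp_ack() works like this (a sketch of the flow above, not code from the patch):

	u32 delivered = tp->delivered;	/* e.g. 100 when the ACK arrives */
	u32 lost = tp->lost;		/* e.g. 7 */

	/* ... SACK and cumulative-ACK processing bumps tp->delivered
	 * to 103 and tp->lost to 8 ...
	 */

	delivered = tp->delivered - delivered;	/* 103 - 100 = 3 freshly (S)ACKed */
	lost = tp->lost - lost;			/* 8 - 7 = 1 freshly marked lost */
	tcp_rate_gen(sk, delivered, lost, &now, &rs);	/* rs.acked_sacked = 3, rs.losses = 1 */
	tcp_cong_control(sk, ack, delivered, flag);
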
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8b45794eb6b2..e02c8ebf3ed4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		skb_mstamp_get(&skb->skb_mstamp);
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
+		tcp_rate_skb_sent(sk, skb);
 
 		if (unlikely(skb_cloned(skb)))
 			skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	tcp_set_skb_tso_segs(skb, mss_now);
 	tcp_set_skb_tso_segs(buff, mss_now);
 
+	/* Update delivered info for the new segment */
+	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
 	/* If this packet has been sent out already, we must
 	 * adjust the various packet counters.
 	 */
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..1daed6af6e80
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,149 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
+ *    bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ */
+
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* In general we need to start delivery rate samples from the
+	 * time we received the most recent ACK, to ensure we include
+	 * the full time the network needs to deliver all in-flight
+	 * packets. If there are no packets in flight yet, then we
+	 * know that any ACKs after now indicate that the network was
+	 * able to deliver those packets completely in the sampling
+	 * interval between now and the next ACK.
+	 *
+	 * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+	 * because the latter is a guess based on RTO and loss-marking
+	 * heuristics. We don't want spurious RTOs or loss markings to cause
+	 * a spuriously small time interval, causing a spuriously high
+	 * bandwidth estimate.
+	 */
+	if (!tp->packets_out) {
+		tp->first_tx_mstamp  = skb->skb_mstamp;
+		tp->delivered_mstamp = skb->skb_mstamp;
+	}
+
+	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+			    struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+	if (!scb->tx.delivered_mstamp.v64)
+		return;
+
+	if (!rs->prior_delivered ||
+	    after(scb->tx.delivered, rs->prior_delivered)) {
+		rs->prior_delivered  = scb->tx.delivered;
+		rs->prior_mstamp     = scb->tx.delivered_mstamp;
+		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
+
+		/* Find the duration of the "send phase" of this window: */
+		rs->interval_us      = skb_mstamp_us_delta(
+						&skb->skb_mstamp,
+						&scb->tx.first_tx_mstamp);
+
+		/* Record send time of most recently ACKed packet: */
+		tp->first_tx_mstamp  = skb->skb_mstamp;
+	}
+	/* Mark off the skb delivered once it's sacked to avoid being
+	 * used again when it's cumulatively acked. For acked packets
+	 * we don't need to reset since it'll be freed soon.
+	 */
+	if (scb->sacked & TCPCB_SACKED_ACKED)
+		scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+		  struct skb_mstamp *now, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 snd_us, ack_us;
+
+	/* TODO: there are multiple places throughout tcp_ack() to get
+	 * current time. Refactor the code using a new "tcp_acktag_state"
+	 * to carry current time, flags, stats like "tcp_sacktag_state".
+	 */
+	if (delivered)
+		tp->delivered_mstamp = *now;
+
+	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
+	rs->losses = lost;		/* freshly marked lost */
+	/* Return an invalid sample if no timing information is available. */
+	if (!rs->prior_mstamp.v64) {
+		rs->delivered = -1;
+		rs->interval_us = -1;
+		return;
+	}
+	rs->delivered = tp->delivered - rs->prior_delivered;
+
+	/* Model sending data and receiving ACKs as separate pipeline phases
+	 * for a window. Usually the ACK phase is longer, but with ACK
+	 * compression the send phase can be longer. To be safe we use the
+	 * longer phase.
+	 */
+	snd_us = rs->interval_us;				/* send phase */
+	ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp);	/* ack phase */
+	rs->interval_us = max(snd_us, ack_us);
+
+	/* Normally we expect interval_us >= min-rtt.
+	 * Note that rate may still be over-estimated when a spuriously
+	 * retransmitted skb was first (s)acked because "interval_us"
+	 * is under-estimated (up to an RTT). However continuously
+	 * measuring the delivery rate during loss recovery is crucial
+	 * for connections that suffer heavy or prolonged losses.
+	 */
+	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+		rs->interval_us = -1;
+		if (!rs->is_retrans)
+			pr_debug("tcp rate: %ld %d %u %u %u\n",
+				 rs->interval_us, rs->delivered,
+				 inet_csk(sk)->icsk_ca_state,
+				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+	}
+}
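
As a worked example of the max(snd_us, ack_us) rule above: both candidate rates share the same numerator (packets delivered), so taking the longer interval is equivalent to taking the slower rate, i.e. bw = min(send_rate, ack_rate). A small standalone illustration with made-up numbers (userspace C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long delivered   = 10;		/* packets (S)ACKed over the interval */
		long snd_us      = 20000;	/* send phase duration, microseconds */
		long ack_us      = 25000;	/* ack phase duration, microseconds */
		long interval_us = snd_us > ack_us ? snd_us : ack_us;

		/* 10 pkts / 25000 us = 400 pkts/sec: the slower (ACK-phase) rate wins */
		printf("bw = %ld pkts/sec\n", delivered * 1000000L / interval_us);
		return 0;
	}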