aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorYuchung Cheng <ycheng@google.com>2016-09-19 23:39:14 -0400
committerDavid S. Miller <davem@davemloft.net>2016-09-21 00:23:00 -0400
commitb9f64820fb226a4e8ab10591f46cecd91ca56b30 (patch)
tree0486be41c1a85db592e675a182fa99f71605e018 /include
parent0682e6902a52aca7caf6ad42551b16ea0f87bc31 (diff)
tcp: track data delivery rate for a TCP connection
This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series. Key state: tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field. tp->delivered_mstamp: the last time tp->delivered was updated. Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis: d1: the current tp->delivered after processing the ACK; t1: the current time after processing the ACK; d0: the prior tp->delivered when the acked skb was transmitted; t0: the prior tp->delivered_mstamp when the acked skb was transmitted. When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us. One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp. At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and t1 are times at which packets were marked delivered. If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. 
We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt) In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs. Signed-off-by: Van Jacobson <vanj@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Nandita Dukkipati <nanditad@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/tcp.h2
-rw-r--r--include/net/tcp.h35
2 files changed, 36 insertions, 1 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 38590fbc0ac5..c50e6aec005a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -268,6 +268,8 @@ struct tcp_sock {
268 u32 prr_out; /* Total number of pkts sent during Recovery. */ 268 u32 prr_out; /* Total number of pkts sent during Recovery. */
269 u32 delivered; /* Total data packets delivered incl. rexmits */ 269 u32 delivered; /* Total data packets delivered incl. rexmits */
270 u32 lost; /* Total data packets lost incl. rexmits */ 270 u32 lost; /* Total data packets lost incl. rexmits */
271 struct skb_mstamp first_tx_mstamp; /* start of window send phase */
272 struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
271 273
272 u32 rcv_wnd; /* Current receiver window */ 274 u32 rcv_wnd; /* Current receiver window */
273 u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ 275 u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2f1648af4d12..b261c892605a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,8 +763,14 @@ struct tcp_skb_cb {
763 __u32 ack_seq; /* Sequence number ACK'd */ 763 __u32 ack_seq; /* Sequence number ACK'd */
764 union { 764 union {
765 struct { 765 struct {
766 /* There is space for up to 20 bytes */ 766 /* There is space for up to 24 bytes */
767 __u32 in_flight;/* Bytes in flight when packet sent */ 767 __u32 in_flight;/* Bytes in flight when packet sent */
768 /* pkts S/ACKed so far upon tx of skb, incl retrans: */
769 __u32 delivered;
770 /* start of send pipeline phase */
771 struct skb_mstamp first_tx_mstamp;
772 /* when we reached the "delivered" count */
773 struct skb_mstamp delivered_mstamp;
768 } tx; /* only used for outgoing skbs */ 774 } tx; /* only used for outgoing skbs */
769 union { 775 union {
770 struct inet_skb_parm h4; 776 struct inet_skb_parm h4;
@@ -860,6 +866,26 @@ struct ack_sample {
860 u32 in_flight; 866 u32 in_flight;
861}; 867};
862 868
869/* A rate sample measures the number of (original/retransmitted) data
870 * packets delivered "delivered" over an interval of time "interval_us".
871 * The tcp_rate.c code fills in the rate sample, and congestion
872 * control modules that define a cong_control function to run at the end
873 * of ACK processing can optionally choose to consult this sample when
874 * setting cwnd and pacing rate.
875 * A sample is invalid if "delivered" or "interval_us" is negative.
876 */
877struct rate_sample {
878 struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
879 u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
880 s32 delivered; /* number of packets delivered over interval */
881 long interval_us; /* time for tp->delivered to incr "delivered" */
882 long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
883 int losses; /* number of packets marked lost upon ACK */
884 u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
885 u32 prior_in_flight; /* in flight before this ACK */
886 bool is_retrans; /* is sample from retransmission? */
887};
888
863struct tcp_congestion_ops { 889struct tcp_congestion_ops {
864 struct list_head list; 890 struct list_head list;
865 u32 key; 891 u32 key;
@@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
946 icsk->icsk_ca_ops->cwnd_event(sk, event); 972 icsk->icsk_ca_ops->cwnd_event(sk, event);
947} 973}
948 974
975/* From tcp_rate.c */
976void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
977void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
978 struct rate_sample *rs);
979void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
980 struct skb_mstamp *now, struct rate_sample *rs);
981
949/* These functions determine how the current flow behaves in respect of SACK 982/* These functions determine how the current flow behaves in respect of SACK
950 * handling. SACK is negotiated with the peer, and therefore it can vary 983 * handling. SACK is negotiated with the peer, and therefore it can vary
951 * between different flows. 984 * between different flows.