diff options
| author | Eric Dumazet <edumazet@google.com> | 2014-02-26 17:02:48 -0500 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2014-02-26 17:08:40 -0500 |
| commit | 740b0f1841f6e39085b711d41db9ffb07198682b (patch) | |
| tree | 7befd549fc20c51bff4c79790ad4520fcc0e324e /include | |
| parent | 363ec392352e55c61ce2799c3f15f89f9429bba7 (diff) | |
tcp: switch rtt estimations to usec resolution
Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.
FQ/pacing in DC environments also requires this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'.
TCP_CONG_RTT_STAMP is no longer needed.
As Julian Anastasov pointed out, we need to keep user compatibility :
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.
In this example, the ss command displays an srtt of 32 usecs (10Gbit link):
lpk51:~# ./ss -i dst lpk52
Netid State Recv-Q Send-Q Local Address:Port Peer
Address:Port
tcp ESTAB 0 1 10.246.11.51:42959
10.246.11.52:64614
cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559
Updated iproute2 ip command displays :
lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51
Old binary displays :
lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51
With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/tcp.h | 8 | ||||
| -rw-r--r-- | include/net/tcp.h | 10 | ||||
| -rw-r--r-- | include/uapi/linux/tcp_metrics.h | 7 |
3 files changed, 16 insertions, 9 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4ad0706d40eb..239946868142 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
| @@ -201,10 +201,10 @@ struct tcp_sock { | |||
| 201 | u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ | 201 | u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ |
| 202 | 202 | ||
| 203 | /* RTT measurement */ | 203 | /* RTT measurement */ |
| 204 | u32 srtt; /* smoothed round trip time << 3 */ | 204 | u32 srtt_us; /* smoothed round trip time << 3 in usecs */ |
| 205 | u32 mdev; /* medium deviation */ | 205 | u32 mdev_us; /* medium deviation */ |
| 206 | u32 mdev_max; /* maximal mdev for the last rtt period */ | 206 | u32 mdev_max_us; /* maximal mdev for the last rtt period */ |
| 207 | u32 rttvar; /* smoothed mdev_max */ | 207 | u32 rttvar_us; /* smoothed mdev_max */ |
| 208 | u32 rtt_seq; /* sequence number to update rttvar */ | 208 | u32 rtt_seq; /* sequence number to update rttvar */ |
| 209 | 209 | ||
| 210 | u32 packets_out; /* Packets which are "in flight" */ | 210 | u32 packets_out; /* Packets which are "in flight" */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 1f820537741a..93eab0b9da60 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/crypto.h> | 31 | #include <linux/crypto.h> |
| 32 | #include <linux/cryptohash.h> | 32 | #include <linux/cryptohash.h> |
| 33 | #include <linux/kref.h> | 33 | #include <linux/kref.h> |
| 34 | #include <linux/ktime.h> | ||
| 34 | 35 | ||
| 35 | #include <net/inet_connection_sock.h> | 36 | #include <net/inet_connection_sock.h> |
| 36 | #include <net/inet_timewait_sock.h> | 37 | #include <net/inet_timewait_sock.h> |
| @@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, | |||
| 478 | struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | 479 | struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, |
| 479 | struct ip_options *opt); | 480 | struct ip_options *opt); |
| 480 | #ifdef CONFIG_SYN_COOKIES | 481 | #ifdef CONFIG_SYN_COOKIES |
| 481 | #include <linux/ktime.h> | ||
| 482 | 482 | ||
| 483 | /* Syncookies use a monotonic timer which increments every 64 seconds. | 483 | /* Syncookies use a monotonic timer which increments every 64 seconds. |
| 484 | * This counter is used both as a hash input and partially encoded into | 484 | * This counter is used both as a hash input and partially encoded into |
| @@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk) | |||
| 619 | 619 | ||
| 620 | static inline u32 __tcp_set_rto(const struct tcp_sock *tp) | 620 | static inline u32 __tcp_set_rto(const struct tcp_sock *tp) |
| 621 | { | 621 | { |
| 622 | return (tp->srtt >> 3) + tp->rttvar; | 622 | return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); |
| 623 | } | 623 | } |
| 624 | 624 | ||
| 625 | static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) | 625 | static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) |
| @@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk) | |||
| 656 | return rto_min; | 656 | return rto_min; |
| 657 | } | 657 | } |
| 658 | 658 | ||
| 659 | static inline u32 tcp_rto_min_us(struct sock *sk) | ||
| 660 | { | ||
| 661 | return jiffies_to_usecs(tcp_rto_min(sk)); | ||
| 662 | } | ||
| 663 | |||
| 659 | /* Compute the actual receive window we are currently advertising. | 664 | /* Compute the actual receive window we are currently advertising. |
| 660 | * Rcv_nxt can be after the window if our peer push more data | 665 | * Rcv_nxt can be after the window if our peer push more data |
| 661 | * than the offered window. | 666 | * than the offered window. |
| @@ -778,7 +783,6 @@ enum tcp_ca_event { | |||
| 778 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) | 783 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) |
| 779 | 784 | ||
| 780 | #define TCP_CONG_NON_RESTRICTED 0x1 | 785 | #define TCP_CONG_NON_RESTRICTED 0x1 |
| 781 | #define TCP_CONG_RTT_STAMP 0x2 | ||
| 782 | 786 | ||
| 783 | struct tcp_congestion_ops { | 787 | struct tcp_congestion_ops { |
| 784 | struct list_head list; | 788 | struct list_head list; |
diff --git a/include/uapi/linux/tcp_metrics.h b/include/uapi/linux/tcp_metrics.h index 54a37b13f2c4..93533926035c 100644 --- a/include/uapi/linux/tcp_metrics.h +++ b/include/uapi/linux/tcp_metrics.h | |||
| @@ -11,12 +11,15 @@ | |||
| 11 | #define TCP_METRICS_GENL_VERSION 0x1 | 11 | #define TCP_METRICS_GENL_VERSION 0x1 |
| 12 | 12 | ||
| 13 | enum tcp_metric_index { | 13 | enum tcp_metric_index { |
| 14 | TCP_METRIC_RTT, | 14 | TCP_METRIC_RTT, /* in ms units */ |
| 15 | TCP_METRIC_RTTVAR, | 15 | TCP_METRIC_RTTVAR, /* in ms units */ |
| 16 | TCP_METRIC_SSTHRESH, | 16 | TCP_METRIC_SSTHRESH, |
| 17 | TCP_METRIC_CWND, | 17 | TCP_METRIC_CWND, |
| 18 | TCP_METRIC_REORDERING, | 18 | TCP_METRIC_REORDERING, |
| 19 | 19 | ||
| 20 | TCP_METRIC_RTT_US, /* in usec units */ | ||
| 21 | TCP_METRIC_RTTVAR_US, /* in usec units */ | ||
| 22 | |||
| 20 | /* Always last. */ | 23 | /* Always last. */ |
| 21 | __TCP_METRIC_MAX, | 24 | __TCP_METRIC_MAX, |
| 22 | }; | 25 | }; |
