summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorSoheil Hassas Yeganeh <soheil@google.com>2017-03-15 16:30:45 -0400
committerDavid S. Miller <davem@davemloft.net>2017-03-16 23:33:56 -0400
commitd82bae12dc38d79a2b77473f5eb0612a3d69c55b (patch)
tree677ef02fce0b832e01c02c21f37fa7d653663104
parent8b705f5241adb2d0b5d009abea5a865601666974 (diff)
tcp: remove per-destination timestamp cache
Commit 8a5bd45f6616 (tcp: randomize tcp timestamp offsets for each connection) randomizes TCP timestamps per connection. After this commit, there is no guarantee that the timestamps received from the same destination are monotonically increasing. As a result, the per-destination timestamp cache in TCP metrics (i.e., tcpm_ts in struct tcp_metrics_block) is broken and cannot be relied upon. Remove the per-destination timestamp cache and all related code paths.

Note that this cache was already broken for caching timestamps of multiple machines behind a NAT sharing the same address.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Cc: Lutz Vieweg <lvml@5t9.de>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/tcp.h6
-rw-r--r--net/ipv4/tcp_input.c6
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/tcp_metrics.c147
-rw-r--r--net/ipv4/tcp_minisocks.c22
-rw-r--r--net/ipv6/tcp_ipv6.c5
6 files changed, 11 insertions, 179 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bede8f7fa742..c81f3b958d44 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -406,11 +406,7 @@ void tcp_clear_retrans(struct tcp_sock *tp);
406void tcp_update_metrics(struct sock *sk); 406void tcp_update_metrics(struct sock *sk);
407void tcp_init_metrics(struct sock *sk); 407void tcp_init_metrics(struct sock *sk);
408void tcp_metrics_init(void); 408void tcp_metrics_init(void);
409bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, 409bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
410 bool paws_check, bool timestamps);
411bool tcp_remember_stamp(struct sock *sk);
412bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
413void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
414void tcp_disable_fack(struct tcp_sock *tp); 410void tcp_disable_fack(struct tcp_sock *tp);
415void tcp_close(struct sock *sk, long timeout); 411void tcp_close(struct sock *sk, long timeout);
416void tcp_init_sock(struct sock *sk); 412void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 96b67a8b18c3..aafec0676d3e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6342,8 +6342,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6342 dst = af_ops->route_req(sk, &fl, req, &strict); 6342 dst = af_ops->route_req(sk, &fl, req, &strict);
6343 6343
6344 if (dst && strict && 6344 if (dst && strict &&
6345 !tcp_peer_is_proven(req, dst, true, 6345 !tcp_peer_is_proven(req, dst)) {
6346 tmp_opt.saw_tstamp)) {
6347 NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 6346 NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6348 goto drop_and_release; 6347 goto drop_and_release;
6349 } 6348 }
@@ -6352,8 +6351,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
6352 else if (!net->ipv4.sysctl_tcp_syncookies && 6351 else if (!net->ipv4.sysctl_tcp_syncookies &&
6353 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 6352 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6354 (net->ipv4.sysctl_max_syn_backlog >> 2)) && 6353 (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6355 !tcp_peer_is_proven(req, dst, false, 6354 !tcp_peer_is_proven(req, dst)) {
6356 tmp_opt.saw_tstamp)) {
6357 /* Without syncookies last quarter of 6355 /* Without syncookies last quarter of
6358 * backlog is filled with destinations, 6356 * backlog is filled with destinations,
6359 * proven to be alive. 6357 * proven to be alive.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 08d870e45658..d8b401fff9fe 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -198,10 +198,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 tp->write_seq = 0; 198 tp->write_seq = 0;
199 } 199 }
200 200
201 if (tcp_death_row->sysctl_tw_recycle &&
202 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205 inet->inet_dport = usin->sin_port; 201 inet->inet_dport = usin->sin_port;
206 sk_daddr_set(sk, daddr); 202 sk_daddr_set(sk, daddr);
207 203
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0f46e5fe31ad..9d0d4f39e42b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -45,8 +45,6 @@ struct tcp_metrics_block {
45 struct inetpeer_addr tcpm_saddr; 45 struct inetpeer_addr tcpm_saddr;
46 struct inetpeer_addr tcpm_daddr; 46 struct inetpeer_addr tcpm_daddr;
47 unsigned long tcpm_stamp; 47 unsigned long tcpm_stamp;
48 u32 tcpm_ts;
49 u32 tcpm_ts_stamp;
50 u32 tcpm_lock; 48 u32 tcpm_lock;
51 u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; 49 u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
52 struct tcp_fastopen_metrics tcpm_fastopen; 50 struct tcp_fastopen_metrics tcpm_fastopen;
@@ -123,8 +121,6 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
123 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); 121 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
124 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); 122 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
125 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); 123 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
126 tm->tcpm_ts = 0;
127 tm->tcpm_ts_stamp = 0;
128 if (fastopen_clear) { 124 if (fastopen_clear) {
129 tm->tcpm_fastopen.mss = 0; 125 tm->tcpm_fastopen.mss = 0;
130 tm->tcpm_fastopen.syn_loss = 0; 126 tm->tcpm_fastopen.syn_loss = 0;
@@ -273,48 +269,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
273 return tm; 269 return tm;
274} 270}
275 271
276static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
277{
278 struct tcp_metrics_block *tm;
279 struct inetpeer_addr saddr, daddr;
280 unsigned int hash;
281 struct net *net;
282
283 if (tw->tw_family == AF_INET) {
284 inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
285 inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
286 hash = ipv4_addr_hash(tw->tw_daddr);
287 }
288#if IS_ENABLED(CONFIG_IPV6)
289 else if (tw->tw_family == AF_INET6) {
290 if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
291 inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr);
292 inetpeer_set_addr_v4(&daddr, tw->tw_daddr);
293 hash = ipv4_addr_hash(tw->tw_daddr);
294 } else {
295 inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr);
296 inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr);
297 hash = ipv6_addr_hash(&tw->tw_v6_daddr);
298 }
299 }
300#endif
301 else
302 return NULL;
303
304 net = twsk_net(tw);
305 hash ^= net_hash_mix(net);
306 hash = hash_32(hash, tcp_metrics_hash_log);
307
308 for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
309 tm = rcu_dereference(tm->tcpm_next)) {
310 if (addr_same(&tm->tcpm_saddr, &saddr) &&
311 addr_same(&tm->tcpm_daddr, &daddr) &&
312 net_eq(tm_net(tm), net))
313 break;
314 }
315 return tm;
316}
317
318static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, 272static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
319 struct dst_entry *dst, 273 struct dst_entry *dst,
320 bool create) 274 bool create)
@@ -573,8 +527,7 @@ reset:
573 tp->snd_cwnd_stamp = tcp_time_stamp; 527 tp->snd_cwnd_stamp = tcp_time_stamp;
574} 528}
575 529
576bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, 530bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
577 bool paws_check, bool timestamps)
578{ 531{
579 struct tcp_metrics_block *tm; 532 struct tcp_metrics_block *tm;
580 bool ret; 533 bool ret;
@@ -584,94 +537,10 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
584 537
585 rcu_read_lock(); 538 rcu_read_lock();
586 tm = __tcp_get_metrics_req(req, dst); 539 tm = __tcp_get_metrics_req(req, dst);
587 if (paws_check) { 540 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
588 if (tm &&
589 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
590 ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
591 !timestamps))
592 ret = false;
593 else
594 ret = true;
595 } else {
596 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
597 ret = true;
598 else
599 ret = false;
600 }
601 rcu_read_unlock();
602
603 return ret;
604}
605
606void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
607{
608 struct tcp_metrics_block *tm;
609
610 rcu_read_lock();
611 tm = tcp_get_metrics(sk, dst, true);
612 if (tm) {
613 struct tcp_sock *tp = tcp_sk(sk);
614
615 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
616 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
617 tp->rx_opt.ts_recent = tm->tcpm_ts;
618 }
619 }
620 rcu_read_unlock();
621}
622EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
623
624/* VJ's idea. Save last timestamp seen from this destination and hold
625 * it at least for normal timewait interval to use for duplicate
626 * segment detection in subsequent connections, before they enter
627 * synchronized state.
628 */
629bool tcp_remember_stamp(struct sock *sk)
630{
631 struct dst_entry *dst = __sk_dst_get(sk);
632 bool ret = false;
633
634 if (dst) {
635 struct tcp_metrics_block *tm;
636
637 rcu_read_lock();
638 tm = tcp_get_metrics(sk, dst, true);
639 if (tm) {
640 struct tcp_sock *tp = tcp_sk(sk);
641
642 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
643 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
644 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
645 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
646 tm->tcpm_ts = tp->rx_opt.ts_recent;
647 }
648 ret = true;
649 }
650 rcu_read_unlock();
651 }
652 return ret;
653}
654
655bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
656{
657 struct tcp_metrics_block *tm;
658 bool ret = false;
659
660 rcu_read_lock();
661 tm = __tcp_get_metrics_tw(tw);
662 if (tm) {
663 const struct tcp_timewait_sock *tcptw;
664 struct sock *sk = (struct sock *) tw;
665
666 tcptw = tcp_twsk(sk);
667 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
668 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
669 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
670 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
671 tm->tcpm_ts = tcptw->tw_ts_recent;
672 }
673 ret = true; 541 ret = true;
674 } 542 else
543 ret = false;
675 rcu_read_unlock(); 544 rcu_read_unlock();
676 545
677 return ret; 546 return ret;
@@ -791,14 +660,6 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
791 jiffies - tm->tcpm_stamp, 660 jiffies - tm->tcpm_stamp,
792 TCP_METRICS_ATTR_PAD) < 0) 661 TCP_METRICS_ATTR_PAD) < 0)
793 goto nla_put_failure; 662 goto nla_put_failure;
794 if (tm->tcpm_ts_stamp) {
795 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
796 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
797 goto nla_put_failure;
798 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
799 tm->tcpm_ts) < 0)
800 goto nla_put_failure;
801 }
802 663
803 { 664 {
804 int n = 0; 665 int n = 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7e16243cdb58..692f974e5abe 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -94,7 +94,6 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
94 struct tcp_options_received tmp_opt; 94 struct tcp_options_received tmp_opt;
95 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 95 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
96 bool paws_reject = false; 96 bool paws_reject = false;
97 struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
98 97
99 tmp_opt.saw_tstamp = 0; 98 tmp_opt.saw_tstamp = 0;
100 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 99 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -149,12 +148,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 148 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
150 } 149 }
151 150
152 if (tcp_death_row->sysctl_tw_recycle && 151 inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
153 tcptw->tw_ts_recent_stamp &&
154 tcp_tw_remember_stamp(tw))
155 inet_twsk_reschedule(tw, tw->tw_timeout);
156 else
157 inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
158 return TCP_TW_ACK; 152 return TCP_TW_ACK;
159 } 153 }
160 154
@@ -259,12 +253,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
259 const struct inet_connection_sock *icsk = inet_csk(sk); 253 const struct inet_connection_sock *icsk = inet_csk(sk);
260 const struct tcp_sock *tp = tcp_sk(sk); 254 const struct tcp_sock *tp = tcp_sk(sk);
261 struct inet_timewait_sock *tw; 255 struct inet_timewait_sock *tw;
262 bool recycle_ok = false;
263 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 256 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
264 257
265 if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
266 recycle_ok = tcp_remember_stamp(sk);
267
268 tw = inet_twsk_alloc(sk, tcp_death_row, state); 258 tw = inet_twsk_alloc(sk, tcp_death_row, state);
269 259
270 if (tw) { 260 if (tw) {
@@ -317,13 +307,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
317 if (timeo < rto) 307 if (timeo < rto)
318 timeo = rto; 308 timeo = rto;
319 309
320 if (recycle_ok) { 310 tw->tw_timeout = TCP_TIMEWAIT_LEN;
321 tw->tw_timeout = rto; 311 if (state == TCP_TIME_WAIT)
322 } else { 312 timeo = TCP_TIMEWAIT_LEN;
323 tw->tw_timeout = TCP_TIMEWAIT_LEN;
324 if (state == TCP_TIME_WAIT)
325 timeo = TCP_TIMEWAIT_LEN;
326 }
327 313
328 inet_twsk_schedule(tw, timeo); 314 inet_twsk_schedule(tw, timeo);
329 /* Linkage updates. */ 315 /* Linkage updates. */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c73a431fd06f..853cb43e3e3c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -265,11 +265,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
265 sk->sk_gso_type = SKB_GSO_TCPV6; 265 sk->sk_gso_type = SKB_GSO_TCPV6;
266 ip6_dst_store(sk, dst, NULL, NULL); 266 ip6_dst_store(sk, dst, NULL, NULL);
267 267
268 if (tcp_death_row->sysctl_tw_recycle &&
269 !tp->rx_opt.ts_recent_stamp &&
270 ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr))
271 tcp_fetch_timewait_stamp(sk, dst);
272
273 icsk->icsk_ext_hdr_len = 0; 268 icsk->icsk_ext_hdr_len = 0;
274 if (opt) 269 if (opt)
275 icsk->icsk_ext_hdr_len = opt->opt_flen + 270 icsk->icsk_ext_hdr_len = opt->opt_flen +