author		Florian Westphal <fw@strlen.de>		2014-11-03 11:35:03 -0500
committer	David S. Miller <davem@davemloft.net>	2014-11-04 16:06:09 -0500
commit		f7b3bec6f5167efaf56b756abfafb924cb1d3050 (patch)
tree		511fb5930fd9d2eaeb040c901287dbc82c8d4c0d
parent		f1673381b1481a409238d4552a0700d490c5b36c (diff)
net: allow setting ecn via routing table
This patch allows ECN to be set on a per-route basis in case the sysctl
tcp_ecn is not set to 1. In other words, when ECN is set for specific
routes, it provides a tcp_ecn=1 behaviour for that route while the rest
of the stack acts according to the global settings.

One can use 'ip route change dev $dev $net features ecn' to toggle this.

Having a more fine-grained per-route setting can be beneficial for
various reasons, for example: 1) within data centers, or 2) local ISPs
may deploy ECN support for their own video/streaming services [1], etc.

There was a recent measurement study/paper [2] which scanned Alexa's
publicly available top-million websites list from vantage points in the
US, Europe and Asia: half of the Alexa list will now happily use ECN
(tcp_ecn=2, most likely thanks to commit 255cac91c3 ("tcp: extend ECN
sysctl to allow server-side only ECN") ;)); a break in on-path
connectivity was found in about 1 in 10,000 cases. Timeouts, rather than
RSTs coming back, were much more common in the negotiation phase (and
mostly seen in the Alexa middle band, ranks around 50k-150k): of the
roughly 12,000 hosts on which there _may_ be ECN-linked connection
failures, only 79 failed with an RST while _not_ failing with an RST
when ECN is not requested.

It is unclear, though, how much equipment in the wild actually marks CE
when buffers start to fill up.

We thought about a fallback to non-ECN for retransmitted SYNs as another
global option (which could perhaps one day be made the default), but as
Eric points out, much more work is needed to detect broken middleboxes.
Two examples Eric mentioned are buggy firewalls that accept only a
single SYN per flow, and middleboxes that successfully let an ECN flow
establish but later mark CE for all packets (so cwnd converges to 1).

[1] http://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf, p.15
[2] http://ecn.ethz.ch/

Joint work with Daniel Borkmann.

Reference: http://thread.gmane.org/gmane.linux.network/335797
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
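As a quick orientation before the hunks below: on the active-open side
(tcp_ecn_send_syn), ECN ends up being requested when the global sysctl is 1,
when the congestion control algorithm needs ECN, or when the route carries the
RTAX_FEATURE_ECN flag. The following is a minimal, illustration-only C sketch
of that combined check; route_wants_ecn() is a hypothetical name and not part
of the patch or the kernel, and the tcp_ca_needs_ecn() case is omitted for
brevity. It only uses the dst_feature() helper and netns sysctl field that
appear in the diff.

/* Illustration only: a simplified restatement of the check added to
 * tcp_ecn_send_syn() below; route_wants_ecn() itself is hypothetical.
 */
#include <net/dst.h>		/* struct dst_entry, dst_feature() */
#include <net/net_namespace.h>	/* struct net */

static bool route_wants_ecn(const struct net *net,
			    const struct dst_entry *dst)
{
	/* tcp_ecn=1 already requests ECN on all outgoing connections. */
	if (net->ipv4.sysctl_tcp_ecn == 1)
		return true;

	/* Otherwise, honour a per-route 'features ecn' setting. */
	return dst && dst_feature(dst, RTAX_FEATURE_ECN);
}

The passive-open and syncookie paths apply the same idea with the looser
non-zero tcp_ecn test, as seen in tcp_ecn_create_request() and cookie_ecn_ok()
in the hunks below.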
-rw-r--r--	include/net/tcp.h	|  2
-rw-r--r--	net/ipv4/syncookies.c	|  6
-rw-r--r--	net/ipv4/tcp_input.c	| 25
-rw-r--r--	net/ipv4/tcp_output.c	| 13
-rw-r--r--	net/ipv6/syncookies.c	|  2
5 files changed, 31 insertions, 17 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 36c5084964cd..f50f29faf76f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -493,7 +493,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
 __u32 cookie_init_timestamp(struct request_sock *req);
 bool cookie_timestamp_decode(struct tcp_options_received *opt);
 bool cookie_ecn_ok(const struct tcp_options_received *opt,
-		   const struct net *net);
+		   const struct net *net, const struct dst_entry *dst);
 
 /* From net/ipv6/syncookies.c */
 int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 6de772500ee9..45fe60c5238e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -273,7 +273,7 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
 EXPORT_SYMBOL(cookie_timestamp_decode);
 
 bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
-		   const struct net *net)
+		   const struct net *net, const struct dst_entry *dst)
 {
 	bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
 
@@ -283,7 +283,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
 	if (net->ipv4.sysctl_tcp_ecn)
 		return true;
 
-	return false;
+	return dst_feature(dst, RTAX_FEATURE_ECN);
 }
 EXPORT_SYMBOL(cookie_ecn_ok);
 
@@ -387,7 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 				  dst_metric(&rt->dst, RTAX_INITRWND));
 
 	ireq->rcv_wscale = rcv_wscale;
-	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk));
+	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
 
 	ret = get_cookie_sock(sk, skb, req, &rt->dst);
 	/* ip_queue_xmit() depends on our flow being setup
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4617e90417..196b4388116c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5876,20 +5876,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  */
 static void tcp_ecn_create_request(struct request_sock *req,
 				   const struct sk_buff *skb,
-				   const struct sock *listen_sk)
+				   const struct sock *listen_sk,
+				   const struct dst_entry *dst)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
-	bool ect, need_ecn;
+	bool ect, need_ecn, ecn_ok;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
 	need_ecn = tcp_ca_needs_ecn(listen_sk);
+	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
 
-	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+	if (!ect && !need_ecn && ecn_ok)
 		inet_rsk(req)->ecn_ok = 1;
 	else if (ect && need_ecn)
 		inet_rsk(req)->ecn_ok = 1;
@@ -5954,13 +5956,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (!want_cookie || tmp_opt.tstamp_ok)
-		tcp_ecn_create_request(req, skb, sk);
-
-	if (want_cookie) {
-		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-	} else if (!isn) {
+	if (!want_cookie && !isn) {
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
 		 * state TIME-WAIT, and check against it before
@@ -6008,6 +6004,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop_and_free;
 	}
 
+	tcp_ecn_create_request(req, skb, sk, dst);
+
+	if (want_cookie) {
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
+		if (!tmp_opt.tstamp_ok)
+			inet_rsk(req)->ecn_ok = 0;
+	}
+
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a3d453b94747..0b88158dd4a7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+		       tcp_ca_needs_ecn(sk);
+
+	if (!use_ecn) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+			use_ecn = true;
+	}
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-	    tcp_ca_needs_ecn(sk)) {
+
+	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 52cc8cb02c0c..7337fc7947e2 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -262,7 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 				  dst_metric(dst, RTAX_INITRWND));
 
 	ireq->rcv_wscale = rcv_wscale;
-	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk));
+	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
 
 	ret = get_cookie_sock(sk, skb, req, dst);
 out: