aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorFlorian Westphal <fw@strlen.de>2015-01-30 14:45:20 -0500
committerDavid S. Miller <davem@davemloft.net>2015-02-02 21:48:55 -0500
commit843c2fdf7a12951815d981fa431d7e04d8259332 (patch)
tree0370af9edd51202331cf9327a5eeff6cd5de9104 /net/ipv4
parent6942241616bb1420f02a32fdeae033134185438b (diff)
net: dctcp: loosen requirement to assert ECT(0) during 3WHS
One deployment requirement of DCTCP is to be able to run in a DC setting along with TCP traffic. As Glenn Judd's NSDI'15 paper "Attaining the Promise and Avoiding the Pitfalls of TCP in the Datacenter" [1] (tba) explains, one way to solve this on the switch side is to split DCTCP and TCP traffic into two queues per switch port based on the DSCP: one queue solely intended for DCTCP traffic and one for non-DCTCP traffic. For the DCTCP queue, there's the marking threshold K as explained in commit e3118e8359bb ("net: tcp: add DCTCP congestion control algorithm") for RED marking ECT(0) packets with CE. For the non-DCTCP queue, there's, e.g., a classic tail drop queue. As already explained in e3118e8359bb, running DCTCP at scale when not marking SYN/SYN-ACK packets with ECT(0) has severe consequences, as for non-ECT(0) packets, traversing the RED marking DCTCP queue will result in a severe reduction of connection probability. This is due to the DCTCP queue being dominated by ECT(0) traffic, and switches handle non-ECT traffic in the RED marking queue after passing K as drops, where K is usually a low watermark in order to leave enough tailroom for bursts. Splitting DCTCP traffic among several queues (ECN and non-ECN queue) is considered a terrible idea in the network community as it splits single flows across multiple network paths. Therefore, commit e3118e8359bb implements this on Linux as ECT(0) marked traffic, as we argue that marking all packets of a DCTCP flow is the only viable solution and also doesn't speak against the draft. However, recently, a DCTCP implementation for FreeBSD also hit their mainline kernel [2]. In order to let them play well together with Linux' DCTCP, we need to loosen the requirement that ECT(0) has to be asserted during the 3WHS, as this is not implemented in FreeBSD. This simplifies the ECN test and lets DCTCP work together with FreeBSD. Joint work with Daniel Borkmann. 
[1] https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/judd [2] https://github.com/freebsd/freebsd/commit/8ad879445281027858a7fa706d13e458095b595f Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Glenn Judd <glenn.judd@morganstanley.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_input.c14
1 files changed, 5 insertions, 9 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ed11931f340f..d3dfff78fa19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5872,10 +5872,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5872 * TCP ECN negotiation. 5872 * TCP ECN negotiation.
5873 * 5873 *
5874 * Exception: tcp_ca wants ECN. This is required for DCTCP 5874 * Exception: tcp_ca wants ECN. This is required for DCTCP
5875 * congestion control; it requires setting ECT on all packets, 5875 * congestion control: Linux DCTCP asserts ECT on all packets,
5876 * including SYN. We inverse the test in this case: If our 5876 * including SYN, which is most optimal solution; however,
5877 * local socket wants ECN, but peer only set ece/cwr (but not 5877 * others, such as FreeBSD do not.
5878 * ECT in IP header) its probably a non-DCTCP aware sender.
5879 */ 5878 */
5880static void tcp_ecn_create_request(struct request_sock *req, 5879static void tcp_ecn_create_request(struct request_sock *req,
5881 const struct sk_buff *skb, 5880 const struct sk_buff *skb,
@@ -5885,18 +5884,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
5885 const struct tcphdr *th = tcp_hdr(skb); 5884 const struct tcphdr *th = tcp_hdr(skb);
5886 const struct net *net = sock_net(listen_sk); 5885 const struct net *net = sock_net(listen_sk);
5887 bool th_ecn = th->ece && th->cwr; 5886 bool th_ecn = th->ece && th->cwr;
5888 bool ect, need_ecn, ecn_ok; 5887 bool ect, ecn_ok;
5889 5888
5890 if (!th_ecn) 5889 if (!th_ecn)
5891 return; 5890 return;
5892 5891
5893 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); 5892 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
5894 need_ecn = tcp_ca_needs_ecn(listen_sk);
5895 ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); 5893 ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
5896 5894
5897 if (!ect && !need_ecn && ecn_ok) 5895 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
5898 inet_rsk(req)->ecn_ok = 1;
5899 else if (ect && need_ecn)
5900 inet_rsk(req)->ecn_ok = 1; 5896 inet_rsk(req)->ecn_ok = 1;
5901} 5897}
5902 5898