aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Daniel Borkmann <dborkman@redhat.com> 2014-09-26 16:37:33 -0400
committer: David S. Miller <davem@davemloft.net> 2014-09-29 00:13:10 -0400
commit: 30e502a34b8b21fae2c789da102bd9f6e99fef83 (patch)
tree: 6a56bc051b629e3c0914c3a519d201f357e79ada
parent: 55d8694fa82c9b5858ae5a78a210353961f908f9 (diff)
net: tcp: add flag for ca to indicate that ECN is required
This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm.

It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/tcp.h 61
-rw-r--r-- net/ipv4/tcp_input.c 2
-rw-r--r-- net/ipv4/tcp_output.c 25
3 files changed, 63 insertions, 25 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f99b0c072ee5..a12f145cfbc3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -733,23 +733,6 @@ struct tcp_skb_cb {
733 733
734#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) 734#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
735 735
736/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
737 *
738 * If we receive a SYN packet with these bits set, it means a network is
739 * playing bad games with TOS bits. In order to avoid possible false congestion
740 * notifications, we disable TCP ECN negociation.
741 */
742static inline void
743TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
744 struct net *net)
745{
746 const struct tcphdr *th = tcp_hdr(skb);
747
748 if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
749 INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
750 inet_rsk(req)->ecn_ok = 1;
751}
752
753/* Due to TSO, an SKB can be composed of multiple actual 736/* Due to TSO, an SKB can be composed of multiple actual
754 * packets. To keep these tracked properly, we use this. 737 * packets. To keep these tracked properly, we use this.
755 */ 738 */
@@ -791,7 +774,10 @@ enum tcp_ca_event {
791#define TCP_CA_MAX 128 774#define TCP_CA_MAX 128
792#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) 775#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
793 776
777/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
794#define TCP_CONG_NON_RESTRICTED 0x1 778#define TCP_CONG_NON_RESTRICTED 0x1
779/* Requires ECN/ECT set on all packets */
780#define TCP_CONG_NEEDS_ECN 0x2
795 781
796struct tcp_congestion_ops { 782struct tcp_congestion_ops {
797 struct list_head list; 783 struct list_head list;
@@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk);
840void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); 826void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
841extern struct tcp_congestion_ops tcp_reno; 827extern struct tcp_congestion_ops tcp_reno;
842 828
829static inline bool tcp_ca_needs_ecn(const struct sock *sk)
830{
831 const struct inet_connection_sock *icsk = inet_csk(sk);
832
833 return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
834}
835
843static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) 836static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
844{ 837{
845 struct inet_connection_sock *icsk = inet_csk(sk); 838 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
857 icsk->icsk_ca_ops->cwnd_event(sk, event); 850 icsk->icsk_ca_ops->cwnd_event(sk, event);
858} 851}
859 852
853/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
854 *
855 * If we receive a SYN packet with these bits set, it means a
856 * network is playing bad games with TOS bits. In order to
857 * avoid possible false congestion notifications, we disable
858 * TCP ECN negociation.
859 *
860 * Exception: tcp_ca wants ECN. This is required for DCTCP
861 * congestion control; it requires setting ECT on all packets,
862 * including SYN. We inverse the test in this case: If our
863 * local socket wants ECN, but peer only set ece/cwr (but not
864 * ECT in IP header) its probably a non-DCTCP aware sender.
865 */
866static inline void
867TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
868 const struct sock *listen_sk)
869{
870 const struct tcphdr *th = tcp_hdr(skb);
871 const struct net *net = sock_net(listen_sk);
872 bool th_ecn = th->ece && th->cwr;
873 bool ect, need_ecn;
874
875 if (!th_ecn)
876 return;
877
878 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
879 need_ecn = tcp_ca_needs_ecn(listen_sk);
880
881 if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
882 inet_rsk(req)->ecn_ok = 1;
883 else if (ect && need_ecn)
884 inet_rsk(req)->ecn_ok = 1;
885}
886
860/* These functions determine how the current flow behaves in respect of SACK 887/* These functions determine how the current flow behaves in respect of SACK
861 * handling. SACK is negotiated with the peer, and therefore it can vary 888 * handling. SACK is negotiated with the peer, and therefore it can vary
862 * between different flows. 889 * between different flows.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5073eefa6fae..fb0fe97e1c54 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
5944 goto drop_and_free; 5944 goto drop_and_free;
5945 5945
5946 if (!want_cookie || tmp_opt.tstamp_ok) 5946 if (!want_cookie || tmp_opt.tstamp_ok)
5947 TCP_ECN_create_request(req, skb, sock_net(sk)); 5947 TCP_ECN_create_request(req, skb, sk);
5948 5948
5949 if (want_cookie) { 5949 if (want_cookie) {
5950 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); 5950 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4d92703df4c6..20e73271d75c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk)
318} 318}
319 319
320/* Packet ECN state for a SYN-ACK */ 320/* Packet ECN state for a SYN-ACK */
321static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 321static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
322{ 322{
323 const struct tcp_sock *tp = tcp_sk(sk);
324
323 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
324 if (!(tp->ecn_flags & TCP_ECN_OK)) 326 if (!(tp->ecn_flags & TCP_ECN_OK))
325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 327 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
328 else if (tcp_ca_needs_ecn(sk))
329 INET_ECN_xmit(sk);
326} 330}
327 331
328/* Packet ECN state for a SYN. */ 332/* Packet ECN state for a SYN. */
@@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
331 struct tcp_sock *tp = tcp_sk(sk); 335 struct tcp_sock *tp = tcp_sk(sk);
332 336
333 tp->ecn_flags = 0; 337 tp->ecn_flags = 0;
334 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { 338 if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
339 tcp_ca_needs_ecn(sk)) {
335 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 340 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
336 tp->ecn_flags = TCP_ECN_OK; 341 tp->ecn_flags = TCP_ECN_OK;
342 if (tcp_ca_needs_ecn(sk))
343 INET_ECN_xmit(sk);
337 } 344 }
338} 345}
339 346
340static __inline__ void 347static __inline__ void
341TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 348TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
349 struct sock *sk)
342{ 350{
343 if (inet_rsk(req)->ecn_ok) 351 if (inet_rsk(req)->ecn_ok) {
344 th->ece = 1; 352 th->ece = 1;
353 if (tcp_ca_needs_ecn(sk))
354 INET_ECN_xmit(sk);
355 }
345} 356}
346 357
347/* Set up ECN state for a packet on a ESTABLISHED socket that is about to 358/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
@@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
362 tcp_hdr(skb)->cwr = 1; 373 tcp_hdr(skb)->cwr = 1;
363 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; 374 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
364 } 375 }
365 } else { 376 } else if (!tcp_ca_needs_ecn(sk)) {
366 /* ACK or retransmitted segment: clear ECT|CE */ 377 /* ACK or retransmitted segment: clear ECT|CE */
367 INET_ECN_dontxmit(sk); 378 INET_ECN_dontxmit(sk);
368 } 379 }
@@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk)
2789 } 2800 }
2790 2801
2791 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2802 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2792 TCP_ECN_send_synack(tcp_sk(sk), skb); 2803 TCP_ECN_send_synack(sk, skb);
2793 } 2804 }
2794 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2805 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2795} 2806}
@@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2848 memset(th, 0, sizeof(struct tcphdr)); 2859 memset(th, 0, sizeof(struct tcphdr));
2849 th->syn = 1; 2860 th->syn = 1;
2850 th->ack = 1; 2861 th->ack = 1;
2851 TCP_ECN_make_synack(req, th); 2862 TCP_ECN_make_synack(req, th, sk);
2852 th->source = htons(ireq->ir_num); 2863 th->source = htons(ireq->ir_num);
2853 th->dest = ireq->ir_rmt_port; 2864 th->dest = ireq->ir_rmt_port;
2854 /* Setting of flags are superfluous here for callers (and ECE is 2865 /* Setting of flags are superfluous here for callers (and ECE is