about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Lawrence Brakmo <brakmo@fb.com> 2017-06-30 23:02:49 -0400
committer David S. Miller <davem@davemloft.net> 2017-07-01 19:15:14 -0400
commit 91b5b21c7c16899abb37f4a9e4388b4e9aae0b9d (patch)
tree ff5989374783d9f11e822906a98e94d08a6f135a
parent d9925368a641391f38cd281e67b948e6b6f3bcca (diff)
bpf: Add support for changing congestion control
Added support for changing congestion control for SOCK_OPS bpf programs through the setsockopt bpf helper function. It also adds a new SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, that is needed for congestion controls, like dctcp, that need to enable ECN in the SYN packets.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/tcp.h 9
-rw-r--r-- include/uapi/linux/bpf.h 3
-rw-r--r-- net/core/filter.c 18
-rw-r--r-- net/ipv4/tcp.c 2
-rw-r--r-- net/ipv4/tcp_cong.c 32
-rw-r--r-- net/ipv4/tcp_input.c 3
-rw-r--r-- net/ipv4/tcp_output.c 8
7 files changed, 58 insertions, 17 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d6bb3948203d..70483296157f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1004,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name);
1004void tcp_get_available_congestion_control(char *buf, size_t len); 1004void tcp_get_available_congestion_control(char *buf, size_t len);
1005void tcp_get_allowed_congestion_control(char *buf, size_t len); 1005void tcp_get_allowed_congestion_control(char *buf, size_t len);
1006int tcp_set_allowed_congestion_control(char *allowed); 1006int tcp_set_allowed_congestion_control(char *allowed);
1007int tcp_set_congestion_control(struct sock *sk, const char *name); 1007int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
1008void tcp_reinit_congestion_control(struct sock *sk,
1009 const struct tcp_congestion_ops *ca);
1008u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); 1010u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
1009void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); 1011void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
1010 1012
@@ -2078,4 +2080,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
2078 rwnd = 0; 2080 rwnd = 0;
2079 return rwnd; 2081 return rwnd;
2080} 2082}
2083
2084static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
2085{
2086 return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
2087}
2081#endif /* _TCP_H */ 2088#endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2405fe304c98..cc4725982bd8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -778,6 +778,9 @@ enum {
778 * passive connection is 778 * passive connection is
779 * established 779 * established
780 */ 780 */
781 BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
782 * needs ECN
783 */
781}; 784};
782 785
783#endif /* _UAPI__LINUX_BPF_H__ */ 786#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index ca033e15d35e..12df52711fe8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2719,8 +2719,24 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
2719 } 2719 }
2720 } else if (level == SOL_TCP && 2720 } else if (level == SOL_TCP &&
2721 sk->sk_prot->setsockopt == tcp_setsockopt) { 2721 sk->sk_prot->setsockopt == tcp_setsockopt) {
2722 /* Place holder */ 2722#ifdef CONFIG_INET
2723 if (optname == TCP_CONGESTION) {
2724 char name[TCP_CA_NAME_MAX];
2725
2726 strncpy(name, optval, min_t(long, optlen,
2727 TCP_CA_NAME_MAX-1));
2728 name[TCP_CA_NAME_MAX-1] = 0;
2729 ret = tcp_set_congestion_control(sk, name, false);
2730 if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
2731 /* replacing an existing ca */
2732 tcp_reinit_congestion_control(sk,
2733 inet_csk(sk)->icsk_ca_ops);
2734 } else {
2735 ret = -EINVAL;
2736 }
2737#else
2723 ret = -EINVAL; 2738 ret = -EINVAL;
2739#endif
2724 } else { 2740 } else {
2725 ret = -EINVAL; 2741 ret = -EINVAL;
2726 } 2742 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fae45e402742..71ce33decd97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2481 name[val] = 0; 2481 name[val] = 0;
2482 2482
2483 lock_sock(sk); 2483 lock_sock(sk);
2484 err = tcp_set_congestion_control(sk, name); 2484 err = tcp_set_congestion_control(sk, name, true);
2485 release_sock(sk); 2485 release_sock(sk);
2486 return err; 2486 return err;
2487 } 2487 }
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 324c9bcc5456..fde983f6376b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
189 INET_ECN_dontxmit(sk); 189 INET_ECN_dontxmit(sk);
190} 190}
191 191
192static void tcp_reinit_congestion_control(struct sock *sk, 192void tcp_reinit_congestion_control(struct sock *sk,
193 const struct tcp_congestion_ops *ca) 193 const struct tcp_congestion_ops *ca)
194{ 194{
195 struct inet_connection_sock *icsk = inet_csk(sk); 195 struct inet_connection_sock *icsk = inet_csk(sk);
196 196
@@ -333,8 +333,12 @@ out:
333 return ret; 333 return ret;
334} 334}
335 335
336/* Change congestion control for socket */ 336/* Change congestion control for socket. If load is false, then it is the
337int tcp_set_congestion_control(struct sock *sk, const char *name) 337 * responsibility of the caller to call tcp_init_congestion_control or
338 * tcp_reinit_congestion_control (if the current congestion control was
339 * already initialized).
340 */
341int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
338{ 342{
339 struct inet_connection_sock *icsk = inet_csk(sk); 343 struct inet_connection_sock *icsk = inet_csk(sk);
340 const struct tcp_congestion_ops *ca; 344 const struct tcp_congestion_ops *ca;
@@ -344,21 +348,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
344 return -EPERM; 348 return -EPERM;
345 349
346 rcu_read_lock(); 350 rcu_read_lock();
347 ca = __tcp_ca_find_autoload(name); 351 if (!load)
352 ca = tcp_ca_find(name);
353 else
354 ca = __tcp_ca_find_autoload(name);
348 /* No change asking for existing value */ 355 /* No change asking for existing value */
349 if (ca == icsk->icsk_ca_ops) { 356 if (ca == icsk->icsk_ca_ops) {
350 icsk->icsk_ca_setsockopt = 1; 357 icsk->icsk_ca_setsockopt = 1;
351 goto out; 358 goto out;
352 } 359 }
353 if (!ca) 360 if (!ca) {
354 err = -ENOENT; 361 err = -ENOENT;
355 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || 362 } else if (!load) {
356 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) 363 icsk->icsk_ca_ops = ca;
364 if (!try_module_get(ca->owner))
365 err = -EBUSY;
366 } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
367 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
357 err = -EPERM; 368 err = -EPERM;
358 else if (!try_module_get(ca->owner)) 369 } else if (!try_module_get(ca->owner)) {
359 err = -EBUSY; 370 err = -EBUSY;
360 else 371 } else {
361 tcp_reinit_congestion_control(sk, ca); 372 tcp_reinit_congestion_control(sk, ca);
373 }
362 out: 374 out:
363 rcu_read_unlock(); 375 rcu_read_unlock();
364 return err; 376 return err;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 664210e5e4a7..2920e0cb09f8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6191,7 +6191,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
6191 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; 6191 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
6192 6192
6193 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || 6193 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6194 (ecn_ok_dst & DST_FEATURE_ECN_CA)) 6194 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
6195 tcp_bpf_ca_needs_ecn((struct sock *)req))
6195 inet_rsk(req)->ecn_ok = 1; 6196 inet_rsk(req)->ecn_ok = 1;
6196} 6197}
6197 6198
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33b3e401e812..4d36f0b093e6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
316 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 316 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
317 if (!(tp->ecn_flags & TCP_ECN_OK)) 317 if (!(tp->ecn_flags & TCP_ECN_OK))
318 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 318 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
319 else if (tcp_ca_needs_ecn(sk)) 319 else if (tcp_ca_needs_ecn(sk) ||
320 tcp_bpf_ca_needs_ecn(sk))
320 INET_ECN_xmit(sk); 321 INET_ECN_xmit(sk);
321} 322}
322 323
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
324static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) 325static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
325{ 326{
326 struct tcp_sock *tp = tcp_sk(sk); 327 struct tcp_sock *tp = tcp_sk(sk);
328 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
327 bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || 329 bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
328 tcp_ca_needs_ecn(sk); 330 tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
329 331
330 if (!use_ecn) { 332 if (!use_ecn) {
331 const struct dst_entry *dst = __sk_dst_get(sk); 333 const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
339 if (use_ecn) { 341 if (use_ecn) {
340 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 342 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
341 tp->ecn_flags = TCP_ECN_OK; 343 tp->ecn_flags = TCP_ECN_OK;
342 if (tcp_ca_needs_ecn(sk)) 344 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
343 INET_ECN_xmit(sk); 345 INET_ECN_xmit(sk);
344 } 346 }
345} 347}