diff options
| author | Lawrence Brakmo <brakmo@fb.com> | 2017-06-30 23:02:49 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2017-07-01 19:15:14 -0400 |
| commit | 91b5b21c7c16899abb37f4a9e4388b4e9aae0b9d (patch) | |
| tree | ff5989374783d9f11e822906a98e94d08a6f135a | |
| parent | d9925368a641391f38cd281e67b948e6b6f3bcca (diff) | |
bpf: Add support for changing congestion control
Add support for changing congestion control for SOCK_OPS bpf
programs through the setsockopt bpf helper function. Also add
a new SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, which is needed for
congestion controls, like dctcp, that need to enable ECN in the
SYN packets.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/net/tcp.h | 9 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 3 | ||||
| -rw-r--r-- | net/core/filter.c | 18 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp_cong.c | 32 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 3 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 8 |
7 files changed, 58 insertions, 17 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index d6bb3948203d..70483296157f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
| @@ -1004,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name); | |||
| 1004 | void tcp_get_available_congestion_control(char *buf, size_t len); | 1004 | void tcp_get_available_congestion_control(char *buf, size_t len); |
| 1005 | void tcp_get_allowed_congestion_control(char *buf, size_t len); | 1005 | void tcp_get_allowed_congestion_control(char *buf, size_t len); |
| 1006 | int tcp_set_allowed_congestion_control(char *allowed); | 1006 | int tcp_set_allowed_congestion_control(char *allowed); |
| 1007 | int tcp_set_congestion_control(struct sock *sk, const char *name); | 1007 | int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); |
| 1008 | void tcp_reinit_congestion_control(struct sock *sk, | ||
| 1009 | const struct tcp_congestion_ops *ca); | ||
| 1008 | u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); | 1010 | u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); |
| 1009 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); | 1011 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); |
| 1010 | 1012 | ||
| @@ -2078,4 +2080,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) | |||
| 2078 | rwnd = 0; | 2080 | rwnd = 0; |
| 2079 | return rwnd; | 2081 | return rwnd; |
| 2080 | } | 2082 | } |
| 2083 | |||
| 2084 | static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) | ||
| 2085 | { | ||
| 2086 | return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); | ||
| 2087 | } | ||
| 2081 | #endif /* _TCP_H */ | 2088 | #endif /* _TCP_H */ |
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2405fe304c98..cc4725982bd8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
| @@ -778,6 +778,9 @@ enum { | |||
| 778 | * passive connection is | 778 | * passive connection is |
| 779 | * established | 779 | * established |
| 780 | */ | 780 | */ |
| 781 | BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control | ||
| 782 | * needs ECN | ||
| 783 | */ | ||
| 781 | }; | 784 | }; |
| 782 | 785 | ||
| 783 | #endif /* _UAPI__LINUX_BPF_H__ */ | 786 | #endif /* _UAPI__LINUX_BPF_H__ */ |
diff --git a/net/core/filter.c b/net/core/filter.c index ca033e15d35e..12df52711fe8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c | |||
| @@ -2719,8 +2719,24 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, | |||
| 2719 | } | 2719 | } |
| 2720 | } else if (level == SOL_TCP && | 2720 | } else if (level == SOL_TCP && |
| 2721 | sk->sk_prot->setsockopt == tcp_setsockopt) { | 2721 | sk->sk_prot->setsockopt == tcp_setsockopt) { |
| 2722 | /* Place holder */ | 2722 | #ifdef CONFIG_INET |
| 2723 | if (optname == TCP_CONGESTION) { | ||
| 2724 | char name[TCP_CA_NAME_MAX]; | ||
| 2725 | |||
| 2726 | strncpy(name, optval, min_t(long, optlen, | ||
| 2727 | TCP_CA_NAME_MAX-1)); | ||
| 2728 | name[TCP_CA_NAME_MAX-1] = 0; | ||
| 2729 | ret = tcp_set_congestion_control(sk, name, false); | ||
| 2730 | if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) | ||
| 2731 | /* replacing an existing ca */ | ||
| 2732 | tcp_reinit_congestion_control(sk, | ||
| 2733 | inet_csk(sk)->icsk_ca_ops); | ||
| 2734 | } else { | ||
| 2735 | ret = -EINVAL; | ||
| 2736 | } | ||
| 2737 | #else | ||
| 2723 | ret = -EINVAL; | 2738 | ret = -EINVAL; |
| 2739 | #endif | ||
| 2724 | } else { | 2740 | } else { |
| 2725 | ret = -EINVAL; | 2741 | ret = -EINVAL; |
| 2726 | } | 2742 | } |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fae45e402742..71ce33decd97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
| @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2481 | name[val] = 0; | 2481 | name[val] = 0; |
| 2482 | 2482 | ||
| 2483 | lock_sock(sk); | 2483 | lock_sock(sk); |
| 2484 | err = tcp_set_congestion_control(sk, name); | 2484 | err = tcp_set_congestion_control(sk, name, true); |
| 2485 | release_sock(sk); | 2485 | release_sock(sk); |
| 2486 | return err; | 2486 | return err; |
| 2487 | } | 2487 | } |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 324c9bcc5456..fde983f6376b 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
| @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) | |||
| 189 | INET_ECN_dontxmit(sk); | 189 | INET_ECN_dontxmit(sk); |
| 190 | } | 190 | } |
| 191 | 191 | ||
| 192 | static void tcp_reinit_congestion_control(struct sock *sk, | 192 | void tcp_reinit_congestion_control(struct sock *sk, |
| 193 | const struct tcp_congestion_ops *ca) | 193 | const struct tcp_congestion_ops *ca) |
| 194 | { | 194 | { |
| 195 | struct inet_connection_sock *icsk = inet_csk(sk); | 195 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 196 | 196 | ||
| @@ -333,8 +333,12 @@ out: | |||
| 333 | return ret; | 333 | return ret; |
| 334 | } | 334 | } |
| 335 | 335 | ||
| 336 | /* Change congestion control for socket */ | 336 | /* Change congestion control for socket. If load is false, then it is the |
| 337 | int tcp_set_congestion_control(struct sock *sk, const char *name) | 337 | * responsibility of the caller to call tcp_init_congestion_control or |
| 338 | * tcp_reinit_congestion_control (if the current congestion control was | ||
| 339 | * already initialized. | ||
| 340 | */ | ||
| 341 | int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) | ||
| 338 | { | 342 | { |
| 339 | struct inet_connection_sock *icsk = inet_csk(sk); | 343 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 340 | const struct tcp_congestion_ops *ca; | 344 | const struct tcp_congestion_ops *ca; |
| @@ -344,21 +348,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
| 344 | return -EPERM; | 348 | return -EPERM; |
| 345 | 349 | ||
| 346 | rcu_read_lock(); | 350 | rcu_read_lock(); |
| 347 | ca = __tcp_ca_find_autoload(name); | 351 | if (!load) |
| 352 | ca = tcp_ca_find(name); | ||
| 353 | else | ||
| 354 | ca = __tcp_ca_find_autoload(name); | ||
| 348 | /* No change asking for existing value */ | 355 | /* No change asking for existing value */ |
| 349 | if (ca == icsk->icsk_ca_ops) { | 356 | if (ca == icsk->icsk_ca_ops) { |
| 350 | icsk->icsk_ca_setsockopt = 1; | 357 | icsk->icsk_ca_setsockopt = 1; |
| 351 | goto out; | 358 | goto out; |
| 352 | } | 359 | } |
| 353 | if (!ca) | 360 | if (!ca) { |
| 354 | err = -ENOENT; | 361 | err = -ENOENT; |
| 355 | else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || | 362 | } else if (!load) { |
| 356 | ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) | 363 | icsk->icsk_ca_ops = ca; |
| 364 | if (!try_module_get(ca->owner)) | ||
| 365 | err = -EBUSY; | ||
| 366 | } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || | ||
| 367 | ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { | ||
| 357 | err = -EPERM; | 368 | err = -EPERM; |
| 358 | else if (!try_module_get(ca->owner)) | 369 | } else if (!try_module_get(ca->owner)) { |
| 359 | err = -EBUSY; | 370 | err = -EBUSY; |
| 360 | else | 371 | } else { |
| 361 | tcp_reinit_congestion_control(sk, ca); | 372 | tcp_reinit_congestion_control(sk, ca); |
| 373 | } | ||
| 362 | out: | 374 | out: |
| 363 | rcu_read_unlock(); | 375 | rcu_read_unlock(); |
| 364 | return err; | 376 | return err; |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 664210e5e4a7..2920e0cb09f8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -6191,7 +6191,8 @@ static void tcp_ecn_create_request(struct request_sock *req, | |||
| 6191 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; | 6191 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
| 6192 | 6192 | ||
| 6193 | if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || | 6193 | if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
| 6194 | (ecn_ok_dst & DST_FEATURE_ECN_CA)) | 6194 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
| 6195 | tcp_bpf_ca_needs_ecn((struct sock *)req)) | ||
| 6195 | inet_rsk(req)->ecn_ok = 1; | 6196 | inet_rsk(req)->ecn_ok = 1; |
| 6196 | } | 6197 | } |
| 6197 | 6198 | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 33b3e401e812..4d36f0b093e6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) | |||
| 316 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 316 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; |
| 317 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 317 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
| 318 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 318 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; |
| 319 | else if (tcp_ca_needs_ecn(sk)) | 319 | else if (tcp_ca_needs_ecn(sk) || |
| 320 | tcp_bpf_ca_needs_ecn(sk)) | ||
| 320 | INET_ECN_xmit(sk); | 321 | INET_ECN_xmit(sk); |
| 321 | } | 322 | } |
| 322 | 323 | ||
| @@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) | |||
| 324 | static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) | 325 | static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) |
| 325 | { | 326 | { |
| 326 | struct tcp_sock *tp = tcp_sk(sk); | 327 | struct tcp_sock *tp = tcp_sk(sk); |
| 328 | bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); | ||
| 327 | bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || | 329 | bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || |
| 328 | tcp_ca_needs_ecn(sk); | 330 | tcp_ca_needs_ecn(sk) || bpf_needs_ecn; |
| 329 | 331 | ||
| 330 | if (!use_ecn) { | 332 | if (!use_ecn) { |
| 331 | const struct dst_entry *dst = __sk_dst_get(sk); | 333 | const struct dst_entry *dst = __sk_dst_get(sk); |
| @@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) | |||
| 339 | if (use_ecn) { | 341 | if (use_ecn) { |
| 340 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 342 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; |
| 341 | tp->ecn_flags = TCP_ECN_OK; | 343 | tp->ecn_flags = TCP_ECN_OK; |
| 342 | if (tcp_ca_needs_ecn(sk)) | 344 | if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) |
| 343 | INET_ECN_xmit(sk); | 345 | INET_ECN_xmit(sk); |
| 344 | } | 346 | } |
| 345 | } | 347 | } |
