aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Hemminger <stephen@networkplumber.org>2017-11-14 11:25:49 -0500
committerDavid S. Miller <davem@davemloft.net>2017-11-15 00:09:52 -0500
commit6670e152447732ba90626f36dfc015a13fbf150e (patch)
tree0848d15b1d27139ee651c8ecae0136bb679f205a
parent11bf284f81b46f59d5f4a4522c13aa7852cfd560 (diff)
tcp: Namespace-ify sysctl_tcp_default_congestion_control
Make the default TCP congestion control a per-namespace value. This changes the default congestion control to a pointer to congestion ops (rather than being implicit as the first element of the available list). The congestion control setting of a new namespace is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--include/net/tcp.h6
-rw-r--r--net/ipv4/fib_semantics.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c19
-rw-r--r--net/ipv4/tcp_cong.c76
-rw-r--r--net/ipv4/tcp_ipv4.c9
-rw-r--r--net/ipv6/route.c3
7 files changed, 64 insertions, 54 deletions
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 5e12975fc658..44668c29701a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -160,6 +160,7 @@ struct netns_ipv4 {
160 struct inet_timewait_death_row tcp_death_row; 160 struct inet_timewait_death_row tcp_death_row;
161 int sysctl_max_syn_backlog; 161 int sysctl_max_syn_backlog;
162 int sysctl_tcp_fastopen; 162 int sysctl_tcp_fastopen;
163 const struct tcp_congestion_ops __rcu *tcp_congestion_control;
163 struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; 164 struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
164 spinlock_t tcp_fastopen_ctx_lock; 165 spinlock_t tcp_fastopen_ctx_lock;
165 unsigned int sysctl_tcp_fastopen_blackhole_timeout; 166 unsigned int sysctl_tcp_fastopen_blackhole_timeout;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ed71511e67a6..35cc7d0d3d47 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
1002void tcp_assign_congestion_control(struct sock *sk); 1002void tcp_assign_congestion_control(struct sock *sk);
1003void tcp_init_congestion_control(struct sock *sk); 1003void tcp_init_congestion_control(struct sock *sk);
1004void tcp_cleanup_congestion_control(struct sock *sk); 1004void tcp_cleanup_congestion_control(struct sock *sk);
1005int tcp_set_default_congestion_control(const char *name); 1005int tcp_set_default_congestion_control(struct net *net, const char *name);
1006void tcp_get_default_congestion_control(char *name); 1006void tcp_get_default_congestion_control(struct net *net, char *name);
1007void tcp_get_available_congestion_control(char *buf, size_t len); 1007void tcp_get_available_congestion_control(char *buf, size_t len);
1008void tcp_get_allowed_congestion_control(char *buf, size_t len); 1008void tcp_get_allowed_congestion_control(char *buf, size_t len);
1009int tcp_set_allowed_congestion_control(char *allowed); 1009int tcp_set_allowed_congestion_control(char *allowed);
@@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
1017extern struct tcp_congestion_ops tcp_reno; 1017extern struct tcp_congestion_ops tcp_reno;
1018 1018
1019struct tcp_congestion_ops *tcp_ca_find_key(u32 key); 1019struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
1020u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); 1020u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
1021#ifdef CONFIG_INET 1021#ifdef CONFIG_INET
1022char *tcp_ca_get_name_by_key(u32 key, char *buffer); 1022char *tcp_ca_get_name_by_key(u32 key, char *buffer);
1023#else 1023#else
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 589caaa90613..f04d944f8abe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
710 bool ecn_ca = false; 710 bool ecn_ca = false;
711 711
712 nla_strlcpy(tmp, nla, sizeof(tmp)); 712 nla_strlcpy(tmp, nla, sizeof(tmp));
713 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 713 val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
714 } else { 714 } else {
715 val = nla_get_u32(nla); 715 val = nla_get_u32(nla);
716 } 716 }
@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
1030 char tmp[TCP_CA_NAME_MAX]; 1030 char tmp[TCP_CA_NAME_MAX];
1031 1031
1032 nla_strlcpy(tmp, nla, sizeof(tmp)); 1032 nla_strlcpy(tmp, nla, sizeof(tmp));
1033 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 1033 val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
1034 if (val == TCP_CA_UNSPEC) 1034 if (val == TCP_CA_UNSPEC)
1035 return -EINVAL; 1035 return -EINVAL;
1036 } else { 1036 } else {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef0ff3357a44..93e172118a94 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
201static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, 201static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
202 void __user *buffer, size_t *lenp, loff_t *ppos) 202 void __user *buffer, size_t *lenp, loff_t *ppos)
203{ 203{
204 struct net *net = container_of(ctl->data, struct net,
205 ipv4.tcp_congestion_control);
204 char val[TCP_CA_NAME_MAX]; 206 char val[TCP_CA_NAME_MAX];
205 struct ctl_table tbl = { 207 struct ctl_table tbl = {
206 .data = val, 208 .data = val,
@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
208 }; 210 };
209 int ret; 211 int ret;
210 212
211 tcp_get_default_congestion_control(val); 213 tcp_get_default_congestion_control(net, val);
212 214
213 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 215 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
214 if (write && ret == 0) 216 if (write && ret == 0)
215 ret = tcp_set_default_congestion_control(val); 217 ret = tcp_set_default_congestion_control(net, val);
216 return ret; 218 return ret;
217} 219}
218 220
@@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = {
447 .mode = 0644, 449 .mode = 0644,
448 .proc_handler = proc_dointvec 450 .proc_handler = proc_dointvec
449 }, 451 },
450 {
451 .procname = "tcp_congestion_control",
452 .mode = 0644,
453 .maxlen = TCP_CA_NAME_MAX,
454 .proc_handler = proc_tcp_congestion_control,
455 },
456#ifdef CONFIG_NETLABEL 452#ifdef CONFIG_NETLABEL
457 { 453 {
458 .procname = "cipso_cache_enable", 454 .procname = "cipso_cache_enable",
@@ -764,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = {
764 }, 760 },
765#endif 761#endif
766 { 762 {
763 .procname = "tcp_congestion_control",
764 .data = &init_net.ipv4.tcp_congestion_control,
765 .mode = 0644,
766 .maxlen = TCP_CA_NAME_MAX,
767 .proc_handler = proc_tcp_congestion_control,
768 },
769 {
767 .procname = "tcp_keepalive_time", 770 .procname = "tcp_keepalive_time",
768 .data = &init_net.ipv4.sysctl_tcp_keepalive_time, 771 .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
769 .maxlen = sizeof(int), 772 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2f26124fd160..bc6c02f16243 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
33} 33}
34 34
35/* Must be called with rcu lock held */ 35/* Must be called with rcu lock held */
36static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) 36static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
37 const char *name)
37{ 38{
38 const struct tcp_congestion_ops *ca = tcp_ca_find(name); 39 struct tcp_congestion_ops *ca = tcp_ca_find(name);
40
39#ifdef CONFIG_MODULES 41#ifdef CONFIG_MODULES
40 if (!ca && capable(CAP_NET_ADMIN)) { 42 if (!ca && capable(CAP_NET_ADMIN)) {
41 rcu_read_unlock(); 43 rcu_read_unlock();
@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
115} 117}
116EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 118EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
117 119
118u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) 120u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
119{ 121{
120 const struct tcp_congestion_ops *ca; 122 const struct tcp_congestion_ops *ca;
121 u32 key = TCP_CA_UNSPEC; 123 u32 key = TCP_CA_UNSPEC;
@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
123 might_sleep(); 125 might_sleep();
124 126
125 rcu_read_lock(); 127 rcu_read_lock();
126 ca = __tcp_ca_find_autoload(name); 128 ca = tcp_ca_find_autoload(net, name);
127 if (ca) { 129 if (ca) {
128 key = ca->key; 130 key = ca->key;
129 *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; 131 *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
153/* Assign choice of congestion control. */ 155/* Assign choice of congestion control. */
154void tcp_assign_congestion_control(struct sock *sk) 156void tcp_assign_congestion_control(struct sock *sk)
155{ 157{
158 struct net *net = sock_net(sk);
156 struct inet_connection_sock *icsk = inet_csk(sk); 159 struct inet_connection_sock *icsk = inet_csk(sk);
157 struct tcp_congestion_ops *ca; 160 const struct tcp_congestion_ops *ca;
158 161
159 rcu_read_lock(); 162 rcu_read_lock();
160 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 163 ca = rcu_dereference(net->ipv4.tcp_congestion_control);
161 if (likely(try_module_get(ca->owner))) { 164 if (unlikely(!try_module_get(ca->owner)))
162 icsk->icsk_ca_ops = ca; 165 ca = &tcp_reno;
163 goto out; 166 icsk->icsk_ca_ops = ca;
164 }
165 /* Fallback to next available. The last really
166 * guaranteed fallback is Reno from this list.
167 */
168 }
169out:
170 rcu_read_unlock(); 167 rcu_read_unlock();
171 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
172 168
169 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
173 if (ca->flags & TCP_CONG_NEEDS_ECN) 170 if (ca->flags & TCP_CONG_NEEDS_ECN)
174 INET_ECN_xmit(sk); 171 INET_ECN_xmit(sk);
175 else 172 else
@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk)
214} 211}
215 212
216/* Used by sysctl to change default congestion control */ 213/* Used by sysctl to change default congestion control */
217int tcp_set_default_congestion_control(const char *name) 214int tcp_set_default_congestion_control(struct net *net, const char *name)
218{ 215{
219 struct tcp_congestion_ops *ca; 216 struct tcp_congestion_ops *ca;
220 int ret = -ENOENT; 217 const struct tcp_congestion_ops *prev;
221 218 int ret;
222 spin_lock(&tcp_cong_list_lock);
223 ca = tcp_ca_find(name);
224#ifdef CONFIG_MODULES
225 if (!ca && capable(CAP_NET_ADMIN)) {
226 spin_unlock(&tcp_cong_list_lock);
227 219
228 request_module("tcp_%s", name); 220 rcu_read_lock();
229 spin_lock(&tcp_cong_list_lock); 221 ca = tcp_ca_find_autoload(net, name);
230 ca = tcp_ca_find(name); 222 if (!ca) {
231 } 223 ret = -ENOENT;
232#endif 224 } else if (!try_module_get(ca->owner)) {
225 ret = -EBUSY;
226 } else {
227 prev = xchg(&net->ipv4.tcp_congestion_control, ca);
228 if (prev)
229 module_put(prev->owner);
233 230
234 if (ca) { 231 ca->flags |= TCP_CONG_NON_RESTRICTED;
235 ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
236 list_move(&ca->list, &tcp_cong_list);
237 ret = 0; 232 ret = 0;
238 } 233 }
239 spin_unlock(&tcp_cong_list_lock); 234 rcu_read_unlock();
240 235
241 return ret; 236 return ret;
242} 237}
@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name)
244/* Set default value from kernel configuration at bootup */ 239/* Set default value from kernel configuration at bootup */
245static int __init tcp_congestion_default(void) 240static int __init tcp_congestion_default(void)
246{ 241{
247 return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); 242 return tcp_set_default_congestion_control(&init_net,
243 CONFIG_DEFAULT_TCP_CONG);
248} 244}
249late_initcall(tcp_congestion_default); 245late_initcall(tcp_congestion_default);
250 246
@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
264} 260}
265 261
266/* Get current default congestion control */ 262/* Get current default congestion control */
267void tcp_get_default_congestion_control(char *name) 263void tcp_get_default_congestion_control(struct net *net, char *name)
268{ 264{
269 struct tcp_congestion_ops *ca; 265 const struct tcp_congestion_ops *ca;
270 /* We will always have reno... */
271 BUG_ON(list_empty(&tcp_cong_list));
272 266
273 rcu_read_lock(); 267 rcu_read_lock();
274 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); 268 ca = rcu_dereference(net->ipv4.tcp_congestion_control);
275 strncpy(name, ca->name, TCP_CA_NAME_MAX); 269 strncpy(name, ca->name, TCP_CA_NAME_MAX);
276 rcu_read_unlock(); 270 rcu_read_unlock();
277} 271}
@@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
351 if (!load) 345 if (!load)
352 ca = tcp_ca_find(name); 346 ca = tcp_ca_find(name);
353 else 347 else
354 ca = __tcp_ca_find_autoload(name); 348 ca = tcp_ca_find_autoload(sock_net(sk), name);
349
355 /* No change asking for existing value */ 350 /* No change asking for existing value */
356 if (ca == icsk->icsk_ca_ops) { 351 if (ca == icsk->icsk_ca_ops) {
357 icsk->icsk_ca_setsockopt = 1; 352 icsk->icsk_ca_setsockopt = 1;
358 goto out; 353 goto out;
359 } 354 }
355
360 if (!ca) { 356 if (!ca) {
361 err = -ENOENT; 357 err = -ENOENT;
362 } else if (!load) { 358 } else if (!load) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1eac84b8044e..c6bc0c4d19c6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
2430{ 2430{
2431 int cpu; 2431 int cpu;
2432 2432
2433 module_put(net->ipv4.tcp_congestion_control->owner);
2434
2433 for_each_possible_cpu(cpu) 2435 for_each_possible_cpu(cpu)
2434 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); 2436 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2435 free_percpu(net->ipv4.tcp_sk); 2437 free_percpu(net->ipv4.tcp_sk);
@@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net)
2522 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; 2524 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2523 atomic_set(&net->ipv4.tfo_active_disable_times, 0); 2525 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2524 2526
2527 /* Reno is always built in */
2528 if (!net_eq(net, &init_net) &&
2529 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2530 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2531 else
2532 net->ipv4.tcp_congestion_control = &tcp_reno;
2533
2525 return 0; 2534 return 0;
2526fail: 2535fail:
2527 tcp_sk_exit(net); 2536 tcp_sk_exit(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 70d9659fc1e9..05eb7bc36156 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2378,6 +2378,7 @@ out:
2378static int ip6_convert_metrics(struct mx6_config *mxc, 2378static int ip6_convert_metrics(struct mx6_config *mxc,
2379 const struct fib6_config *cfg) 2379 const struct fib6_config *cfg)
2380{ 2380{
2381 struct net *net = cfg->fc_nlinfo.nl_net;
2381 bool ecn_ca = false; 2382 bool ecn_ca = false;
2382 struct nlattr *nla; 2383 struct nlattr *nla;
2383 int remaining; 2384 int remaining;
@@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
2403 char tmp[TCP_CA_NAME_MAX]; 2404 char tmp[TCP_CA_NAME_MAX];
2404 2405
2405 nla_strlcpy(tmp, nla, sizeof(tmp)); 2406 nla_strlcpy(tmp, nla, sizeof(tmp));
2406 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 2407 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2407 if (val == TCP_CA_UNSPEC) 2408 if (val == TCP_CA_UNSPEC)
2408 goto err; 2409 goto err;
2409 } else { 2410 } else {