path: root/net/ipv4/tcp_cong.c
author    Daniel Borkmann <dborkman@redhat.com>    2015-01-05 17:57:46 -0500
committer David S. Miller <davem@davemloft.net>    2015-01-05 22:55:24 -0500
commit    c5c6a8ab45ec0f18733afb4aaade0d4a139d80b3 (patch)
tree      c248b79eca4c665244c4280dbde577cb71d001df /net/ipv4/tcp_cong.c
parent    29ba4fffd396f4beefe34d24d7bcd86cb5c3e492 (diff)
net: tcp: add key management to congestion control
This patch adds necessary infrastructure to the congestion control framework for later per route congestion control support.

For a per route congestion control possibility, our aim is to store a unique u32 key identifier into dst metrics, which can then be mapped into a tcp_congestion_ops struct. We argue that having a RTAX key entry is the simplest, most generic and easiest way to manage, and it also keeps the memory footprint of dst entries lower on 64 bit than storing a pointer directly, for example. Having a unique key id also allows for decoupling actual TCP congestion control module management from the FIB layer, i.e. we don't have to care about expensive module refcounting inside the FIB at this point.

We first thought of using an IDR store for the realization, which takes over dynamic assignment of unused key space and also performs the key to pointer mapping in RCU. While doing so, we stumbled upon the issue that, due to the nature of dynamic key distribution, it can happen, arguably on very rare occasions, that excessive module loads and unloads lead to reuse of previously used key space. Thus, previously stale keys in the dst metric would be reassigned to a different congestion control algorithm, which might lead to unexpected behaviour. One way to resolve this would have been to walk FIBs on the actually rare occasion of a module unload and reset the metric keys for each FIB in each netns, but that's just very costly.

Therefore, we argue a better solution is to reuse the unique congestion control algorithm name member and map that into u32 key space through jhash. For that, we split the flags attribute (as it currently uses only 2 bits anyway) into two u32 attributes, flags and key, so that we can keep the 2-cacheline boundary of the struct on x86_64 and cache the precalculated key at registration time for the fast path. On average we might expect 2 - 4 modules being loaded, worst case perhaps 15, so the possibility of a key collision is extremely low, and the mapping is guaranteed collision-free on LE/BE for all in-tree modules. Overall, this results in much simpler code, all without the overhead of an IDR.

Due to the deterministic nature of the mapping, modules can now be unloaded, the congestion control algorithm for a specific but unloaded key will fall back to the default one, and on module reload it will transparently switch back to the expected algorithm.

Joint work with Florian Westphal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
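To make the name-to-key mapping above concrete, here is a minimal user-space sketch; it is not the kernel code. The struct ca_ops and name_to_key() below are hypothetical stand-ins for struct tcp_congestion_ops and jhash() (which the patch calls over the full fixed-size name buffer, with strlen(name) as the initial value). What it demonstrates is that the key is a pure function of the algorithm name, and therefore stable across module unload and reload:

/* Hypothetical user-space sketch; ca_ops and name_to_key() are
 * simplified stand-ins for struct tcp_congestion_ops and jhash().
 */
#include <stdint.h>
#include <stdio.h>

#define TCP_CA_NAME_MAX 16      /* same name limit as the kernel's */

struct ca_ops {
        char name[TCP_CA_NAME_MAX];
        uint32_t key;           /* cached once at registration time */
};

/* Stand-in for jhash(ca->name, sizeof(ca->name), strlen(ca->name)):
 * hash the whole fixed-size buffer so equal names always map to
 * equal keys, no matter when the module is (re)loaded.
 */
static uint32_t name_to_key(const char name[TCP_CA_NAME_MAX])
{
        uint32_t h = 2166136261u;       /* FNV-1a, as a jhash stand-in */

        for (int i = 0; i < TCP_CA_NAME_MAX; i++) {
                h ^= (uint8_t)name[i];
                h *= 16777619u;
        }
        return h;
}

int main(void)
{
        struct ca_ops reno  = { .name = "reno" };
        struct ca_ops dctcp = { .name = "dctcp" };

        /* "registration": precompute and cache the key; the kernel
         * additionally rejects a result equal to TCP_CA_UNSPEC (0).
         */
        reno.key  = name_to_key(reno.name);
        dctcp.key = name_to_key(dctcp.name);

        printf("reno  -> %#010x\n", reno.key);
        printf("dctcp -> %#010x\n", dctcp.key);
        return 0;
}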
Diffstat (limited to 'net/ipv4/tcp_cong.c')
-rw-r--r--  net/ipv4/tcp_cong.c  97
1 file changed, 81 insertions(+), 16 deletions(-)
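The diff below implements the lookup side of this scheme. As a hedged illustration of the fallback behaviour claimed in the log message, this hypothetical user-space sketch (a plain singly linked list instead of the kernel's RCU-protected tcp_cong_list; ca_find_key() mirrors the new tcp_ca_find_key()) shows how a stale key cached in a dst metric simply misses while its module is unloaded, letting the caller fall back to a default, and resolves again after re-registration:

/* Hypothetical user-space sketch of the tcp_ca_find_key() fallback
 * semantics; plain list, no RCU, literal keys instead of jhash().
 */
#include <stdint.h>
#include <stdio.h>

struct ca_ops {
        uint32_t key;           /* would come from jhash of the name */
        const char *name;
        struct ca_ops *next;
};

static struct ca_ops *ca_list;  /* stands in for tcp_cong_list */

static void ca_register(struct ca_ops *ca)
{
        ca->next = ca_list;
        ca_list = ca;
}

static void ca_unregister(struct ca_ops *ca)
{
        struct ca_ops **p = &ca_list;

        while (*p && *p != ca)
                p = &(*p)->next;
        if (*p)
                *p = ca->next;
}

/* Same simple linear search by key as tcp_ca_find_key() */
static struct ca_ops *ca_find_key(uint32_t key)
{
        for (struct ca_ops *e = ca_list; e; e = e->next)
                if (e->key == key)
                        return e;
        return NULL;
}

int main(void)
{
        struct ca_ops dflt = { .key = 0x1, .name = "reno"  };
        struct ca_ops mod  = { .key = 0x2, .name = "dctcp" };
        struct ca_ops *ca;
        uint32_t dst_key;

        ca_register(&dflt);
        ca_register(&mod);
        dst_key = mod.key;              /* key cached in a dst metric */

        ca_unregister(&mod);            /* module unload */
        ca = ca_find_key(dst_key);
        printf("%s\n", ca ? ca->name : dflt.name);  /* falls back: reno */

        ca_register(&mod);              /* module reload */
        ca = ca_find_key(dst_key);
        printf("%s\n", ca ? ca->name : dflt.name);  /* back to: dctcp */
        return 0;
}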
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 38f2f8aa4ceb..63c29dba68a8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/gfp.h>
+#include <linux/jhash.h>
 #include <net/tcp.h>
 
 static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
         return NULL;
 }
 
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+        const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+        if (!ca && capable(CAP_NET_ADMIN)) {
+                rcu_read_unlock();
+                request_module("tcp_%s", name);
+                rcu_read_lock();
+                ca = tcp_ca_find(name);
+        }
+#endif
+        return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+        struct tcp_congestion_ops *e;
+
+        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+                if (e->key == key)
+                        return e;
+        }
+
+        return NULL;
+}
+
 /*
  * Attach new congestion control algorithm to the list
  * of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
                 return -EINVAL;
         }
 
+        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
         spin_lock(&tcp_cong_list_lock);
-        if (tcp_ca_find(ca->name)) {
-                pr_notice("%s already registered\n", ca->name);
+        if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+                pr_notice("%s already registered or non-unique key\n",
+                          ca->name);
                 ret = -EEXIST;
         } else {
                 list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
         spin_lock(&tcp_cong_list_lock);
         list_del_rcu(&ca->list);
         spin_unlock(&tcp_cong_list_lock);
+
+        /* Wait for outstanding readers to complete before the
+         * module gets removed entirely.
+         *
+         * A try_module_get() should fail by now as our module is
+         * in "going" state since no refs are held anymore and
+         * module_exit() handler being called.
+         */
+        synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+        const struct tcp_congestion_ops *ca;
+        u32 key;
+
+        might_sleep();
+
+        rcu_read_lock();
+        ca = __tcp_ca_find_autoload(name);
+        key = ca ? ca->key : TCP_CA_UNSPEC;
+        rcu_read_unlock();
+
+        return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+        const struct tcp_congestion_ops *ca;
+        char *ret = NULL;
+
+        rcu_read_lock();
+        ca = tcp_ca_find_key(key);
+        if (ca)
+                ret = strncpy(buffer, ca->name,
+                              TCP_CA_NAME_MAX);
+        rcu_read_unlock();
+
+        return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
 /* Assign choice of congestion control. */
 void tcp_assign_congestion_control(struct sock *sk)
 {
@@ -253,25 +326,17 @@ out:
 int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
-        struct tcp_congestion_ops *ca;
+        const struct tcp_congestion_ops *ca;
         int err = 0;
 
-        rcu_read_lock();
-        ca = tcp_ca_find(name);
+        if (icsk->icsk_ca_dst_locked)
+                return -EPERM;
 
-        /* no change asking for existing value */
+        rcu_read_lock();
+        ca = __tcp_ca_find_autoload(name);
+        /* No change asking for existing value */
         if (ca == icsk->icsk_ca_ops)
                 goto out;
-
-#ifdef CONFIG_MODULES
-        /* not found attempt to autoload module */
-        if (!ca && capable(CAP_NET_ADMIN)) {
-                rcu_read_unlock();
-                request_module("tcp_%s", name);
-                rcu_read_lock();
-                ca = tcp_ca_find(name);
-        }
-#endif
         if (!ca)
                 err = -ENOENT;
         else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||