aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
author: Daniel Borkmann <dborkman@redhat.com> 2015-01-05 17:57:48 -0500
committer: David S. Miller <davem@davemloft.net> 2015-01-05 22:55:24 -0500
commit: 81164413ad096bafe8ad1068f3f095a7dd081d8b (patch)
tree: 3b4de2d4483e984a20640aa9f42681aebb6b955e /net/ipv4/tcp_output.c
parent: ea697639992d96da98016b8934e68a73876a2264 (diff)
net: tcp: add per route congestion control
This work adds the possibility to define a per-route/destination congestion control algorithm. Generally, this opens up the possibility for a machine with different links to enforce specific congestion control algorithms with optimal strategies for each of them based on their network characteristics, even transparently for a single application listening on all links. For our specific use case, this additionally facilitates deployment of DCTCP; for example, applications can easily serve internal traffic/dsts with DCTCP and external ones with CUBIC. Other scenarios would also allow for utilizing e.g. long-lived, low-priority background flows for certain destinations/routes while still allowing normal traffic to utilize the default congestion control algorithm. We also thought about a per-netns setting (where different defaults are possible), but given it's actually a link-specific property, we argue that a per-route/destination setting is the most natural and flexible. The administrator can utilize this through ip-route(8) by appending "congctl [lock] <name>", where <name> denotes the name of a congestion control algorithm and the optional lock parameter allows enforcing the given algorithm so that applications in user space would not be allowed to overwrite that algorithm for that destination. The dst metric lookups are done when a dst entry is already available, in order to avoid a costly lookup, and still before the algorithms are initialized; thus overhead is very low when the feature is not being used. While the client side would need to drop the current reference on the module, on the server side this can actually even be avoided as we just got a flat-copied socket clone. Joint work with Florian Westphal. Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c21
1 files changed, 21 insertions, 0 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262e2326..dc30cb563e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2939} 2939}
2940EXPORT_SYMBOL(tcp_make_synack); 2940EXPORT_SYMBOL(tcp_make_synack);
2941 2941
2942static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
2943{
2944 struct inet_connection_sock *icsk = inet_csk(sk);
2945 const struct tcp_congestion_ops *ca;
2946 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
2947
2948 if (ca_key == TCP_CA_UNSPEC)
2949 return;
2950
2951 rcu_read_lock();
2952 ca = tcp_ca_find_key(ca_key);
2953 if (likely(ca && try_module_get(ca->owner))) {
2954 module_put(icsk->icsk_ca_ops->owner);
2955 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
2956 icsk->icsk_ca_ops = ca;
2957 }
2958 rcu_read_unlock();
2959}
2960
2942/* Do all connect socket setups that can be done AF independent. */ 2961/* Do all connect socket setups that can be done AF independent. */
2943static void tcp_connect_init(struct sock *sk) 2962static void tcp_connect_init(struct sock *sk)
2944{ 2963{
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
2964 tcp_mtup_init(sk); 2983 tcp_mtup_init(sk);
2965 tcp_sync_mss(sk, dst_mtu(dst)); 2984 tcp_sync_mss(sk, dst_mtu(dst));
2966 2985
2986 tcp_ca_dst_init(sk, dst);
2987
2967 if (!tp->window_clamp) 2988 if (!tp->window_clamp)
2968 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2989 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2969 tp->advmss = dst_metric_advmss(dst); 2990 tp->advmss = dst_metric_advmss(dst);