aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_minisocks.c
diff options
context:
space:
mode:
author Daniel Borkmann <dborkman@redhat.com> 2015-01-05 17:57:48 -0500
committer David S. Miller <davem@davemloft.net> 2015-01-05 22:55:24 -0500
commit 81164413ad096bafe8ad1068f3f095a7dd081d8b (patch)
tree 3b4de2d4483e984a20640aa9f42681aebb6b955e /net/ipv4/tcp_minisocks.c
parent ea697639992d96da98016b8934e68a73876a2264 (diff)
net: tcp: add per route congestion control
This work adds the possibility to define a per-route/destination congestion control algorithm. Generally, this opens up the possibility for a machine with different links to enforce specific congestion control algorithms with optimal strategies for each of them based on their network characteristics, even transparently for a single application listening on all links. For our specific use case, this additionally facilitates deployment of DCTCP; for example, applications can easily serve internal traffic/dsts with DCTCP and external ones with CUBIC. Other scenarios would also allow for utilizing e.g. long-lived, low-priority background flows for certain destinations/routes while still allowing normal traffic to utilize the default congestion control algorithm. We also thought about a per-netns setting (where different defaults are possible), but given it's actually a link-specific property, we argue that a per-route/destination setting is the most natural and flexible. The administrator can utilize this through ip-route(8) by appending "congctl [lock] <name>", where <name> denotes the name of a congestion control algorithm and the optional lock parameter allows enforcing the given algorithm so that applications in user space would not be allowed to overwrite that algorithm for that destination. The dst metric lookups are done when a dst entry is already available, in order to avoid a costly lookup, and still before the algorithms are initialized; thus overhead is very low when the feature is not being used. While the client side would need to drop the current reference on the module, on the server side this can actually even be avoided as we just got a flat-copied socket clone. Joint work with Florian Westphal. Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r-- net/ipv4/tcp_minisocks.c 30
1 files changed, 26 insertions, 4 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..bc9216dc9de1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
400} 400}
401 401
/* tcp_ca_openreq_child - select the congestion control ops used by @sk.
 *
 * @sk:  socket whose inet_connection_sock is updated (presumably the freshly
 *       cloned child socket, judging by the function name - confirm at the
 *       call site).
 * @dst: route entry whose RTAX_CC_ALGO metric is consulted.
 *
 * If the RTAX_CC_ALGO metric of @dst is not TCP_CA_UNSPEC, the key is looked
 * up with tcp_ca_find_key() under rcu_read_lock(). When an algorithm is found
 * and a reference on its owner module is obtained, icsk_ca_ops and
 * icsk_ca_dst_locked are set from the route. Otherwise the ops already stored
 * in the socket are kept, provided a module reference can be taken on them;
 * if that fails, tcp_assign_congestion_control() is called instead.
 * In every case the CA state is finally set to TCP_CA_Open.
 */
402void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
403{
404 struct inet_connection_sock *icsk = inet_csk(sk);
405 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
406 bool ca_got_dst = false; /* set once the route supplied a usable algorithm */
407
408 if (ca_key != TCP_CA_UNSPEC) {
409 const struct tcp_congestion_ops *ca;
410
411 rcu_read_lock(); /* lookup and module-get happen inside the RCU read section */
412 ca = tcp_ca_find_key(ca_key);
413 if (likely(ca && try_module_get(ca->owner))) {
414 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
415 icsk->icsk_ca_ops = ca;
416 ca_got_dst = true;
417 }
418 rcu_read_unlock();
419 }
420
/* No algorithm taken from the route: try to pin the module of the ops already
 * in icsk; only if that fails, let tcp_assign_congestion_control() choose.
 */
421 if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
422 tcp_assign_congestion_control(sk);
423
424 tcp_set_ca_state(sk, TCP_CA_Open);
425}
426EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
427
402/* This is not only more efficient than what we used to do, it eliminates 428/* This is not only more efficient than what we used to do, it eliminates
403 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 429 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
404 * 430 *
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->snd_cwnd = TCP_INIT_CWND; 477 newtp->snd_cwnd = TCP_INIT_CWND;
452 newtp->snd_cwnd_cnt = 0; 478 newtp->snd_cwnd_cnt = 0;
453 479
454 if (!try_module_get(newicsk->icsk_ca_ops->owner))
455 tcp_assign_congestion_control(newsk);
456
457 tcp_set_ca_state(newsk, TCP_CA_Open);
458 tcp_init_xmit_timers(newsk); 480 tcp_init_xmit_timers(newsk);
459 __skb_queue_head_init(&newtp->out_of_order_queue); 481 __skb_queue_head_init(&newtp->out_of_order_queue);
460 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; 482 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;