aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorDaniel Borkmann <dborkman@redhat.com>2015-01-05 17:57:48 -0500
committerDavid S. Miller <davem@davemloft.net>2015-01-05 22:55:24 -0500
commit81164413ad096bafe8ad1068f3f095a7dd081d8b (patch)
tree3b4de2d4483e984a20640aa9f42681aebb6b955e /net/ipv4
parentea697639992d96da98016b8934e68a73876a2264 (diff)
net: tcp: add per route congestion control
This work adds the possibility to define a per route/destination congestion control algorithm. Generally, this opens up the possibility for a machine with different links to enforce specific congestion control algorithms with optimal strategies for each of them based on their network characteristics, even transparently for a single application listening on all links. For our specific use case, this additionally facilitates deployment of DCTCP, for example, applications can easily serve internal traffic/dsts in DCTCP and external one with CUBIC. Other scenarios would also allow for utilizing e.g. long-living, low-priority background flows for certain destinations/routes while still being able for normal traffic to utilize the default congestion control algorithm. We also thought about a per netns setting (where different defaults are possible), but given it's actually a link-specific property, we argue that a per route/destination setting is the most natural and flexible. The administrator can utilize this through ip-route(8) by appending "congctl [lock] <name>", where <name> denotes the name of a congestion control algorithm and the optional lock parameter allows to enforce the given algorithm so that applications in user space would not be allowed to overwrite that algorithm for that destination. The dst metric lookups are being done when a dst entry is already available in order to avoid a costly lookup and still before the algorithms are being initialized, thus overhead is very low when the feature is not being used. While the client side would need to drop the current reference on the module, on server side this can actually even be avoided as we just got a flat-copied socket clone. Joint work with Florian Westphal. Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_minisocks.c30
-rw-r--r--net/ipv4/tcp_output.c21
3 files changed, 49 insertions, 4 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..ad3e65bdd368 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1340 } 1340 }
1341 sk_setup_caps(newsk, dst); 1341 sk_setup_caps(newsk, dst);
1342 1342
1343 tcp_ca_openreq_child(newsk, dst);
1344
1343 tcp_sync_mss(newsk, dst_mtu(dst)); 1345 tcp_sync_mss(newsk, dst_mtu(dst));
1344 newtp->advmss = dst_metric_advmss(dst); 1346 newtp->advmss = dst_metric_advmss(dst);
1345 if (tcp_sk(sk)->rx_opt.user_mss && 1347 if (tcp_sk(sk)->rx_opt.user_mss &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..bc9216dc9de1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -399,6 +399,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
400} 400}
401 401
402void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
403{
404 struct inet_connection_sock *icsk = inet_csk(sk);
405 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
406 bool ca_got_dst = false;
407
408 if (ca_key != TCP_CA_UNSPEC) {
409 const struct tcp_congestion_ops *ca;
410
411 rcu_read_lock();
412 ca = tcp_ca_find_key(ca_key);
413 if (likely(ca && try_module_get(ca->owner))) {
414 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
415 icsk->icsk_ca_ops = ca;
416 ca_got_dst = true;
417 }
418 rcu_read_unlock();
419 }
420
421 if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
422 tcp_assign_congestion_control(sk);
423
424 tcp_set_ca_state(sk, TCP_CA_Open);
425}
426EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
427
402/* This is not only more efficient than what we used to do, it eliminates 428/* This is not only more efficient than what we used to do, it eliminates
403 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 429 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
404 * 430 *
@@ -451,10 +477,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->snd_cwnd = TCP_INIT_CWND; 477 newtp->snd_cwnd = TCP_INIT_CWND;
452 newtp->snd_cwnd_cnt = 0; 478 newtp->snd_cwnd_cnt = 0;
453 479
454 if (!try_module_get(newicsk->icsk_ca_ops->owner))
455 tcp_assign_congestion_control(newsk);
456
457 tcp_set_ca_state(newsk, TCP_CA_Open);
458 tcp_init_xmit_timers(newsk); 480 tcp_init_xmit_timers(newsk);
459 __skb_queue_head_init(&newtp->out_of_order_queue); 481 __skb_queue_head_init(&newtp->out_of_order_queue);
460 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; 482 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262e2326..dc30cb563e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2939,6 +2939,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2939} 2939}
2940EXPORT_SYMBOL(tcp_make_synack); 2940EXPORT_SYMBOL(tcp_make_synack);
2941 2941
2942static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
2943{
2944 struct inet_connection_sock *icsk = inet_csk(sk);
2945 const struct tcp_congestion_ops *ca;
2946 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
2947
2948 if (ca_key == TCP_CA_UNSPEC)
2949 return;
2950
2951 rcu_read_lock();
2952 ca = tcp_ca_find_key(ca_key);
2953 if (likely(ca && try_module_get(ca->owner))) {
2954 module_put(icsk->icsk_ca_ops->owner);
2955 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
2956 icsk->icsk_ca_ops = ca;
2957 }
2958 rcu_read_unlock();
2959}
2960
2942/* Do all connect socket setups that can be done AF independent. */ 2961/* Do all connect socket setups that can be done AF independent. */
2943static void tcp_connect_init(struct sock *sk) 2962static void tcp_connect_init(struct sock *sk)
2944{ 2963{
@@ -2964,6 +2983,8 @@ static void tcp_connect_init(struct sock *sk)
2964 tcp_mtup_init(sk); 2983 tcp_mtup_init(sk);
2965 tcp_sync_mss(sk, dst_mtu(dst)); 2984 tcp_sync_mss(sk, dst_mtu(dst));
2966 2985
2986 tcp_ca_dst_init(sk, dst);
2987
2967 if (!tp->window_clamp) 2988 if (!tp->window_clamp)
2968 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2989 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2969 tp->advmss = dst_metric_advmss(dst); 2990 tp->advmss = dst_metric_advmss(dst);