/* * Plugable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler suport and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <net/tcp.h> int sysctl_tcp_max_ssthresh = 0; static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ static struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } /* * Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ if (!ca->ssthresh || !ca->cong_avoid) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } spin_lock(&tcp_cong_list_lock); if (tcp_ca_find(ca->name)) { pr_notice("%s already registered\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_info("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; /* if no choice made yet assign the current value set as default */ if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { icsk->icsk_ca_ops = ca; break; } /* fallback to next available */ } rcu_read_unlock(); } if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); module_put(icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(const char *name) { struct tcp_congestion_ops *ca; int ret = -ENOENT; spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); } #endif if (ca) { ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ list_move(&ca->list, &tcp_cong_list); ret = 0; } spin_unlock(&tcp_cong_list_lock); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(char *name) { struct tcp_congestion_ops *ca; /* We will always have reno... */ BUG_ON(list_empty(&tcp_cong_list)); rcu_read_lock(); ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket */ int tcp_set_congestion_control(struct sock *sk, const char *name) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; int err = 0; rcu_read_lock(); ca = tcp_ca_find(name); /* no change asking for existing value */ if (ca == icsk->icsk_ca_ops) goto out; #ifdef CONFIG_MODULES /* not found attempt to autoload module */ if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) err = -EBUSY; else { tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); } out: rcu_read_unlock(); return err; } /* RFC2861 Check whether we are limited by application or congestion window * This is the inverse of cwnd check in tcp_tso_should_defer */ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) { const struct tcp_sock *tp = tcp_sk(sk); u32 left; if (in_flight >= tp->snd_cwnd) return true; left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && left * tp->mss_cache < sk->sk_gso_max_size && left < sk->sk_gso_max_segs) return true; return left <= tcp_max_tso_deferred_mss(tp); } EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); /* * Slow start is used when congestion window is less than slow start * threshold. This version implements the basic RFC2581 version * and optionally supports: * RFC3742 Limited Slow Start - growth limited to max_ssthresh * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged */ void tcp_slow_start(struct tcp_sock *tp) { int cnt; /* increase in packets */ unsigned int delta = 0; /* RFC3465: ABC Slow start * Increase only after a full MSS of bytes is acked * * TCP sender SHOULD increase cwnd by the number of * previously unacknowledged bytes ACKed by each incoming * acknowledgment, provided the increase is not more than L */ if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) return; if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ else cnt = tp->snd_cwnd; /* exponential increase */ /* RFC3465: ABC * We MAY increase by 2 if discovered delayed ack */ if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) cnt <<= 1; tp->bytes_acked = 0; tp->snd_cwnd_cnt += cnt; while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd_cnt -= tp->snd_cwnd; delta++; } tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) { if (tp->snd_cwnd_cnt >= w) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else { tp->snd_cwnd_cnt++; } } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk, in_flight)) return; /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); /* In dangerous area, increase slowly. */ else if (sysctl_tcp_abc) { /* RFC3465: Appropriate Byte Count * increase once for each full cwnd acked */ if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } } else { tcp_cong_avoid_ai(tp, tp->snd_cwnd); } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); /* Lower bound on congestion window with halving. */ u32 tcp_reno_min_cwnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; } EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, }; /* Initial congestion control used (until SYN) * really reno under another name so we can tell difference * during tcp_set_default_congestion_control */ struct tcp_congestion_ops tcp_init_congestion_ops = { .name = "", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);