[TCP]: Add TCP Vegas congestion control module.

TCP Vegas code modified for the new TCP infrastructure. Vegas now uses microsecond resolution timestamps for better estimation of performance over higher speed links. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Stephen Hemminger <shemminger@osdl.org> 2005-06-23 15:27:19 -0400
committer: David S. Miller <davem@davemloft.net> 2005-06-23 15:27:19 -0400
commit: b87d8561d8667d221b728ccdcb18eb95b16a687b (patch)
tree: 715b8e8d8442e418364498a12712106530031b96 /net
parent: 835b3f0c0d7e1f716c45ec576662eac7a68b8548 (diff)
3 files changed, 423 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 516ffe842816..6c105b60cc00 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -488,6 +488,17 @@ config TCP_CONG_HYBLA
        involved, expecially when sharing a common bottleneck with normal
        terrestrial connections.
+config TCP_CONG_VEGAS
+        tristate "TCP Vegas"
+        depends on INET && EXPERIMENTAL
+        default n
+        ---help---
+        TCP Vegas is a sender-side only change to TCP that anticipates
+        the onset of congestion by estimating the bandwidth. TCP Vegas
+        adjusts the sending rate by modifying the congestion
+        window. TCP Vegas should provide less packet loss, but it is
+        not as aggressive as TCP Reno.
 endmenu
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ede7a5d617ff..a801a97bd011 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
                      xfrm4_output.o
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *      ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+#include <net/tcp.h>
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static int alpha = 1<<V_PARAM_SHIFT;
+static int beta  = 3<<V_PARAM_SHIFT;
+static int gamma = 1<<V_PARAM_SHIFT;
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+/* Vegas variables */
+struct vegas {
+        u32     beg_snd_nxt;    /* right edge during last RTT */
+        u32     beg_snd_una;    /* left edge  during last RTT */
+        u32     beg_snd_cwnd;   /* saves the size of the cwnd */
+        u8      doing_vegas_now;/* if true, do vegas for this RTT */
+        u16     cntRTT;         /* # of RTTs measured within last RTT */
+        u32     minRTT;         /* min of RTTs measured within last RTT (in usec) */
+        u32     baseRTT;        /* the min of all Vegas RTT measurements seen (in usec) */
+};
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct tcp_sock *tp)
+{
+        struct vegas *vegas = tcp_ca(tp);
+        /* Begin taking Vegas samples next time we send something. */
+        vegas->doing_vegas_now = 1;
+        /* Set the beginning of the next send window. */
+        vegas->beg_snd_nxt = tp->snd_nxt;
+        vegas->cntRTT = 0;
+        vegas->minRTT = 0x7fffffff;
+}
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct tcp_sock *tp)
+{
+        struct vegas *vegas = tcp_ca(tp);
+        vegas->doing_vegas_now = 0;
+}
+static void tcp_vegas_init(struct tcp_sock *tp)
+{
+        struct vegas *vegas = tcp_ca(tp);
+        vegas->baseRTT = 0x7fffffff;
+        vegas_enable(tp);
+}
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+{
+        struct vegas *vegas = tcp_ca(tp);
+        u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
+        /* Filter to find propagation delay: */
+        if (vrtt < vegas->baseRTT)
+                vegas->baseRTT = vrtt;
+        /* Find the min RTT during the last RTT to find
+         * the current prop. delay + queuing delay:
+         */
+        vegas->minRTT = min(vegas->minRTT, vrtt);
+        vegas->cntRTT++;
+}
+static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+{
+        if (ca_state == TCP_CA_Open)
+                vegas_enable(tp);
+        else
+                vegas_disable(tp);
+}
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+        if (event == CA_EVENT_CWND_RESTART ||
+            event == CA_EVENT_TX_START)
+                tcp_vegas_init(tp);
+}
+static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+                                 u32 seq_rtt, u32 in_flight, int flag)
+{
+        struct vegas *vegas = tcp_ca(tp);
+        if (!vegas->doing_vegas_now)
+                return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+        /* The key players are v_beg_snd_una and v_beg_snd_nxt.
+         *
+         * These are so named because they represent the approximate values
+         * of snd_una and snd_nxt at the beginning of the current RTT. More
+         * precisely, they represent the amount of data sent during the RTT.
+         * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+         * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+         * bytes of data have been ACKed during the course of the RTT, giving
+         * an "actual" rate of:
+         *
+         *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+         *
+         * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+         * because delayed ACKs can cover more than one segment, so they
+         * don't line up nicely with the boundaries of RTTs.
+         *
+         * Another unfortunate fact of life is that delayed ACKs delay the
+         * advance of the left edge of our send window, so that the number
+         * of bytes we send in an RTT is often less than our cwnd will allow.
+         * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+         */
+        if (after(ack, vegas->beg_snd_nxt)) {
+                /* Do the Vegas once-per-RTT cwnd adjustment. */
+                u32 old_wnd, old_snd_cwnd;
+                /* Here old_wnd is essentially the window of data that was
+                 * sent during the previous RTT, and has all
+                 * been acknowledged in the course of the RTT that ended
+                 * with the ACK we just received. Likewise, old_snd_cwnd
+                 * is the cwnd during the previous RTT.
+                 */
+                old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+                        tp->mss_cache;
+                old_snd_cwnd = vegas->beg_snd_cwnd;
+                /* Save the extent of the current window so we can use this
+                 * at the end of the next RTT.
+                 */
+                vegas->beg_snd_una  = vegas->beg_snd_nxt;
+                vegas->beg_snd_nxt  = tp->snd_nxt;
+                vegas->beg_snd_cwnd = tp->snd_cwnd;
+                /* Take into account the current RTT sample too, to
+                 * decrease the impact of delayed acks. This double counts
+                 * this sample since we count it for the next window as well,
+                 * but that's not too awful, since we're taking the min,
+                 * rather than averaging.
+                 */
+                tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+                /* We do the Vegas calculations only if we got enough RTT
+                 * samples that we can be reasonably sure that we got
+                 * at least one RTT sample that wasn't from a delayed ACK.
+                 * If we only had 2 samples total,
+                 * then that means we're getting only 1 ACK per RTT, which
+                 * means they're almost certainly delayed ACKs.
+                 * If  we have 3 samples, we should be OK.
+                 */
+                if (vegas->cntRTT <= 2) {
+                        /* We don't have enough RTT samples to do the Vegas
+                         * calculation, so we'll behave like Reno.
+                         */
+                        if (tp->snd_cwnd > tp->snd_ssthresh)
+                                tp->snd_cwnd++;
+                } else {
+                        u32 rtt, target_cwnd, diff;
+                        /* We have enough RTT samples, so, using the Vegas
+                         * algorithm, we determine if we should increase or
+                         * decrease cwnd, and by how much.
+                         */
+                        /* Pluck out the RTT we are using for the Vegas
+                         * calculations. This is the min RTT seen during the
+                         * last RTT. Taking the min filters out the effects
+                         * of delayed ACKs, at the cost of noticing congestion
+                         * a bit later.
+                         */
+                        rtt = vegas->minRTT;
+                        /* Calculate the cwnd we should have, if we weren't
+                         * going too fast.
+                         *
+                         * This is:
+                         *     (actual rate in segments) * baseRTT
+                         * We keep it as a fixed point number with
+                         * V_PARAM_SHIFT bits to the right of the binary point.
+                         */
+                        target_cwnd = ((old_wnd * vegas->baseRTT)
+                                       << V_PARAM_SHIFT) / rtt;
+                        /* Calculate the difference between the window we had,
+                         * and the window we would like to have. This quantity
+                         * is the "Diff" from the Arizona Vegas papers.
+                         *
+                         * Again, this is a fixed point number with
+                         * V_PARAM_SHIFT bits to the right of the binary
+                         * point.
+                         */
+                        diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+                        if (tp->snd_cwnd < tp->snd_ssthresh) {
+                                /* Slow start.  */
+                                if (diff > gamma) {
+                                        /* Going too fast. Time to slow down
+                                         * and switch to congestion avoidance.
+                                         */
+                                        tp->snd_ssthresh = 2;
+                                        /* Set cwnd to match the actual rate
+                                         * exactly:
+                                         *   cwnd = (actual rate) * baseRTT
+                                         * Then we add 1 because the integer
+                                         * truncation robs us of full link
+                                         * utilization.
+                                         */
+                                        tp->snd_cwnd = min(tp->snd_cwnd,
+                                                           (target_cwnd >>
+                                                            V_PARAM_SHIFT)+1);
+                                }
+                        } else {
+                                /* Congestion avoidance. */
+                                u32 next_snd_cwnd;
+                                /* Figure out where we would like cwnd
+                                 * to be.
+                                 */
+                                if (diff > beta) {
+                                        /* The old window was too fast, so
+                                         * we slow down.
+                                         */
+                                        next_snd_cwnd = old_snd_cwnd - 1;
+                                } else if (diff < alpha) {
+                                        /* We don't have enough extra packets
+                                         * in the network, so speed up.
+                                         */
+                                        next_snd_cwnd = old_snd_cwnd + 1;
+                                } else {
+                                        /* Sending just as fast as we
+                                         * should be.
+                                         */
+                                        next_snd_cwnd = old_snd_cwnd;
+                                }
+                                /* Adjust cwnd upward or downward, toward the
+                                 * desired value.
+                                 */
+                                if (next_snd_cwnd > tp->snd_cwnd)
+                                        tp->snd_cwnd++;
+                                else if (next_snd_cwnd < tp->snd_cwnd)
+                                        tp->snd_cwnd--;
+                        }
+                }
+                /* Wipe the slate clean for the next RTT. */
+                vegas->cntRTT = 0;
+                vegas->minRTT = 0x7fffffff;
+        }
+        /* The following code is executed for every ack we receive,
+         * except for conditions checked in should_advance_cwnd()
+         * before the call to tcp_cong_avoid(). Mainly this means that
+         * we only execute this code if the ack actually acked some
+         * data.
+         */
+        /* If we are in slow start, increase our cwnd in response to this ACK.
+         * (If we are not in slow start then we are in congestion avoidance,
+         * and adjust our congestion window only once per RTT. See the code
+         * above.)
+         */
+        if (tp->snd_cwnd <= tp->snd_ssthresh)
+                tp->snd_cwnd++;
+        /* to keep cwnd from growing without bound */
+        tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+        /* Make sure that we are never so timid as to reduce our cwnd below
+         * 2 MSS.
+         *
+         * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+         */
+        tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+}
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+                               struct sk_buff *skb)
+{
+        const struct vegas *ca = tcp_ca(tp);
+        if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+                struct tcpvegas_info *info;
+                info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+                                          sizeof(*info)));
+                info->tcpv_enabled = ca->doing_vegas_now;
+                info->tcpv_rttcnt = ca->cntRTT;
+                info->tcpv_rtt = ca->baseRTT;
+                info->tcpv_minrtt = ca->minRTT;
+        rtattr_failure: ;
+        }
+}
+static struct tcp_congestion_ops tcp_vegas = {
+        .init           = tcp_vegas_init,
+        .ssthresh       = tcp_reno_ssthresh,
+        .cong_avoid     = tcp_vegas_cong_avoid,
+        .min_cwnd       = tcp_reno_min_cwnd,
+        .rtt_sample     = tcp_vegas_rtt_calc,
+        .set_state      = tcp_vegas_state,
+        .cwnd_event     = tcp_vegas_cwnd_event,
+        .get_info       = tcp_vegas_get_info,
+        .owner          = THIS_MODULE,
+        .name           = "vegas",
+};
+static int __init tcp_vegas_register(void)
+{
+        BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+        tcp_register_congestion_control(&tcp_vegas);
+        return 0;
+}
+static void __exit tcp_vegas_unregister(void)
+{
+        tcp_unregister_congestion_control(&tcp_vegas);
+}
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
author	Stephen Hemminger <shemminger@osdl.org>	2005-06-23 15:27:19 -0400
committer	David S. Miller <davem@davemloft.net>	2005-06-23 15:27:19 -0400
commit	b87d8561d8667d221b728ccdcb18eb95b16a687b (patch)
tree	715b8e8d8442e418364498a12712106530031b96 /net
parent	835b3f0c0d7e1f716c45ec576662eac7a68b8548 (diff)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 516ffe842816..6c105b60cc00 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig
@@ -488,6 +488,17 @@ config TCP_CONG_HYBLA
488	involved, expecially when sharing a common bottleneck with normal	488	involved, expecially when sharing a common bottleneck with normal
489	terrestrial connections.	489	terrestrial connections.
490		490
		491	config TCP_CONG_VEGAS
		492	tristate "TCP Vegas"
		493	depends on INET && EXPERIMENTAL
		494	default n
		495	---help---
		496	TCP Vegas is a sender-side only change to TCP that anticipates
		497	the onset of congestion by estimating the bandwidth. TCP Vegas
		498	adjusts the sending rate by modifying the congestion
		499	window. TCP Vegas should provide less packet loss, but it is
		500	not as aggressive as TCP Reno.
		501
491	endmenu	502	endmenu
492		503
493	source "net/ipv4/ipvs/Kconfig"	504	source "net/ipv4/ipvs/Kconfig"


diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ede7a5d617ff..a801a97bd011 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35	obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o	35	obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36	obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o	36	obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37	obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o	37	obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
		38	obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
38		39
39	obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \	40	obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
40	xfrm4_output.o	41	xfrm4_output.o


diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 000000000000..9bd443db5193 --- /dev/null +++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
		1	/*
		2	* TCP Vegas congestion control
		3	*
		4	* This is based on the congestion detection/avoidance scheme described in
		5	* Lawrence S. Brakmo and Larry L. Peterson.
		6	* "TCP Vegas: End to end congestion avoidance on a global internet."
		7	* IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
		8	* October 1995. Available from:
		9	* ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
		10	*
		11	* See http://www.cs.arizona.edu/xkernel/ for their implementation.
		12	* The main aspects that distinguish this implementation from the
		13	* Arizona Vegas implementation are:
		14	* o We do not change the loss detection or recovery mechanisms of
		15	* Linux in any way. Linux already recovers from losses quite well,
		16	* using fine-grained timers, NewReno, and FACK.
		17	* o To avoid the performance penalty imposed by increasing cwnd
		18	* only every-other RTT during slow start, we increase during
		19	* every RTT during slow start, just like Reno.
		20	* o Largely to allow continuous cwnd growth during slow start,
		21	* we use the rate at which ACKs come back as the "actual"
		22	* rate, rather than the rate at which data is sent.
		23	* o To speed convergence to the right rate, we set the cwnd
		24	* to achieve the right ("actual") rate when we exit slow start.
		25	* o To filter out the noise caused by delayed ACKs, we use the
		26	* minimum RTT sample observed during the last RTT to calculate
		27	* the actual rate.
		28	* o When the sender re-starts from idle, it waits until it has
		29	* received ACKs for an entire flight of new data before making
		30	* a cwnd adjustment decision. The original Vegas implementation
		31	* assumed senders never went idle.
		32	*/
		33
		34	#include <linux/config.h>
		35	#include <linux/mm.h>
		36	#include <linux/module.h>
		37	#include <linux/skbuff.h>
		38	#include <linux/tcp_diag.h>
		39
		40	#include <net/tcp.h>
		41
		42	/* Default values of the Vegas variables, in fixed-point representation
		43	* with V_PARAM_SHIFT bits to the right of the binary point.
		44	*/
		45	#define V_PARAM_SHIFT 1
		46	static int alpha = 1<<V_PARAM_SHIFT;
		47	static int beta = 3<<V_PARAM_SHIFT;
		48	static int gamma = 1<<V_PARAM_SHIFT;
		49
		50	module_param(alpha, int, 0644);
		51	MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
		52	module_param(beta, int, 0644);
		53	MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
		54	module_param(gamma, int, 0644);
		55	MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
		56
		57
		58	/* Vegas variables */
		59	struct vegas {
		60	u32 beg_snd_nxt; /* right edge during last RTT */
		61	u32 beg_snd_una; /* left edge during last RTT */
		62	u32 beg_snd_cwnd; /* saves the size of the cwnd */
		63	u8 doing_vegas_now;/* if true, do vegas for this RTT */
		64	u16 cntRTT; /* # of RTTs measured within last RTT */
		65	u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
		66	u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
		67	};
		68
		69	/* There are several situations when we must "re-start" Vegas:
		70	*
		71	* o when a connection is established
		72	* o after an RTO
		73	* o after fast recovery
		74	* o when we send a packet and there is no outstanding
		75	* unacknowledged data (restarting an idle connection)
		76	*
		77	* In these circumstances we cannot do a Vegas calculation at the
		78	* end of the first RTT, because any calculation we do is using
		79	* stale info -- both the saved cwnd and congestion feedback are
		80	* stale.
		81	*
		82	* Instead we must wait until the completion of an RTT during
		83	* which we actually receive ACKs.
		84	*/
		85	static inline void vegas_enable(struct tcp_sock *tp)
		86	{
		87	struct vegas *vegas = tcp_ca(tp);
		88
		89	/* Begin taking Vegas samples next time we send something. */
		90	vegas->doing_vegas_now = 1;
		91
		92	/* Set the beginning of the next send window. */
		93	vegas->beg_snd_nxt = tp->snd_nxt;
		94
		95	vegas->cntRTT = 0;
		96	vegas->minRTT = 0x7fffffff;
		97	}
		98
		99	/* Stop taking Vegas samples for now. */
		100	static inline void vegas_disable(struct tcp_sock *tp)
		101	{
		102	struct vegas *vegas = tcp_ca(tp);
		103
		104	vegas->doing_vegas_now = 0;
		105	}
		106
		107	static void tcp_vegas_init(struct tcp_sock *tp)
		108	{
		109	struct vegas *vegas = tcp_ca(tp);
		110
		111	vegas->baseRTT = 0x7fffffff;
		112	vegas_enable(tp);
		113	}
		114
		115	/* Do RTT sampling needed for Vegas.
		116	* Basically we:
		117	* o min-filter RTT samples from within an RTT to get the current
		118	* propagation delay + queuing delay (we are min-filtering to try to
		119	* avoid the effects of delayed ACKs)
		120	* o min-filter RTT samples from a much longer window (forever for now)
		121	* to find the propagation delay (baseRTT)
		122	*/
		123	static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
		124	{
		125	struct vegas *vegas = tcp_ca(tp);
		126	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
		127
		128	/* Filter to find propagation delay: */
		129	if (vrtt < vegas->baseRTT)
		130	vegas->baseRTT = vrtt;
		131
		132	/* Find the min RTT during the last RTT to find
		133	* the current prop. delay + queuing delay:
		134	*/
		135	vegas->minRTT = min(vegas->minRTT, vrtt);
		136	vegas->cntRTT++;
		137	}
		138
		139	static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
		140	{
		141
		142	if (ca_state == TCP_CA_Open)
		143	vegas_enable(tp);
		144	else
		145	vegas_disable(tp);
		146	}
		147
		148	/*
		149	* If the connection is idle and we are restarting,
		150	* then we don't want to do any Vegas calculations
		151	* until we get fresh RTT samples. So when we
		152	* restart, we reset our Vegas state to a clean
		153	* slate. After we get acks for this flight of
		154	* packets, _then_ we can make Vegas calculations
		155	* again.
		156	*/
		157	static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
		158	{
		159	if (event == CA_EVENT_CWND_RESTART \|\|
		160	event == CA_EVENT_TX_START)
		161	tcp_vegas_init(tp);
		162	}
		163
		164	static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
		165	u32 seq_rtt, u32 in_flight, int flag)
		166	{
		167	struct vegas *vegas = tcp_ca(tp);
		168
		169	if (!vegas->doing_vegas_now)
		170	return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
		171
		172	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
		173	*
		174	* These are so named because they represent the approximate values
		175	* of snd_una and snd_nxt at the beginning of the current RTT. More
		176	* precisely, they represent the amount of data sent during the RTT.
		177	* At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
		178	* we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
		179	* bytes of data have been ACKed during the course of the RTT, giving
		180	* an "actual" rate of:
		181	*
		182	* (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
		183	*
		184	* Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
		185	* because delayed ACKs can cover more than one segment, so they
		186	* don't line up nicely with the boundaries of RTTs.
		187	*
		188	* Another unfortunate fact of life is that delayed ACKs delay the
		189	* advance of the left edge of our send window, so that the number
		190	* of bytes we send in an RTT is often less than our cwnd will allow.
		191	* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
		192	*/
		193
		194	if (after(ack, vegas->beg_snd_nxt)) {
		195	/* Do the Vegas once-per-RTT cwnd adjustment. */
		196	u32 old_wnd, old_snd_cwnd;
		197
		198
		199	/* Here old_wnd is essentially the window of data that was
		200	* sent during the previous RTT, and has all
		201	* been acknowledged in the course of the RTT that ended
		202	* with the ACK we just received. Likewise, old_snd_cwnd
		203	* is the cwnd during the previous RTT.
		204	*/
		205	old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
		206	tp->mss_cache;
		207	old_snd_cwnd = vegas->beg_snd_cwnd;
		208
		209	/* Save the extent of the current window so we can use this
		210	* at the end of the next RTT.
		211	*/
		212	vegas->beg_snd_una = vegas->beg_snd_nxt;
		213	vegas->beg_snd_nxt = tp->snd_nxt;
		214	vegas->beg_snd_cwnd = tp->snd_cwnd;
		215
		216	/* Take into account the current RTT sample too, to
		217	* decrease the impact of delayed acks. This double counts
		218	* this sample since we count it for the next window as well,
		219	* but that's not too awful, since we're taking the min,
		220	* rather than averaging.
		221	*/
		222	tcp_vegas_rtt_calc(tp, seq_rtt*1000);
		223
		224	/* We do the Vegas calculations only if we got enough RTT
		225	* samples that we can be reasonably sure that we got
		226	* at least one RTT sample that wasn't from a delayed ACK.
		227	* If we only had 2 samples total,
		228	* then that means we're getting only 1 ACK per RTT, which
		229	* means they're almost certainly delayed ACKs.
		230	* If we have 3 samples, we should be OK.
		231	*/
		232
		233	if (vegas->cntRTT <= 2) {
		234	/* We don't have enough RTT samples to do the Vegas
		235	* calculation, so we'll behave like Reno.
		236	*/
		237	if (tp->snd_cwnd > tp->snd_ssthresh)
		238	tp->snd_cwnd++;
		239	} else {
		240	u32 rtt, target_cwnd, diff;
		241
		242	/* We have enough RTT samples, so, using the Vegas
		243	* algorithm, we determine if we should increase or
		244	* decrease cwnd, and by how much.
		245	*/
		246
		247	/* Pluck out the RTT we are using for the Vegas
		248	* calculations. This is the min RTT seen during the
		249	* last RTT. Taking the min filters out the effects
		250	* of delayed ACKs, at the cost of noticing congestion
		251	* a bit later.
		252	*/
		253	rtt = vegas->minRTT;
		254
		255	/* Calculate the cwnd we should have, if we weren't
		256	* going too fast.
		257	*
		258	* This is:
		259	* (actual rate in segments) * baseRTT
		260	* We keep it as a fixed point number with
		261	* V_PARAM_SHIFT bits to the right of the binary point.
		262	*/
		263	target_cwnd = ((old_wnd * vegas->baseRTT)
		264	<< V_PARAM_SHIFT) / rtt;
		265
		266	/* Calculate the difference between the window we had,
		267	* and the window we would like to have. This quantity
		268	* is the "Diff" from the Arizona Vegas papers.
		269	*
		270	* Again, this is a fixed point number with
		271	* V_PARAM_SHIFT bits to the right of the binary
		272	* point.
		273	*/
		274	diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
		275
		276	if (tp->snd_cwnd < tp->snd_ssthresh) {
		277	/* Slow start. */
		278	if (diff > gamma) {
		279	/* Going too fast. Time to slow down
		280	* and switch to congestion avoidance.
		281	*/
		282	tp->snd_ssthresh = 2;
		283
		284	/* Set cwnd to match the actual rate
		285	* exactly:
		286	* cwnd = (actual rate) * baseRTT
		287	* Then we add 1 because the integer
		288	* truncation robs us of full link
		289	* utilization.
		290	*/
		291	tp->snd_cwnd = min(tp->snd_cwnd,
		292	(target_cwnd >>
		293	V_PARAM_SHIFT)+1);
		294
		295	}
		296	} else {
		297	/* Congestion avoidance. */
		298	u32 next_snd_cwnd;
		299
		300	/* Figure out where we would like cwnd
		301	* to be.
		302	*/
		303	if (diff > beta) {
		304	/* The old window was too fast, so
		305	* we slow down.
		306	*/
		307	next_snd_cwnd = old_snd_cwnd - 1;
		308	} else if (diff < alpha) {
		309	/* We don't have enough extra packets
		310	* in the network, so speed up.
		311	*/
		312	next_snd_cwnd = old_snd_cwnd + 1;
		313	} else {
		314	/* Sending just as fast as we
		315	* should be.
		316	*/
		317	next_snd_cwnd = old_snd_cwnd;
		318	}
		319
		320	/* Adjust cwnd upward or downward, toward the
		321	* desired value.
		322	*/
		323	if (next_snd_cwnd > tp->snd_cwnd)
		324	tp->snd_cwnd++;
		325	else if (next_snd_cwnd < tp->snd_cwnd)
		326	tp->snd_cwnd--;
		327	}
		328	}
		329
		330	/* Wipe the slate clean for the next RTT. */
		331	vegas->cntRTT = 0;
		332	vegas->minRTT = 0x7fffffff;
		333	}
		334
		335	/* The following code is executed for every ack we receive,
		336	* except for conditions checked in should_advance_cwnd()
		337	* before the call to tcp_cong_avoid(). Mainly this means that
		338	* we only execute this code if the ack actually acked some
		339	* data.
		340	*/
		341
		342	/* If we are in slow start, increase our cwnd in response to this ACK.
		343	* (If we are not in slow start then we are in congestion avoidance,
		344	* and adjust our congestion window only once per RTT. See the code
		345	* above.)
		346	*/
		347	if (tp->snd_cwnd <= tp->snd_ssthresh)
		348	tp->snd_cwnd++;
		349
		350	/* to keep cwnd from growing without bound */
		351	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
		352
		353	/* Make sure that we are never so timid as to reduce our cwnd below
		354	* 2 MSS.
		355	*
		356	* Going below 2 MSS would risk huge delayed ACKs from our receiver.
		357	*/
		358	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
		359	}
		360
		361	/* Extract info for Tcp socket info provided via netlink. */
		362	static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
		363	struct sk_buff *skb)
		364	{
		365	const struct vegas *ca = tcp_ca(tp);
		366	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
		367	struct tcpvegas_info *info;
		368
		369	info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
		370	sizeof(*info)));
		371
		372	info->tcpv_enabled = ca->doing_vegas_now;
		373	info->tcpv_rttcnt = ca->cntRTT;
		374	info->tcpv_rtt = ca->baseRTT;
		375	info->tcpv_minrtt = ca->minRTT;
		376	rtattr_failure: ;
		377	}
		378	}
		379
		380	static struct tcp_congestion_ops tcp_vegas = {
		381	.init = tcp_vegas_init,
		382	.ssthresh = tcp_reno_ssthresh,
		383	.cong_avoid = tcp_vegas_cong_avoid,
		384	.min_cwnd = tcp_reno_min_cwnd,
		385	.rtt_sample = tcp_vegas_rtt_calc,
		386	.set_state = tcp_vegas_state,
		387	.cwnd_event = tcp_vegas_cwnd_event,
		388	.get_info = tcp_vegas_get_info,
		389
		390	.owner = THIS_MODULE,
		391	.name = "vegas",
		392	};
		393
		394	static int __init tcp_vegas_register(void)
		395	{
		396	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
		397	tcp_register_congestion_control(&tcp_vegas);
		398	return 0;
		399	}
		400
		401	static void __exit tcp_vegas_unregister(void)
		402	{
		403	tcp_unregister_congestion_control(&tcp_vegas);
		404	}
		405
		406	module_init(tcp_vegas_register);
		407	module_exit(tcp_vegas_unregister);
		408
		409	MODULE_AUTHOR("Stephen Hemminger");
		410	MODULE_LICENSE("GPL");
		411	MODULE_DESCRIPTION("TCP Vegas");