4 files changed, 437 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9e8ef509c51d..dc61e6641624 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -574,6 +574,20 @@ config TCP_CONG_VENO
        loss packets.
        See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+config TCP_CONG_YEAH
+        tristate "YeAH TCP"
+        depends on EXPERIMENTAL
+        default n
+        ---help---
+        YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+        algorithm, which uses a mixed loss/delay approach to compute the
+        congestion window. Its design goals target high efficiency,
+        internal, RTT and Reno fairness, resilience to link loss while
+        keeping network elements load as low as possible.
+        For further details look here:
+          http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
 choice
        prompt "Default TCP congestion control"
        default DEFAULT_CUBIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7a068626feea..eeb94d5cac96 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000000..815e020e98fe
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,288 @@
+/*
+ *
+ *   YeAH TCP
+ *
+ * For further details look at:
+ *    http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include "tcp_yeah.h"
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ * NOTE(review): V_PARAM_SHIFT is not referenced by any code visible in
+ * this file.
+ */
+#define V_PARAM_SHIFT 1
+/* YeAH tuning constants.  "lin" marks a value used directly as a count
+ * or divisor; "log" marks a value used as a right-shift amount, i.e. the
+ * log2 of the fraction of cwnd it stands for.
+ */
+#define TCP_YEAH_ALPHA       80 /* lin: number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA        1 /* lin: fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA        3 /* log: minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON      1 /* log: maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY          8 /* lin: maximum delta from base */
+#define TCP_YEAH_RHO         16 /* lin: minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA        50 /* lin: minimum number of state switches to reset reno_count */
+/* Upper bound on the ACKed-packet count that the Scalable branch of
+ * tcp_yeah_cong_avoid() requires before adding one segment to cwnd.
+ */
+#define TCP_SCALABLE_AI_CNT      100U
+/* YeAH variables: per-connection state obtained through inet_csk_ca().
+ *
+ * The leading "Vegas" members repeat struct vegas member for member.  The
+ * tcp_vegas_* helpers read and write this same memory through a
+ * struct vegas pointer, so the two declarations have to stay in step.
+ */
+struct yeah {
+        /* Vegas */
+        u32     beg_snd_nxt;    /* right edge during last RTT */
+        u32     beg_snd_una;    /* left edge  during last RTT */
+        u32     beg_snd_cwnd;   /* saves the size of the cwnd */
+        u8      doing_vegas_now;/* if true, do vegas for this RTT */
+        u16     cntRTT;         /* # of RTTs measured within last RTT */
+        u32     minRTT;         /* min of RTTs measured within last RTT (in usec) */
+        u32     baseRTT;        /* the min of all Vegas RTT measurements seen (in usec) */
+        /* YeAH */
+        u32 lastQ;              /* queue estimate from the latest per-RTT evaluation */
+        u32 doing_reno_now;     /* 0 selects the Scalable increase; otherwise counts
+                                 * consecutive evaluations that chose the Reno increase */
+        u32 reno_count;         /* cwnd is not reduced below this by early decongestion */
+        u32 fast_count;         /* evaluations that chose the Scalable increase; reaching
+                                 * more than TCP_YEAH_ZETA resets reno_count */
+        u32 pkts_acked;         /* value handed to the pkts_acked hook, added to
+                                 * snd_cwnd_cnt by the Scalable increase */
+};
+/* init hook: put the per-connection YeAH state into a known condition.
+ *
+ * Runs the shared Vegas initialisation, then sets every YeAH-specific
+ * member explicitly and tightens snd_cwnd_clamp.
+ *
+ * @sk: socket whose congestion-control private area is initialised
+ */
+static void tcp_yeah_init(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct yeah *yeah = inet_csk_ca(sk);
+        tcp_vegas_init(sk);
+        yeah->doing_reno_now = 0;
+        yeah->lastQ = 0;
+        yeah->reno_count = 2;
+        /* tcp_yeah_cong_avoid() increments fast_count and adds pkts_acked
+         * to snd_cwnd_cnt, so neither may start from leftover contents of
+         * the private area.  pkts_acked starts at 1, the same value the
+         * Scalable branch resets it to after each use.
+         */
+        yeah->fast_count = 0;
+        yeah->pkts_acked = 1;
+        /* Ensure the MD arithmetic works.  This is somewhat pedantic,
+         * since I don't think we will see a cwnd this large. :) */
+        tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+/* pkts_acked hook: remember the reported count for the Scalable increase
+ * in tcp_yeah_cong_avoid().  The value is only recorded while the
+ * connection is in TCP_CA_Open; in any other state the stored value is
+ * left untouched.
+ *
+ * @sk:         socket the ACK belongs to
+ * @pkts_acked: count supplied by the caller of the hook
+ */
+static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked)
+{
+        struct yeah *ca = inet_csk_ca(sk);
+        if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+                return;
+        ca->pkts_acked = pkts_acked;
+}
+/* Divide one 64-bit value by another and return a 64-bit quotient.
+ *
+ * When the divisor does not fit in 32 bits, both operands are shifted
+ * right by the same amount until it does, so precision shrinks as the
+ * divisor grows ("dynamic precision").  The caller must pass a non-zero
+ * divisor.
+ */
+static inline u64 div64_64(u64 dividend, u64 divisor)
+{
+        u32 div32;
+        if (divisor >> 32) {
+                /* Scale both operands down so the divisor fits in 32 bits. */
+                unsigned int scale = fls(divisor >> 32);
+                dividend >>= scale;
+                div32 = divisor >> scale;
+        } else {
+                div32 = divisor;
+        }
+        /* A plain 32-bit division is enough when the dividend is small. */
+        if (!(dividend >> 32))
+                return (u32) dividend / div32;
+        do_div(dividend, div32);
+        return dividend;
+}
+/* cong_avoid hook: grow cwnd for the current ACK and, once per RTT, run
+ * the queue estimate that selects between the Scalable ("fast") and the
+ * Reno ("slow") increase for the following ACKs.
+ *
+ * @sk:        socket being processed
+ * @ack:       ACK sequence number; compared with beg_snd_nxt to detect
+ *             the end of the current RTT
+ * @seq_rtt:   not used by this function
+ * @in_flight: passed on to tcp_is_cwnd_limited()
+ * @flag:      not used by this function
+ */
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,
+                                 u32 seq_rtt, u32 in_flight, int flag)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct yeah *yeah = inet_csk_ca(sk);
+        if (!tcp_is_cwnd_limited(sk, in_flight))
+                return;
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                tcp_slow_start(tp);
+        } else if (!yeah->doing_reno_now) {
+                /* Scalable */
+                tp->snd_cwnd_cnt+=yeah->pkts_acked;
+                if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
+                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                                tp->snd_cwnd++;
+                        tp->snd_cwnd_cnt = 0;
+                }
+                yeah->pkts_acked = 1;
+        } else {
+                /* Reno */
+                if (tp->snd_cwnd_cnt < tp->snd_cwnd)
+                        tp->snd_cwnd_cnt++;
+                if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+                        /* Honour the clamp here as well, exactly as the
+                         * Scalable branch above does.
+                         */
+                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                                tp->snd_cwnd++;
+                        tp->snd_cwnd_cnt = 0;
+                }
+        }
+        /* The key players are beg_snd_una and beg_snd_nxt.
+         *
+         * These are so named because they represent the approximate values
+         * of snd_una and snd_nxt at the beginning of the current RTT. More
+         * precisely, they represent the amount of data sent during the RTT.
+         * At the end of the RTT, when we receive an ACK for beg_snd_nxt,
+         * we will calculate that (beg_snd_nxt - beg_snd_una) outstanding
+         * bytes of data have been ACKed during the course of the RTT, giving
+         * an "actual" rate of:
+         *
+         *     (beg_snd_nxt - beg_snd_una) / (rtt duration)
+         *
+         * Unfortunately, beg_snd_una is not exactly equal to snd_una,
+         * because delayed ACKs can cover more than one segment, so they
+         * don't line up nicely with the boundaries of RTTs.
+         *
+         * Another unfortunate fact of life is that delayed ACKs delay the
+         * advance of the left edge of our send window, so that the number
+         * of bytes we send in an RTT is often less than our cwnd will allow.
+         * So we keep track of our cwnd separately, in beg_snd_cwnd.
+         */
+        if (after(ack, yeah->beg_snd_nxt)) {
+                /* We do the Vegas calculations only if we got enough RTT
+                 * samples that we can be reasonably sure that we got
+                 * at least one RTT sample that wasn't from a delayed ACK.
+                 * If we only had 2 samples total,
+                 * then that means we're getting only 1 ACK per RTT, which
+                 * means they're almost certainly delayed ACKs.
+                 * If  we have 3 samples, we should be OK.
+                 */
+                if (yeah->cntRTT > 2) {
+                        u32 rtt;
+                        u32 queue, maxqueue;
+                        /* We have enough RTT samples, so, using the Vegas
+                         * algorithm, we determine if we should increase or
+                         * decrease cwnd, and by how much.
+                         */
+                        /* Pluck out the RTT we are using for the Vegas
+                         * calculations. This is the min RTT seen during the
+                         * last RTT. Taking the min filters out the effects
+                         * of delayed ACKs, at the cost of noticing congestion
+                         * a bit later.
+                         */
+                        rtt = yeah->minRTT;
+                        /* Estimated backlog: cwnd * (rtt - baseRTT) / rtt. */
+                        queue = (u32)div64_64((u64)tp->snd_cwnd * (rtt - yeah->baseRTT), rtt);
+                        maxqueue = TCP_YEAH_ALPHA;
+                        if (queue > maxqueue ||
+                                    rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) {
+                                /* Too much backlog or delay: use the Reno
+                                 * increase and, when the backlog itself is
+                                 * over the limit, drain part of it now.
+                                 */
+                                if (queue > maxqueue && tp->snd_cwnd > yeah->reno_count) {
+                                        u32 reduction = min( queue / TCP_YEAH_GAMMA ,
+                                                         tp->snd_cwnd >> TCP_YEAH_EPSILON );
+                                        tp->snd_cwnd -= reduction;
+                                        tp->snd_cwnd = max( tp->snd_cwnd, yeah->reno_count);
+                                        tp->snd_ssthresh = tp->snd_cwnd;
+                                }
+                                if (yeah->reno_count <= 2)
+                                        yeah->reno_count = max( tp->snd_cwnd>>1, 2U);
+                                else
+                                        yeah->reno_count++;
+                                /* Saturating count of evaluations spent here. */
+                                yeah->doing_reno_now =
+                                                   min_t( u32, yeah->doing_reno_now + 1 , 0xffffff);
+                        } else {
+                                /* Backlog and delay are within limits: use
+                                 * the Scalable increase.
+                                 */
+                                yeah->fast_count++;
+                                if (yeah->fast_count > TCP_YEAH_ZETA) {
+                                        yeah->reno_count = 2;
+                                        yeah->fast_count = 0;
+                                }
+                                yeah->doing_reno_now = 0;
+                        }
+                        yeah->lastQ = queue;
+                }
+                /* Save the extent of the current window so we can use this
+                 * at the end of the next RTT.
+                 */
+                yeah->beg_snd_una  = yeah->beg_snd_nxt;
+                yeah->beg_snd_nxt  = tp->snd_nxt;
+                yeah->beg_snd_cwnd = tp->snd_cwnd;
+                /* Wipe the slate clean for the next RTT. */
+                yeah->cntRTT = 0;
+                yeah->minRTT = 0x7fffffff;
+        }
+}
+/* ssthresh hook: compute the new slow-start threshold.
+ *
+ * While fewer than TCP_YEAH_RHO consecutive evaluations have chosen the
+ * Reno increase, the reduction is the last queue estimate, bounded above
+ * by max(cwnd/2, 2) and below by cwnd >> TCP_YEAH_DELTA.  Otherwise the
+ * reduction is max(cwnd/2, 2).  The call also clears fast_count and
+ * halves reno_count (never below 2).
+ *
+ * Returns cwnd minus the reduction, but never less than 2.
+ */
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+        const struct tcp_sock *tp = tcp_sk(sk);
+        struct yeah *yeah = inet_csk_ca(sk);
+        u32 reduction;
+        if (yeah->doing_reno_now < TCP_YEAH_RHO) {
+                reduction = yeah->lastQ;
+                reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
+                reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+        } else
+                reduction = max(tp->snd_cwnd>>1,2U);
+        yeah->fast_count = 0;
+        yeah->reno_count = max(yeah->reno_count>>1, 2U);
+        /* reduction can be 2 while cwnd is 1 or 2, which would make the
+         * unsigned difference 0 or wrap around to a huge value.  Do the
+         * comparison as signed and keep the result at 2 or above.
+         */
+        return max_t(int, tp->snd_cwnd - reduction, 2);
+}
+/* Hook table registered under the name "yeah".  The YeAH-specific hooks
+ * are init, ssthresh, cong_avoid and pkts_acked; RTT sampling, state
+ * changes, cwnd events and diagnostics reuse the tcp_vegas_* helpers from
+ * tcp_yeah.h, and min_cwnd uses tcp_reno_min_cwnd.
+ */
+static struct tcp_congestion_ops tcp_yeah = {
+        .init           = tcp_yeah_init,
+        .ssthresh       = tcp_yeah_ssthresh,
+        .cong_avoid     = tcp_yeah_cong_avoid,
+        .min_cwnd       = tcp_reno_min_cwnd,
+        .rtt_sample     = tcp_vegas_rtt_calc,
+        .set_state      = tcp_vegas_state,
+        .cwnd_event     = tcp_vegas_cwnd_event,
+        .get_info       = tcp_vegas_get_info,
+        .pkts_acked     = tcp_yeah_pkts_acked,
+        .owner          = THIS_MODULE,
+        .name           = "yeah",
+};
+/* Module entry point: check that struct yeah fits in the
+ * congestion-control private area, then register the "yeah" hook table.
+ *
+ * Returns whatever tcp_register_congestion_control() returns, so that a
+ * failed registration is reported to the caller instead of being
+ * presented as success.
+ */
+static int __init tcp_yeah_register(void)
+{
+        BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+        return tcp_register_congestion_control(&tcp_yeah);
+}
+/* Module exit point: remove the "yeah" hook table that
+ * tcp_yeah_register() registered.
+ */
+static void __exit tcp_yeah_unregister(void)
+{
+        tcp_unregister_congestion_control(&tcp_yeah);
+}
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h
new file mode 100644
index 000000000000..b3255dba4e2d
--- /dev/null
+++ b/net/ipv4/tcp_yeah.h
@@ -0,0 +1,134 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+/* Vegas variables: the per-connection state used by the tcp_vegas_*
+ * helpers below, which obtain it from inet_csk_ca().
+ */
+struct vegas {
+        u32     beg_snd_nxt;    /* right edge during last RTT */
+        u32     beg_snd_una;    /* left edge  during last RTT */
+        u32     beg_snd_cwnd;   /* saves the size of the cwnd */
+        u8      doing_vegas_now;/* if true, do vegas for this RTT */
+        u16     cntRTT;         /* # of RTTs measured within last RTT */
+        u32     minRTT;         /* min of RTTs measured within last RTT (in usec) */
+        u32     baseRTT;        /* the min of all Vegas RTT measurements seen (in usec) */
+};
+/* (Re)start Vegas sampling.
+ *
+ * Vegas has to be restarted in several situations:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In each of them a Vegas calculation at the end of the first RTT would
+ * be based on stale information -- both the saved cwnd and the
+ * congestion feedback are out of date.  So the per-RTT sample state is
+ * cleared here and the calculation waits for the completion of an RTT
+ * during which ACKs were actually received.
+ */
+static inline void vegas_enable(struct sock *sk)
+{
+        struct vegas *ca = inet_csk_ca(sk);
+        /* Drop the samples gathered so far. */
+        ca->cntRTT = 0;
+        ca->minRTT = 0x7fffffff;
+        /* The next send window starts at the current snd_nxt. */
+        ca->beg_snd_nxt = tcp_sk(sk)->snd_nxt;
+        /* Sampling resumes the next time we send something. */
+        ca->doing_vegas_now = 1;
+}
+/* Clear the doing_vegas_now flag; all other Vegas state is left as is. */
+static inline void vegas_disable(struct sock *sk)
+{
+        ((struct vegas *)inet_csk_ca(sk))->doing_vegas_now = 0;
+}
+/* Reset the Vegas state completely: restart sampling and forget the
+ * base RTT learned so far by setting it back to its 0x7fffffff
+ * starting value.
+ */
+static void tcp_vegas_init(struct sock *sk)
+{
+        struct vegas *ca = inet_csk_ca(sk);
+        vegas_enable(sk);
+        ca->baseRTT = 0x7fffffff;
+}
+/* set_state hook: sample only while the connection is in TCP_CA_Open.
+ * Entering any other state switches sampling off.
+ */
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+        if (ca_state != TCP_CA_Open) {
+                vegas_disable(sk);
+                return;
+        }
+        vegas_enable(sk);
+}
+/* rtt_sample hook: feed one RTT measurement into the Vegas filters.
+ *
+ * Two minimum filters are maintained:
+ *   o minRTT, the smallest sample within the current RTT, which stands
+ *     for the current propagation delay + queuing delay (taking the
+ *     minimum is meant to avoid the effects of delayed ACKs)
+ *   o baseRTT, the smallest sample over a much longer window (forever
+ *     for now), which stands for the propagation delay
+ *
+ * @sk:    socket the sample belongs to
+ * @usrtt: the RTT sample; one is added before it is used
+ */
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
+{
+        struct vegas *ca = inet_csk_ca(sk);
+        /* Never allow zero rtt or baseRTT */
+        const u32 sample = usrtt + 1;
+        ca->cntRTT++;
+        /* Min within the current RTT. */
+        if (sample < ca->minRTT)
+                ca->minRTT = sample;
+        /* Long-term min. */
+        if (sample < ca->baseRTT)
+                ca->baseRTT = sample;
+}
+/*
+ * cwnd_event hook.
+ *
+ * If the connection is idle and we are restarting, no Vegas calculation
+ * should be made until fresh RTT samples have arrived.  On
+ * CA_EVENT_CWND_RESTART and CA_EVENT_TX_START the Vegas state is
+ * therefore reset to a clean slate; once the acks for the new flight of
+ * packets are in, calculations can resume.  All other events are
+ * ignored.
+ */
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+        switch (event) {
+        case CA_EVENT_CWND_RESTART:
+        case CA_EVENT_TX_START:
+                tcp_vegas_init(sk);
+                break;
+        default:
+                break;
+        }
+}
+/* get_info hook: extract info for Tcp socket info provided via netlink.
+ *
+ * When the INET_DIAG_VEGASINFO bit is set in @ext, a struct tcpvegas_info
+ * attribute is added to @skb and filled from the Vegas state; otherwise
+ * nothing is done.
+ *
+ * @sk:  socket being reported
+ * @ext: bit mask of requested extensions
+ * @skb: buffer that receives the attribute
+ */
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
+                               struct sk_buff *skb)
+{
+        const struct vegas *ca = inet_csk_ca(sk);
+        if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+                struct tcpvegas_info *info;
+                /* NOTE(review): __RTA_PUT presumably jumps to the
+                 * rtattr_failure label below when the attribute does not
+                 * fit in skb -- confirm against the macro definition.
+                 */
+                info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
+                                          sizeof(*info)));
+                info->tcpv_enabled = ca->doing_vegas_now;
+                info->tcpv_rttcnt = ca->cntRTT;
+                info->tcpv_rtt = ca->baseRTT;
+                info->tcpv_minrtt = ca->minRTT;
+        rtattr_failure: ;
+        }
+}

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9e8ef509c51d..dc61e6641624 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig
@@ -574,6 +574,20 @@ config TCP_CONG_VENO
574	loss packets.	574	loss packets.
575	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf	575	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
576		576
		577	config TCP_CONG_YEAH
		578	tristate "YeAH TCP"
		579	depends on EXPERIMENTAL
		580	default n
		581	---help---
		582	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
		583	algorithm, which uses a mixed loss/delay approach to compute the
		584	congestion window. Its design goals target high efficiency,
		585	internal, RTT and Reno fairness, resilience to link loss while
		586	keeping network elements load as low as possible.
		587
		588	For further details look here:
		589	http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
		590
577	choice	591	choice
578	prompt "Default TCP congestion control"	592	prompt "Default TCP congestion control"
579	default DEFAULT_CUBIC	593	default DEFAULT_CUBIC


diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 7a068626feea..eeb94d5cac96 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
49	obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o	49	obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
50	obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o	50	obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
51	obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o	51	obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
		52	obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
52	obj-$(CONFIG_NETLABEL) += cipso_ipv4.o	53	obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
53		54
54	obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \	55	obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \


diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c new file mode 100644 index 000000000000..815e020e98fe --- /dev/null +++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,288 @@
		1	/*
		2	*
		3	* YeAH TCP
		4	*
		5	* For further details look at:
		6	* http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
		7	*
		8	*/
		9
		10	#include "tcp_yeah.h"
		11
		12	/* Default values of the Vegas variables, in fixed-point representation
		13	* with V_PARAM_SHIFT bits to the right of the binary point.
		14	*/
		15	#define V_PARAM_SHIFT 1
		16
		17	#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck
		18	#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt
		19	#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
		20	#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
		21	#define TCP_YEAH_PHY 8 //lin maximum delta from base
		22	#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
		23	#define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count
		24
		25	#define TCP_SCALABLE_AI_CNT 100U
		26
		27	/* YeAH variables */
		28	struct yeah {
		29	/* Vegas */
		30	u32 beg_snd_nxt; /* right edge during last RTT */
		31	u32 beg_snd_una; /* left edge during last RTT */
		32	u32 beg_snd_cwnd; /* saves the size of the cwnd */
		33	u8 doing_vegas_now;/* if true, do vegas for this RTT */
		34	u16 cntRTT; /* # of RTTs measured within last RTT */
		35	u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
		36	u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
		37
		38	/* YeAH */
		39	u32 lastQ;
		40	u32 doing_reno_now;
		41
		42	u32 reno_count;
		43	u32 fast_count;
		44
		45	u32 pkts_acked;
		46	};
		47
		48	static void tcp_yeah_init(struct sock *sk)
		49	{
		50	struct tcp_sock *tp = tcp_sk(sk);
		51	struct yeah *yeah = inet_csk_ca(sk);
		52
		53	tcp_vegas_init(sk);
		54
		55	yeah->doing_reno_now = 0;
		56	yeah->lastQ = 0;
		57
		58	yeah->reno_count = 2;
		59
		60	/* Ensure the MD arithmetic works. This is somewhat pedantic,
		61	* since I don't think we will see a cwnd this large. :) */
		62	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
		63
		64	}
		65
		66
		67	static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked)
		68	{
		69	const struct inet_connection_sock *icsk = inet_csk(sk);
		70	struct yeah *yeah = inet_csk_ca(sk);
		71
		72	if (icsk->icsk_ca_state == TCP_CA_Open)
		73	yeah->pkts_acked = pkts_acked;
		74	}
		75
		76	/* 64bit divisor, dividend and result. dynamic precision */
		77	static inline u64 div64_64(u64 dividend, u64 divisor)
		78	{
		79	u32 d = divisor;
		80
		81	if (divisor > 0xffffffffULL) {
		82	unsigned int shift = fls(divisor >> 32);
		83
		84	d = divisor >> shift;
		85	dividend >>= shift;
		86	}
		87
		88	/* avoid 64 bit division if possible */
		89	if (dividend >> 32)
		90	do_div(dividend, d);
		91	else
		92	dividend = (u32) dividend / d;
		93
		94	return dividend;
		95	}
		96
		97	static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,
		98	u32 seq_rtt, u32 in_flight, int flag)
		99	{
		100	struct tcp_sock *tp = tcp_sk(sk);
		101	struct yeah *yeah = inet_csk_ca(sk);
		102
		103	if (!tcp_is_cwnd_limited(sk, in_flight))
		104	return;
		105
		106	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		107	tcp_slow_start(tp);
		108	} else if (!yeah->doing_reno_now) {
		109	/* Scalable */
		110
		111	tp->snd_cwnd_cnt+=yeah->pkts_acked;
		112	if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
		113	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		114	tp->snd_cwnd++;
		115	tp->snd_cwnd_cnt = 0;
		116	}
		117
		118	yeah->pkts_acked = 1;
		119
		120	} else {
		121	/* Reno */
		122
		123	if (tp->snd_cwnd_cnt < tp->snd_cwnd)
		124	tp->snd_cwnd_cnt++;
		125
		126	if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		127	tp->snd_cwnd++;
		128	tp->snd_cwnd_cnt = 0;
		129	}
		130	}
		131
		132	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
		133	*
		134	* These are so named because they represent the approximate values
		135	* of snd_una and snd_nxt at the beginning of the current RTT. More
		136	* precisely, they represent the amount of data sent during the RTT.
		137	* At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
		138	* we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
		139	* bytes of data have been ACKed during the course of the RTT, giving
		140	* an "actual" rate of:
		141	*
		142	* (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
		143	*
		144	* Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
		145	* because delayed ACKs can cover more than one segment, so they
		146	* don't line up nicely with the boundaries of RTTs.
		147	*
		148	* Another unfortunate fact of life is that delayed ACKs delay the
		149	* advance of the left edge of our send window, so that the number
		150	* of bytes we send in an RTT is often less than our cwnd will allow.
		151	* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
		152	*/
		153
		154	if (after(ack, yeah->beg_snd_nxt)) {
		155
		156	/* We do the Vegas calculations only if we got enough RTT
		157	* samples that we can be reasonably sure that we got
		158	* at least one RTT sample that wasn't from a delayed ACK.
		159	* If we only had 2 samples total,
		160	* then that means we're getting only 1 ACK per RTT, which
		161	* means they're almost certainly delayed ACKs.
		162	* If we have 3 samples, we should be OK.
		163	*/
		164
		165	if (yeah->cntRTT > 2) {
		166	u32 rtt;
		167	u32 queue, maxqueue;
		168
		169	/* We have enough RTT samples, so, using the Vegas
		170	* algorithm, we determine if we should increase or
		171	* decrease cwnd, and by how much.
		172	*/
		173
		174	/* Pluck out the RTT we are using for the Vegas
		175	* calculations. This is the min RTT seen during the
		176	* last RTT. Taking the min filters out the effects
		177	* of delayed ACKs, at the cost of noticing congestion
		178	* a bit later.
		179	*/
		180	rtt = yeah->minRTT;
		181
		182	queue = (u32)div64_64((u64)tp->snd_cwnd * (rtt - yeah->baseRTT), rtt);
		183
		184	maxqueue = TCP_YEAH_ALPHA;
		185
		186	if (queue > maxqueue \|\|
		187	rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) {
		188
		189	if (queue > maxqueue && tp->snd_cwnd > yeah->reno_count) {
		190	u32 reduction = min( queue / TCP_YEAH_GAMMA ,
		191	tp->snd_cwnd >> TCP_YEAH_EPSILON );
		192
		193	tp->snd_cwnd -= reduction;
		194
		195	tp->snd_cwnd = max( tp->snd_cwnd, yeah->reno_count);
		196
		197	tp->snd_ssthresh = tp->snd_cwnd;
		198	}
		199
		200	if (yeah->reno_count <= 2)
		201	yeah->reno_count = max( tp->snd_cwnd>>1, 2U);
		202	else
		203	yeah->reno_count++;
		204
		205	yeah->doing_reno_now =
		206	min_t( u32, yeah->doing_reno_now + 1 , 0xffffff);
		207
		208	} else {
		209	yeah->fast_count++;
		210
		211	if (yeah->fast_count > TCP_YEAH_ZETA) {
		212	yeah->reno_count = 2;
		213	yeah->fast_count = 0;
		214	}
		215
		216	yeah->doing_reno_now = 0;
		217	}
		218
		219	yeah->lastQ = queue;
		220
		221	}
		222
		223	/* Save the extent of the current window so we can use this
		224	* at the end of the next RTT.
		225	*/
		226	yeah->beg_snd_una = yeah->beg_snd_nxt;
		227	yeah->beg_snd_nxt = tp->snd_nxt;
		228	yeah->beg_snd_cwnd = tp->snd_cwnd;
		229
		230	/* Wipe the slate clean for the next RTT. */
		231	yeah->cntRTT = 0;
		232	yeah->minRTT = 0x7fffffff;
		233	}
		234	}
		235
		236	static u32 tcp_yeah_ssthresh(struct sock *sk) {
		237	const struct tcp_sock *tp = tcp_sk(sk);
		238	struct yeah *yeah = inet_csk_ca(sk);
		239	u32 reduction;
		240
		241	if (yeah->doing_reno_now < TCP_YEAH_RHO) {
		242	reduction = yeah->lastQ;
		243
		244	reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
		245
		246	reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
		247	} else
		248	reduction = max(tp->snd_cwnd>>1,2U);
		249
		250	yeah->fast_count = 0;
		251	yeah->reno_count = max(yeah->reno_count>>1, 2U);
		252
		253	return tp->snd_cwnd - reduction;
		254	}
		255
		256	static struct tcp_congestion_ops tcp_yeah = {
		257	.init = tcp_yeah_init,
		258	.ssthresh = tcp_yeah_ssthresh,
		259	.cong_avoid = tcp_yeah_cong_avoid,
		260	.min_cwnd = tcp_reno_min_cwnd,
		261	.rtt_sample = tcp_vegas_rtt_calc,
		262	.set_state = tcp_vegas_state,
		263	.cwnd_event = tcp_vegas_cwnd_event,
		264	.get_info = tcp_vegas_get_info,
		265	.pkts_acked = tcp_yeah_pkts_acked,
		266
		267	.owner = THIS_MODULE,
		268	.name = "yeah",
		269	};
		270
		271	static int __init tcp_yeah_register(void)
		272	{
		273	BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
		274	tcp_register_congestion_control(&tcp_yeah);
		275	return 0;
		276	}
		277
		278	static void __exit tcp_yeah_unregister(void)
		279	{
		280	tcp_unregister_congestion_control(&tcp_yeah);
		281	}
		282
		283	module_init(tcp_yeah_register);
		284	module_exit(tcp_yeah_unregister);
		285
		286	MODULE_AUTHOR("Angelo P. Castellani");
		287	MODULE_LICENSE("GPL");
		288	MODULE_DESCRIPTION("YeAH TCP");


diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h new file mode 100644 index 000000000000..b3255dba4e2d --- /dev/null +++ b/net/ipv4/tcp_yeah.h
@@ -0,0 +1,134 @@
		1	#include <linux/mm.h>
		2	#include <linux/module.h>
		3	#include <linux/skbuff.h>
		4	#include <linux/inet_diag.h>
		5
		6	#include <net/tcp.h>
		7
		8	/* Vegas variables */
		9	struct vegas {
		10	u32 beg_snd_nxt; /* right edge during last RTT */
		11	u32 beg_snd_una; /* left edge during last RTT */
		12	u32 beg_snd_cwnd; /* saves the size of the cwnd */
		13	u8 doing_vegas_now;/* if true, do vegas for this RTT */
		14	u16 cntRTT; /* # of RTTs measured within last RTT */
		15	u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
		16	u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
		17	};
		18
		19	/* There are several situations when we must "re-start" Vegas:
		20	*
		21	* o when a connection is established
		22	* o after an RTO
		23	* o after fast recovery
		24	* o when we send a packet and there is no outstanding
		25	* unacknowledged data (restarting an idle connection)
		26	*
		27	* In these circumstances we cannot do a Vegas calculation at the
		28	* end of the first RTT, because any calculation we do is using
		29	* stale info -- both the saved cwnd and congestion feedback are
		30	* stale.
		31	*
		32	* Instead we must wait until the completion of an RTT during
		33	* which we actually receive ACKs.
		34	*/
		35	static inline void vegas_enable(struct sock *sk)
		36	{
		37	const struct tcp_sock *tp = tcp_sk(sk);
		38	struct vegas *vegas = inet_csk_ca(sk);
		39
		40	/* Begin taking Vegas samples next time we send something. */
		41	vegas->doing_vegas_now = 1;
		42
		43	/* Set the beginning of the next send window. */
		44	vegas->beg_snd_nxt = tp->snd_nxt;
		45
		46	vegas->cntRTT = 0;
		47	vegas->minRTT = 0x7fffffff;
		48	}
		49
		50	/* Stop taking Vegas samples for now. */
		51	static inline void vegas_disable(struct sock *sk)
		52	{
		53	struct vegas *vegas = inet_csk_ca(sk);
		54
		55	vegas->doing_vegas_now = 0;
		56	}
		57
		58	static void tcp_vegas_init(struct sock *sk)
		59	{
		60	struct vegas *vegas = inet_csk_ca(sk);
		61
		62	vegas->baseRTT = 0x7fffffff;
		63	vegas_enable(sk);
		64	}
		65
		66	static void tcp_vegas_state(struct sock *sk, u8 ca_state)
		67	{
		68
		69	if (ca_state == TCP_CA_Open)
		70	vegas_enable(sk);
		71	else
		72	vegas_disable(sk);
		73	}
		74
		75	/* Do RTT sampling needed for Vegas.
		76	* Basically we:
		77	* o min-filter RTT samples from within an RTT to get the current
		78	* propagation delay + queuing delay (we are min-filtering to try to
		79	* avoid the effects of delayed ACKs)
		80	* o min-filter RTT samples from a much longer window (forever for now)
		81	* to find the propagation delay (baseRTT)
		82	*/
		83	static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
		84	{
		85	struct vegas *vegas = inet_csk_ca(sk);
		86	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
		87
		88	/* Filter to find propagation delay: */
		89	if (vrtt < vegas->baseRTT)
		90	vegas->baseRTT = vrtt;
		91
		92	/* Find the min RTT during the last RTT to find
		93	* the current prop. delay + queuing delay:
		94	*/
		95	vegas->minRTT = min(vegas->minRTT, vrtt);
		96	vegas->cntRTT++;
		97	}
		98
		99	/*
		100	* If the connection is idle and we are restarting,
		101	* then we don't want to do any Vegas calculations
		102	* until we get fresh RTT samples. So when we
		103	* restart, we reset our Vegas state to a clean
		104	* slate. After we get acks for this flight of
		105	* packets, _then_ we can make Vegas calculations
		106	* again.
		107	*/
		108	static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
		109	{
		110	if (event == CA_EVENT_CWND_RESTART \|\|
		111	event == CA_EVENT_TX_START)
		112	tcp_vegas_init(sk);
		113	}
		114
		115	/* Extract info for Tcp socket info provided via netlink. */
		116	static void tcp_vegas_get_info(struct sock *sk, u32 ext,
		117	struct sk_buff *skb)
		118	{
		119	const struct vegas *ca = inet_csk_ca(sk);
		120	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		121	struct tcpvegas_info *info;
		122
		123	info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
		124	sizeof(*info)));
		125
		126	info->tcpv_enabled = ca->doing_vegas_now;
		127	info->tcpv_rttcnt = ca->cntRTT;
		128	info->tcpv_rtt = ca->baseRTT;
		129	info->tcpv_minrtt = ca->minRTT;
		130	rtattr_failure: ;
		131	}
		132	}
		133
		134