aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAngelo P. Castellani <angelo.castellani@gmail.con>2007-02-22 03:23:05 -0500
committerDavid S. Miller <davem@sunset.davemloft.net>2007-04-26 01:23:18 -0400
commit5ef814753eb810d900fbd77af7c87f6d04f0e551 (patch)
tree5ef93769f33ea5676588aab48179a0c614e6275c
parent127af0c44fc916908abd145914d65b9fe598bcd7 (diff)
[TCP] YeAH-TCP: algorithm implementation
YeAH-TCP is a sender-side high-speed enabled TCP congestion control algorithm, which uses a mixed loss/delay approach to compute the congestion window. Its design goals target high efficiency, internal, RTT and Reno fairness, resilience to link loss while keeping network elements load as low as possible. For further details look here: http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf Signed-off-by: Angelo P. Castellani <angelo.castellani@gmail.con> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/Kconfig14
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/tcp_yeah.c288
-rw-r--r--net/ipv4/tcp_yeah.h134
4 files changed, 437 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9e8ef509c51d..dc61e6641624 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -574,6 +574,20 @@ config TCP_CONG_VENO
574 loss packets. 574 loss packets.
575 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 575 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
576 576
577config TCP_CONG_YEAH
578 tristate "YeAH TCP"
579 depends on EXPERIMENTAL
580 default n
581 ---help---
582 YeAH-TCP is a sender-side high-speed enabled TCP congestion control
583 algorithm, which uses a mixed loss/delay approach to compute the
584 congestion window. Its design goals target high efficiency,
585 internal, RTT and Reno fairness, resilience to link loss while
586 keeping network elements load as low as possible.
587
588 For further details look here:
589 http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
590
577choice 591choice
578 prompt "Default TCP congestion control" 592 prompt "Default TCP congestion control"
579 default DEFAULT_CUBIC 593 default DEFAULT_CUBIC
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7a068626feea..eeb94d5cac96 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
49obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o 49obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
50obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o 50obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
51obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 51obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
52obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
52obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
53 54
54obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000000..815e020e98fe
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,288 @@
1/*
2 *
3 * YeAH TCP
4 *
5 * For further details look at:
6 * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
7 *
8 */
9
10#include "tcp_yeah.h"
11
12/* Fixed-point shift carried over from the Vegas parameter definitions
13 * (V_PARAM_SHIFT bits to the right of the binary point). NOTE(review): no
14 * code visible in this file references V_PARAM_SHIFT -- confirm it is needed. */
15#define V_PARAM_SHIFT 1
16
17#define TCP_YEAH_ALPHA 80 /* lin (plain value): estimated number of packets queued at the bottleneck above which cwnd is reduced */
18#define TCP_YEAH_GAMMA 1 /* lin (divisor): fraction of the queue estimate to be removed per RTT (queue / GAMMA) */
19#define TCP_YEAH_DELTA 3 /* log (shift count): minimum fraction of cwnd to be removed on loss (cwnd >> DELTA) */
20#define TCP_YEAH_EPSILON 1 /* log (shift count): maximum fraction of cwnd to be removed on early decongestion (cwnd >> EPSILON) */
21#define TCP_YEAH_PHY 8 /* lin (divisor): maximum RTT excess over baseRTT, expressed as baseRTT / PHY */
22#define TCP_YEAH_RHO 16 /* lin: minimum number of consecutive RTTs in Reno mode to consider competition on loss */
23#define TCP_YEAH_ZETA 50 /* lin: fast_count value that must be exceeded before reno_count is reset */
24
25#define TCP_SCALABLE_AI_CNT 100U /* upper bound on the ACK counter threshold for a +1 cwnd step in the Scalable-style branch */
26
27/* YeAH variables: per-connection private state obtained via inet_csk_ca().
 * The leading "Vegas" members repeat struct vegas member for member because
 * the tcp_vegas_* helpers access this same memory through a struct vegas
 * pointer; the two layouts must stay in sync. */
28struct yeah {
29 /* Vegas (must match struct vegas) */
30 u32 beg_snd_nxt; /* right edge during last RTT */
31 u32 beg_snd_una; /* left edge during last RTT */
32 u32 beg_snd_cwnd; /* saves the size of the cwnd */
33 u8 doing_vegas_now;/* if true, do vegas for this RTT */
34 u16 cntRTT; /* # of RTT samples collected since the last per-RTT reset */
35 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
36 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
37
38 /* YeAH */
39 u32 lastQ; /* queue estimate from the most recent per-RTT evaluation */
40 u32 doing_reno_now; /* consecutive per-RTT evaluations in Reno mode; 0 selects the Scalable-style increase */
41
42 u32 reno_count; /* lower bound applied to cwnd on early decongestion; grows in Reno mode, halved on loss */
43 u32 fast_count; /* per-RTT evaluations spent outside Reno mode since the last reset */
44
45 u32 pkts_acked; /* packet count recorded by the pkts_acked hook while in TCP_CA_Open */
46};
47
48static void tcp_yeah_init(struct sock *sk)
49{
50 struct tcp_sock *tp = tcp_sk(sk);
51 struct yeah *yeah = inet_csk_ca(sk);
52
53 tcp_vegas_init(sk);
54
55 yeah->doing_reno_now = 0;
56 yeah->lastQ = 0;
57
58 yeah->reno_count = 2;
59
60 /* Ensure the MD arithmetic works. This is somewhat pedantic,
61 * since I don't think we will see a cwnd this large. :) */
62 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
63
64}
65
66
67static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked)
68{
69 const struct inet_connection_sock *icsk = inet_csk(sk);
70 struct yeah *yeah = inet_csk_ca(sk);
71
72 if (icsk->icsk_ca_state == TCP_CA_Open)
73 yeah->pkts_acked = pkts_acked;
74}
75
76/* 64bit divisor, dividend and result. dynamic precision */
77static inline u64 div64_64(u64 dividend, u64 divisor)
78{
79 u32 d = divisor;
80
81 if (divisor > 0xffffffffULL) {
82 unsigned int shift = fls(divisor >> 32);
83
84 d = divisor >> shift;
85 dividend >>= shift;
86 }
87
88 /* avoid 64 bit division if possible */
89 if (dividend >> 32)
90 do_div(dividend, d);
91 else
92 dividend = (u32) dividend / d;
93
94 return dividend;
95}
96
97static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,
98 u32 seq_rtt, u32 in_flight, int flag)
99{
100 struct tcp_sock *tp = tcp_sk(sk);
101 struct yeah *yeah = inet_csk_ca(sk);
102
103 if (!tcp_is_cwnd_limited(sk, in_flight))
104 return;
105
106 if (tp->snd_cwnd <= tp->snd_ssthresh) {
107 tcp_slow_start(tp);
108 } else if (!yeah->doing_reno_now) {
109 /* Scalable */
110
111 tp->snd_cwnd_cnt+=yeah->pkts_acked;
112 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
113 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
114 tp->snd_cwnd++;
115 tp->snd_cwnd_cnt = 0;
116 }
117
118 yeah->pkts_acked = 1;
119
120 } else {
121 /* Reno */
122
123 if (tp->snd_cwnd_cnt < tp->snd_cwnd)
124 tp->snd_cwnd_cnt++;
125
126 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
127 tp->snd_cwnd++;
128 tp->snd_cwnd_cnt = 0;
129 }
130 }
131
132 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
133 *
134 * These are so named because they represent the approximate values
135 * of snd_una and snd_nxt at the beginning of the current RTT. More
136 * precisely, they represent the amount of data sent during the RTT.
137 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
138 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
139 * bytes of data have been ACKed during the course of the RTT, giving
140 * an "actual" rate of:
141 *
142 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
143 *
144 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
145 * because delayed ACKs can cover more than one segment, so they
146 * don't line up yeahly with the boundaries of RTTs.
147 *
148 * Another unfortunate fact of life is that delayed ACKs delay the
149 * advance of the left edge of our send window, so that the number
150 * of bytes we send in an RTT is often less than our cwnd will allow.
151 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
152 */
153
154 if (after(ack, yeah->beg_snd_nxt)) {
155
156 /* We do the Vegas calculations only if we got enough RTT
157 * samples that we can be reasonably sure that we got
158 * at least one RTT sample that wasn't from a delayed ACK.
159 * If we only had 2 samples total,
160 * then that means we're getting only 1 ACK per RTT, which
161 * means they're almost certainly delayed ACKs.
162 * If we have 3 samples, we should be OK.
163 */
164
165 if (yeah->cntRTT > 2) {
166 u32 rtt;
167 u32 queue, maxqueue;
168
169 /* We have enough RTT samples, so, using the Vegas
170 * algorithm, we determine if we should increase or
171 * decrease cwnd, and by how much.
172 */
173
174 /* Pluck out the RTT we are using for the Vegas
175 * calculations. This is the min RTT seen during the
176 * last RTT. Taking the min filters out the effects
177 * of delayed ACKs, at the cost of noticing congestion
178 * a bit later.
179 */
180 rtt = yeah->minRTT;
181
182 queue = (u32)div64_64((u64)tp->snd_cwnd * (rtt - yeah->baseRTT), rtt);
183
184 maxqueue = TCP_YEAH_ALPHA;
185
186 if (queue > maxqueue ||
187 rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) {
188
189 if (queue > maxqueue && tp->snd_cwnd > yeah->reno_count) {
190 u32 reduction = min( queue / TCP_YEAH_GAMMA ,
191 tp->snd_cwnd >> TCP_YEAH_EPSILON );
192
193 tp->snd_cwnd -= reduction;
194
195 tp->snd_cwnd = max( tp->snd_cwnd, yeah->reno_count);
196
197 tp->snd_ssthresh = tp->snd_cwnd;
198 }
199
200 if (yeah->reno_count <= 2)
201 yeah->reno_count = max( tp->snd_cwnd>>1, 2U);
202 else
203 yeah->reno_count++;
204
205 yeah->doing_reno_now =
206 min_t( u32, yeah->doing_reno_now + 1 , 0xffffff);
207
208 } else {
209 yeah->fast_count++;
210
211 if (yeah->fast_count > TCP_YEAH_ZETA) {
212 yeah->reno_count = 2;
213 yeah->fast_count = 0;
214 }
215
216 yeah->doing_reno_now = 0;
217 }
218
219 yeah->lastQ = queue;
220
221 }
222
223 /* Save the extent of the current window so we can use this
224 * at the end of the next RTT.
225 */
226 yeah->beg_snd_una = yeah->beg_snd_nxt;
227 yeah->beg_snd_nxt = tp->snd_nxt;
228 yeah->beg_snd_cwnd = tp->snd_cwnd;
229
230 /* Wipe the slate clean for the next RTT. */
231 yeah->cntRTT = 0;
232 yeah->minRTT = 0x7fffffff;
233 }
234}
235
236static u32 tcp_yeah_ssthresh(struct sock *sk) {
237 const struct tcp_sock *tp = tcp_sk(sk);
238 struct yeah *yeah = inet_csk_ca(sk);
239 u32 reduction;
240
241 if (yeah->doing_reno_now < TCP_YEAH_RHO) {
242 reduction = yeah->lastQ;
243
244 reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
245
246 reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
247 } else
248 reduction = max(tp->snd_cwnd>>1,2U);
249
250 yeah->fast_count = 0;
251 yeah->reno_count = max(yeah->reno_count>>1, 2U);
252
253 return tp->snd_cwnd - reduction;
254}
255
256static struct tcp_congestion_ops tcp_yeah = {
257 .init = tcp_yeah_init,
258 .ssthresh = tcp_yeah_ssthresh,
259 .cong_avoid = tcp_yeah_cong_avoid,
260 .min_cwnd = tcp_reno_min_cwnd,
261 .rtt_sample = tcp_vegas_rtt_calc,
262 .set_state = tcp_vegas_state,
263 .cwnd_event = tcp_vegas_cwnd_event,
264 .get_info = tcp_vegas_get_info,
265 .pkts_acked = tcp_yeah_pkts_acked,
266
267 .owner = THIS_MODULE,
268 .name = "yeah",
269};
270
271static int __init tcp_yeah_register(void)
272{
273 BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
274 tcp_register_congestion_control(&tcp_yeah);
275 return 0;
276}
277
278static void __exit tcp_yeah_unregister(void)
279{
280 tcp_unregister_congestion_control(&tcp_yeah);
281}
282
283module_init(tcp_yeah_register);
284module_exit(tcp_yeah_unregister);
285
286MODULE_AUTHOR("Angelo P. Castellani");
287MODULE_LICENSE("GPL");
288MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h
new file mode 100644
index 000000000000..b3255dba4e2d
--- /dev/null
+++ b/net/ipv4/tcp_yeah.h
@@ -0,0 +1,134 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/skbuff.h>
4#include <linux/inet_diag.h>
5
6#include <net/tcp.h>
7
8/* Vegas variables: per-connection state that the helpers in this header
 * read and write through the pointer returned by inet_csk_ca(). */
9struct vegas {
10 u32 beg_snd_nxt; /* right edge during last RTT */
11 u32 beg_snd_una; /* left edge during last RTT */
12 u32 beg_snd_cwnd; /* saves the size of the cwnd */
13 u8 doing_vegas_now;/* if true, do vegas for this RTT; set by vegas_enable(), cleared by vegas_disable() */
14 u16 cntRTT; /* # of RTT samples taken since the last reset */
15 u32 minRTT; /* min of RTTs measured within last RTT (in usec, stored as sample + 1) */
16 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec, stored as sample + 1) */
17};
18
19/* There are several situations when we must "re-start" Vegas:
20 *
21 * o when a connection is established
22 * o after an RTO
23 * o after fast recovery
24 * o when we send a packet and there is no outstanding
25 * unacknowledged data (restarting an idle connection)
26 *
27 * In these circumstances we cannot do a Vegas calculation at the
28 * end of the first RTT, because any calculation we do is using
29 * stale info -- both the saved cwnd and congestion feedback are
30 * stale.
31 *
32 * Instead we must wait until the completion of an RTT during
33 * which we actually receive ACKs.
34 */
35static inline void vegas_enable(struct sock *sk)
36{
37 const struct tcp_sock *tp = tcp_sk(sk);
38 struct vegas *vegas = inet_csk_ca(sk);
39
40 /* Begin taking Vegas samples next time we send something. */
41 vegas->doing_vegas_now = 1;
42
43 /* Set the beginning of the next send window. */
44 vegas->beg_snd_nxt = tp->snd_nxt;
45
46 vegas->cntRTT = 0;
47 vegas->minRTT = 0x7fffffff;
48}
49
50/* Stop taking Vegas samples for now. */
51static inline void vegas_disable(struct sock *sk)
52{
53 struct vegas *vegas = inet_csk_ca(sk);
54
55 vegas->doing_vegas_now = 0;
56}
57
58static void tcp_vegas_init(struct sock *sk)
59{
60 struct vegas *vegas = inet_csk_ca(sk);
61
62 vegas->baseRTT = 0x7fffffff;
63 vegas_enable(sk);
64}
65
66static void tcp_vegas_state(struct sock *sk, u8 ca_state)
67{
68
69 if (ca_state == TCP_CA_Open)
70 vegas_enable(sk);
71 else
72 vegas_disable(sk);
73}
74
75/* Do RTT sampling needed for Vegas.
76 * Basically we:
77 * o min-filter RTT samples from within an RTT to get the current
78 * propagation delay + queuing delay (we are min-filtering to try to
79 * avoid the effects of delayed ACKs)
80 * o min-filter RTT samples from a much longer window (forever for now)
81 * to find the propagation delay (baseRTT)
82 */
83static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
84{
85 struct vegas *vegas = inet_csk_ca(sk);
86 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
87
88 /* Filter to find propagation delay: */
89 if (vrtt < vegas->baseRTT)
90 vegas->baseRTT = vrtt;
91
92 /* Find the min RTT during the last RTT to find
93 * the current prop. delay + queuing delay:
94 */
95 vegas->minRTT = min(vegas->minRTT, vrtt);
96 vegas->cntRTT++;
97}
98
99/*
100 * If the connection is idle and we are restarting,
101 * then we don't want to do any Vegas calculations
102 * until we get fresh RTT samples. So when we
103 * restart, we reset our Vegas state to a clean
104 * slate. After we get acks for this flight of
105 * packets, _then_ we can make Vegas calculations
106 * again.
107 */
108static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
109{
110 if (event == CA_EVENT_CWND_RESTART ||
111 event == CA_EVENT_TX_START)
112 tcp_vegas_init(sk);
113}
114
115/* Extract info for Tcp socket info provided via netlink. */
116static void tcp_vegas_get_info(struct sock *sk, u32 ext,
117 struct sk_buff *skb)
118{
119 const struct vegas *ca = inet_csk_ca(sk);
120 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
121 struct tcpvegas_info *info;
122
123 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
124 sizeof(*info)));
125
126 info->tcpv_enabled = ca->doing_vegas_now;
127 info->tcpv_rttcnt = ca->cntRTT;
128 info->tcpv_rtt = ca->baseRTT;
129 info->tcpv_minrtt = ca->minRTT;
130 rtattr_failure: ;
131 }
132}
133
134