aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDoug Leith <doug.leith@nuim.ie>2008-12-09 03:13:04 -0500
committerDavid S. Miller <davem@davemloft.net>2008-12-09 03:13:04 -0500
commit8d3a564da34e5844aca4f991b73f8ca512246b23 (patch)
tree9cff1e4a8c5feb4c76a360e782c3598355d78492
parent8c83f80b2d335176f72d8729fe1dfe19c5812cb1 (diff)
tcp: tcp_vegas cong avoid fix
This patch addresses a book-keeping issue in tcp_vegas.c. At present tcp_vegas does separate book-keeping of cwnd based on packet sequence numbers. A mismatch can develop between this book-keeping and tp->snd_cwnd due, for example, to delayed acks acking multiple packets. When vegas transitions to reno operation (e.g. following loss), then this mismatch leads to incorrect behaviour (akin to a cwnd backoff). This seems mostly to affect operation at low cwnds where delayed acking can lead to a significant fraction of cwnd being covered by a single ack, leading to the book-keeping mismatch. This patch modifies the congestion avoidance update to avoid the need for separate book-keeping while leaving vegas congestion avoidance functionally unchanged. A secondary advantage of this modification is that the use of fixed-point (via V_PARAM_SHIFT) and 64 bit arithmetic is no longer necessary, simplifying the code. Some example test measurements with the patched code (confirming no functional change in the congestion avoidance algorithm) can be seen at: http://www.hamilton.ie/doug/vegaspatch/ Signed-off-by: Doug Leith <doug.leith@nuim.ie> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/tcp_vegas.c80
1 files changed, 10 insertions, 70 deletions
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 7cd22262de3a..a453aac91bd3 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -40,18 +40,14 @@
40 40
41#include "tcp_vegas.h" 41#include "tcp_vegas.h"
42 42
43/* Default values of the Vegas variables, in fixed-point representation 43static int alpha = 2;
44 * with V_PARAM_SHIFT bits to the right of the binary point. 44static int beta = 4;
45 */ 45static int gamma = 1;
46#define V_PARAM_SHIFT 1
47static int alpha = 2<<V_PARAM_SHIFT;
48static int beta = 4<<V_PARAM_SHIFT;
49static int gamma = 1<<V_PARAM_SHIFT;
50 46
51module_param(alpha, int, 0644); 47module_param(alpha, int, 0644);
52MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)"); 48MODULE_PARM_DESC(alpha, "lower bound of packets in network");
53module_param(beta, int, 0644); 49module_param(beta, int, 0644);
54MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)"); 50MODULE_PARM_DESC(beta, "upper bound of packets in network");
55module_param(gamma, int, 0644); 51module_param(gamma, int, 0644);
56MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); 52MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
57 53
@@ -172,49 +168,13 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
172 return; 168 return;
173 } 169 }
174 170
175 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
176 *
177 * These are so named because they represent the approximate values
178 * of snd_una and snd_nxt at the beginning of the current RTT. More
179 * precisely, they represent the amount of data sent during the RTT.
180 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
181 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
182 * bytes of data have been ACKed during the course of the RTT, giving
183 * an "actual" rate of:
184 *
185 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
186 *
187 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
188 * because delayed ACKs can cover more than one segment, so they
189 * don't line up nicely with the boundaries of RTTs.
190 *
191 * Another unfortunate fact of life is that delayed ACKs delay the
192 * advance of the left edge of our send window, so that the number
193 * of bytes we send in an RTT is often less than our cwnd will allow.
194 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
195 */
196
197 if (after(ack, vegas->beg_snd_nxt)) { 171 if (after(ack, vegas->beg_snd_nxt)) {
198 /* Do the Vegas once-per-RTT cwnd adjustment. */ 172 /* Do the Vegas once-per-RTT cwnd adjustment. */
199 u32 old_wnd, old_snd_cwnd;
200
201
202 /* Here old_wnd is essentially the window of data that was
203 * sent during the previous RTT, and has all
204 * been acknowledged in the course of the RTT that ended
205 * with the ACK we just received. Likewise, old_snd_cwnd
206 * is the cwnd during the previous RTT.
207 */
208 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
209 tp->mss_cache;
210 old_snd_cwnd = vegas->beg_snd_cwnd;
211 173
212 /* Save the extent of the current window so we can use this 174 /* Save the extent of the current window so we can use this
213 * at the end of the next RTT. 175 * at the end of the next RTT.
214 */ 176 */
215 vegas->beg_snd_una = vegas->beg_snd_nxt;
216 vegas->beg_snd_nxt = tp->snd_nxt; 177 vegas->beg_snd_nxt = tp->snd_nxt;
217 vegas->beg_snd_cwnd = tp->snd_cwnd;
218 178
219 /* We do the Vegas calculations only if we got enough RTT 179 /* We do the Vegas calculations only if we got enough RTT
220 * samples that we can be reasonably sure that we got 180 * samples that we can be reasonably sure that we got
@@ -252,22 +212,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
252 * 212 *
253 * This is: 213 * This is:
254 * (actual rate in segments) * baseRTT 214 * (actual rate in segments) * baseRTT
255 * We keep it as a fixed point number with
256 * V_PARAM_SHIFT bits to the right of the binary point.
257 */ 215 */
258 target_cwnd = ((u64)old_wnd * vegas->baseRTT); 216 target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
259 target_cwnd <<= V_PARAM_SHIFT;
260 do_div(target_cwnd, rtt);
261 217
262 /* Calculate the difference between the window we had, 218 /* Calculate the difference between the window we had,
263 * and the window we would like to have. This quantity 219 * and the window we would like to have. This quantity
264 * is the "Diff" from the Arizona Vegas papers. 220 * is the "Diff" from the Arizona Vegas papers.
265 *
266 * Again, this is a fixed point number with
267 * V_PARAM_SHIFT bits to the right of the binary
268 * point.
269 */ 221 */
270 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; 222 diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
271 223
272 if (diff > gamma && tp->snd_ssthresh > 2 ) { 224 if (diff > gamma && tp->snd_ssthresh > 2 ) {
273 /* Going too fast. Time to slow down 225 /* Going too fast. Time to slow down
@@ -282,16 +234,13 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
282 * truncation robs us of full link 234 * truncation robs us of full link
283 * utilization. 235 * utilization.
284 */ 236 */
285 tp->snd_cwnd = min(tp->snd_cwnd, 237 tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
286 ((u32)target_cwnd >>
287 V_PARAM_SHIFT)+1);
288 238
289 } else if (tp->snd_cwnd <= tp->snd_ssthresh) { 239 } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
290 /* Slow start. */ 240 /* Slow start. */
291 tcp_slow_start(tp); 241 tcp_slow_start(tp);
292 } else { 242 } else {
293 /* Congestion avoidance. */ 243 /* Congestion avoidance. */
294 u32 next_snd_cwnd;
295 244
296 /* Figure out where we would like cwnd 245 /* Figure out where we would like cwnd
297 * to be. 246 * to be.
@@ -300,26 +249,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
300 /* The old window was too fast, so 249 /* The old window was too fast, so
301 * we slow down. 250 * we slow down.
302 */ 251 */
303 next_snd_cwnd = old_snd_cwnd - 1; 252 tp->snd_cwnd--;
304 } else if (diff < alpha) { 253 } else if (diff < alpha) {
305 /* We don't have enough extra packets 254 /* We don't have enough extra packets
306 * in the network, so speed up. 255 * in the network, so speed up.
307 */ 256 */
308 next_snd_cwnd = old_snd_cwnd + 1; 257 tp->snd_cwnd++;
309 } else { 258 } else {
310 /* Sending just as fast as we 259 /* Sending just as fast as we
311 * should be. 260 * should be.
312 */ 261 */
313 next_snd_cwnd = old_snd_cwnd;
314 } 262 }
315
316 /* Adjust cwnd upward or downward, toward the
317 * desired value.
318 */
319 if (next_snd_cwnd > tp->snd_cwnd)
320 tp->snd_cwnd++;
321 else if (next_snd_cwnd < tp->snd_cwnd)
322 tp->snd_cwnd--;
323 } 263 }
324 264
325 if (tp->snd_cwnd < 2) 265 if (tp->snd_cwnd < 2)