aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGerrit Renker <gerrit@erg.abdn.ac.uk>2008-09-04 01:30:19 -0400
committerGerrit Renker <gerrit@erg.abdn.ac.uk>2008-09-04 01:45:39 -0400
commit1435562d7e0412e4885b661843f69859013f9d25 (patch)
tree8357027ac15fa199051e8d85aa448115c3bdd2c2
parente9803c0104564698d3b8e84ccdb0b8b0e65427e2 (diff)
dccp ccid-2: Replace broken RTT estimator with better algorithm
The current CCID-2 RTT estimator code is in parts broken and lags behind the suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR. That code is replaced by the present patch, which reuses the Linux TCP RTT estimator code - reasons for this code duplication are given below. Further details: ---------------- 1. The minimum RTO of previously one second has been replaced with TCP's, since RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4) is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's concept of a default RTT (RFC 4340, 3.4). 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with RFC2988, (2.5). 3. De-inlined the function ccid2_new_ack(). 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will give the wrong estimate. It should be replaced with one sample per Ack. However, at the moment this can not be resolved easily, since - it depends on TX history code (which also needs some work), - the cleanest solution is not to use the `sent' time at all (saves 4 bytes per entry) and use DCCP timestamps / elapsed time to estimated the RTT, which however is non-trivial to get right (but needs to be done). Reasons for reusing the Linux TCP estimator algorithm: ------------------------------------------------------ Some time was spent to find a better alternative, using basic RFC2988 as a first step. Further analysis and experimentation showed that the Linux TCP RTO estimator is superior to a basic RFC2988 implementation. A summary is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/ In addition, this estimator fared well in a recent empirical evaluation: Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith. A Performance Study of Loss Detection/Recovery in Real-world TCP Implementations. Proceedings of 15th IEEE International Conference on Network Protocols (ICNP-07). 2007. Thus there is significant benefit in reusing the existing TCP code. Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
-rw-r--r--net/dccp/ccids/ccid2.c171
-rw-r--r--net/dccp/ccids/ccid2.h18
2 files changed, 107 insertions, 82 deletions
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index fa074d44206..22753fd9869 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -110,12 +110,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
110 dp->dccps_l_ack_ratio = val; 110 dp->dccps_l_ack_ratio = val;
111} 111}
112 112
113static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
114{
115 ccid2_pr_debug("change SRTT to %ld\n", val);
116 hctx->srtt = val;
117}
118
119static void ccid2_start_rto_timer(struct sock *sk); 113static void ccid2_start_rto_timer(struct sock *sk);
120 114
121static void ccid2_hc_tx_rto_expire(unsigned long data) 115static void ccid2_hc_tx_rto_expire(unsigned long data)
@@ -123,7 +117,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
123 struct sock *sk = (struct sock *)data; 117 struct sock *sk = (struct sock *)data;
124 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 118 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
125 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 119 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
126 long s;
127 120
128 bh_lock_sock(sk); 121 bh_lock_sock(sk);
129 if (sock_owned_by_user(sk)) { 122 if (sock_owned_by_user(sk)) {
@@ -135,10 +128,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
135 128
136 /* back-off timer */ 129 /* back-off timer */
137 hctx->rto <<= 1; 130 hctx->rto <<= 1;
138 131 if (hctx->rto > DCCP_RTO_MAX)
139 s = hctx->rto / HZ; 132 hctx->rto = DCCP_RTO_MAX;
140 if (s > 60)
141 hctx->rto = 60 * HZ;
142 133
143 /* adjust pipe, cwnd etc */ 134 /* adjust pipe, cwnd etc */
144 hctx->ssthresh = hctx->cwnd / 2; 135 hctx->ssthresh = hctx->cwnd / 2;
@@ -279,9 +270,87 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
279 ccid2_pr_debug("deleted RTO timer\n"); 270 ccid2_pr_debug("deleted RTO timer\n");
280} 271}
281 272
282static inline void ccid2_new_ack(struct sock *sk, 273/**
283 struct ccid2_seq *seqp, 274 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
284 unsigned int *maxincr) 275 * This code is almost identical with TCP's tcp_rtt_estimator(), since
276 * - it has a higher sampling frequency (recommended by RFC 1323),
277 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
278 * - it is simple (cf. more complex proposals such as Eifel timer or research
279 * which suggests that the gain should be set according to window size),
280 * - in tests it was found to work well with CCID2 [gerrit].
281 */
282static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
283{
284 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
285 long m = mrtt ? : 1;
286
287 if (hctx->srtt == 0) {
288 /* First measurement m */
289 hctx->srtt = m << 3;
290 hctx->mdev = m << 1;
291
292 hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
293 hctx->rttvar = hctx->mdev_max;
294 hctx->rtt_seq = dccp_sk(sk)->dccps_gss;
295 } else {
296 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
297 m -= (hctx->srtt >> 3);
298 hctx->srtt += m;
299
300 /* Similarly, update scaled mdev with regard to |m| */
301 if (m < 0) {
302 m = -m;
303 m -= (hctx->mdev >> 2);
304 /*
305 * This neutralises RTO increase when RTT < SRTT - mdev
306 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
307 * in Linux TCP", USENIX 2002, pp. 49-62).
308 */
309 if (m > 0)
310 m >>= 3;
311 } else {
312 m -= (hctx->mdev >> 2);
313 }
314 hctx->mdev += m;
315
316 if (hctx->mdev > hctx->mdev_max) {
317 hctx->mdev_max = hctx->mdev;
318 if (hctx->mdev_max > hctx->rttvar)
319 hctx->rttvar = hctx->mdev_max;
320 }
321
322 /*
323 * Decay RTTVAR at most once per flight, exploiting that
324 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
325 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
326 * GAR is a useful bound for FlightSize = pipe, AWL is probably
327 * too low as it over-estimates pipe.
328 */
329 if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
330 if (hctx->mdev_max < hctx->rttvar)
331 hctx->rttvar -= (hctx->rttvar -
332 hctx->mdev_max) >> 2;
333 hctx->rtt_seq = dccp_sk(sk)->dccps_gss;
334 hctx->mdev_max = TCP_RTO_MIN;
335 }
336 }
337
338 /*
339 * Set RTO from SRTT and RTTVAR
340 * Clock granularity is ignored since the minimum error for RTTVAR is
341 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
342 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
343 * does not retransmit data, DCCP does not require TCP's recommended
344 * minimum timeout of one second".
345 */
346 hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
347
348 if (hctx->rto > DCCP_RTO_MAX)
349 hctx->rto = DCCP_RTO_MAX;
350}
351
352static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
353 unsigned int *maxincr)
285{ 354{
286 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 355 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
287 356
@@ -295,64 +364,15 @@ static inline void ccid2_new_ack(struct sock *sk,
295 hctx->cwnd += 1; 364 hctx->cwnd += 1;
296 hctx->packets_acked = 0; 365 hctx->packets_acked = 0;
297 } 366 }
298 367 /*
299 /* update RTO */ 368 * FIXME: RTT is sampled several times per acknowledgment (for each
300 if (hctx->srtt == -1 || 369 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
301 time_after(jiffies, hctx->lastrtt + hctx->srtt)) { 370 * This causes the RTT to be over-estimated, since the older entries
302 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 371 * in the Ack Vector have earlier sending times.
303 int s; 372 * The cleanest solution is to not use the ccid2s_sent field at all
304 373 * and instead use DCCP timestamps - need to be resolved at some time.
305 /* first measurement */ 374 */
306 if (hctx->srtt == -1) { 375 ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
307 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
308 r, jiffies,
309 (unsigned long long)seqp->ccid2s_seq);
310 ccid2_change_srtt(hctx, r);
311 hctx->rttvar = r >> 1;
312 } else {
313 /* RTTVAR */
314 long tmp = hctx->srtt - r;
315 long srtt;
316
317 if (tmp < 0)
318 tmp *= -1;
319
320 tmp >>= 2;
321 hctx->rttvar *= 3;
322 hctx->rttvar >>= 2;
323 hctx->rttvar += tmp;
324
325 /* SRTT */
326 srtt = hctx->srtt;
327 srtt *= 7;
328 srtt >>= 3;
329 tmp = r >> 3;
330 srtt += tmp;
331 ccid2_change_srtt(hctx, srtt);
332 }
333 s = hctx->rttvar << 2;
334 /* clock granularity is 1 when based on jiffies */
335 if (!s)
336 s = 1;
337 hctx->rto = hctx->srtt + s;
338
339 /* must be at least a second */
340 s = hctx->rto / HZ;
341 /* DCCP doesn't require this [but I like it cuz my code sux] */
342#if 1
343 if (s < 1)
344 hctx->rto = HZ;
345#endif
346 /* max 60 seconds */
347 if (s > 60)
348 hctx->rto = HZ * 60;
349
350 hctx->lastrtt = jiffies;
351
352 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
353 hctx->srtt, hctx->rttvar,
354 hctx->rto, HZ, r);
355 }
356} 376}
357 377
358static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 378static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -579,8 +599,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
579 if (hctx->pipe == 0) 599 if (hctx->pipe == 0)
580 sk_stop_timer(sk, &hctx->rtotimer); 600 sk_stop_timer(sk, &hctx->rtotimer);
581 else 601 else
582 sk_reset_timer(sk, &hctx->rtotimer, 602 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
583 jiffies + hctx->rto);
584done: 603done:
585 /* check if incoming Acks allow pending packets to be sent */ 604 /* check if incoming Acks allow pending packets to be sent */
586 if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) 605 if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
@@ -613,9 +632,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
613 if (ccid2_hc_tx_alloc_seq(hctx)) 632 if (ccid2_hc_tx_alloc_seq(hctx))
614 return -ENOMEM; 633 return -ENOMEM;
615 634
616 hctx->rto = 3 * HZ; 635 hctx->rto = DCCP_TIMEOUT_INIT;
617 ccid2_change_srtt(hctx, -1);
618 hctx->rttvar = -1;
619 hctx->rpdupack = -1; 636 hctx->rpdupack = -1;
620 hctx->last_cong = jiffies; 637 hctx->last_cong = jiffies;
621 setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); 638 setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index d5900dd5d4f..8b7a2dee2f6 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -44,7 +44,12 @@ struct ccid2_seq {
44 * 44 *
45 * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 45 * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
46 * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 46 * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
47 * @lastrtt: time RTT was last measured 47 * @srtt: smoothed RTT estimate, scaled by 2^3
48 * @mdev: smoothed RTT variation, scaled by 2^2
49 * @mdev_max: maximum of @mdev during one flight
50 * @rttvar: moving average/maximum of @mdev_max
51 * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
52 * @rtt_seq: to decay RTTVAR at most once per flight
48 * @rpseq: last consecutive seqno 53 * @rpseq: last consecutive seqno
49 * @rpdupack: dupacks since rpseq 54 * @rpdupack: dupacks since rpseq
50 * @av_chunks: list of Ack Vectors received on current skb 55 * @av_chunks: list of Ack Vectors received on current skb
@@ -58,10 +63,13 @@ struct ccid2_hc_tx_sock {
58 int seqbufc; 63 int seqbufc;
59 struct ccid2_seq *seqh; 64 struct ccid2_seq *seqh;
60 struct ccid2_seq *seqt; 65 struct ccid2_seq *seqt;
61 long rto; 66 /* RTT measurement: variables/principles are the same as in TCP */
62 long srtt; 67 u32 srtt,
63 long rttvar; 68 mdev,
64 unsigned long lastrtt; 69 mdev_max,
70 rttvar,
71 rto;
72 u64 rtt_seq:48;
65 struct timer_list rtotimer; 73 struct timer_list rtotimer;
66 u64 rpseq; 74 u64 rpseq;
67 int rpdupack; 75 int rpdupack;