aboutsummaryrefslogtreecommitdiffstats
path: root/net/dccp
diff options
context:
space:
mode:
authorGerrit Renker <gerrit@erg.abdn.ac.uk>2010-08-22 15:41:40 -0400
committerDavid S. Miller <davem@davemloft.net>2010-08-23 23:13:31 -0400
commit231cc2aaf14bad3b2325be0b19b8385ff5e75485 (patch)
tree0836d99d5c6fedca5793db99799a41ba35863b38 /net/dccp
parentc38c92a84a9291a3d0eaf6a13650a11961ae964f (diff)
dccp ccid-2: Replace broken RTT estimator with better algorithm
The current CCID-2 RTT estimator code is in parts broken and lags behind the suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR. That code is replaced by the present patch, which reuses the Linux TCP RTT estimator code. Further details: ---------------- 1. The minimum RTO of previously one second has been replaced with TCP's, since RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4) is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's concept of a default RTT (RFC 4340, 3.4). 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with RFC2988, (2.5). 3. De-inlined the function ccid2_new_ack(). 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will give the wrong estimate. It should be replaced with one sample per Ack. However, at the moment this can not be resolved easily, since - it depends on TX history code (which also needs some work), - the cleanest solution is not to use the `sent' time at all (saves 4 bytes per entry) and use DCCP timestamps / elapsed time to estimated the RTT, which however is non-trivial to get right (but needs to be done). Reasons for reusing the Linux TCP estimator algorithm: ------------------------------------------------------ Some time was spent to find a better alternative, using basic RFC2988 as a first step. Further analysis and experimentation showed that the Linux TCP RTO estimator is superior to a basic RFC2988 implementation. A summary is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/ In addition, this estimator fared well in a recent empirical evaluation: Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith. A Performance Study of Loss Detection/Recovery in Real-world TCP Implementations. Proceedings of 15th IEEE International Conference on Network Protocols (ICNP-07), 2007. Thus there is significant benefit in reusing the existing TCP code. Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/dccp')
-rw-r--r--net/dccp/ccids/ccid2.c169
-rw-r--r--net/dccp/ccids/ccid2.h20
2 files changed, 108 insertions, 81 deletions
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index f7f5069b1e84..7af3106c1f94 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -113,19 +113,12 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
113 dp->dccps_l_ack_ratio = val; 113 dp->dccps_l_ack_ratio = val;
114} 114}
115 115
116static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
117{
118 ccid2_pr_debug("change SRTT to %ld\n", val);
119 hc->tx_srtt = val;
120}
121
122static void ccid2_start_rto_timer(struct sock *sk); 116static void ccid2_start_rto_timer(struct sock *sk);
123 117
124static void ccid2_hc_tx_rto_expire(unsigned long data) 118static void ccid2_hc_tx_rto_expire(unsigned long data)
125{ 119{
126 struct sock *sk = (struct sock *)data; 120 struct sock *sk = (struct sock *)data;
127 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 121 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
128 long s;
129 122
130 bh_lock_sock(sk); 123 bh_lock_sock(sk);
131 if (sock_owned_by_user(sk)) { 124 if (sock_owned_by_user(sk)) {
@@ -137,10 +130,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
137 130
138 /* back-off timer */ 131 /* back-off timer */
139 hc->tx_rto <<= 1; 132 hc->tx_rto <<= 1;
140 133 if (hc->tx_rto > DCCP_RTO_MAX)
141 s = hc->tx_rto / HZ; 134 hc->tx_rto = DCCP_RTO_MAX;
142 if (s > 60)
143 hc->tx_rto = 60 * HZ;
144 135
145 ccid2_start_rto_timer(sk); 136 ccid2_start_rto_timer(sk);
146 137
@@ -168,7 +159,7 @@ static void ccid2_start_rto_timer(struct sock *sk)
168{ 159{
169 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 160 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
170 161
171 ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); 162 ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto);
172 163
173 BUG_ON(timer_pending(&hc->tx_rtotimer)); 164 BUG_ON(timer_pending(&hc->tx_rtotimer));
174 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); 165 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
@@ -339,9 +330,86 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
339 ccid2_pr_debug("deleted RTO timer\n"); 330 ccid2_pr_debug("deleted RTO timer\n");
340} 331}
341 332
342static inline void ccid2_new_ack(struct sock *sk, 333/**
343 struct ccid2_seq *seqp, 334 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
344 unsigned int *maxincr) 335 * This code is almost identical with TCP's tcp_rtt_estimator(), since
336 * - it has a higher sampling frequency (recommended by RFC 1323),
337 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
338 * - it is simple (cf. more complex proposals such as Eifel timer or research
339 * which suggests that the gain should be set according to window size),
340 * - in tests it was found to work well with CCID2 [gerrit].
341 */
342static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
343{
344 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
345 long m = mrtt ? : 1;
346
347 if (hc->tx_srtt == 0) {
348 /* First measurement m */
349 hc->tx_srtt = m << 3;
350 hc->tx_mdev = m << 1;
351
352 hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev);
353 hc->tx_rttvar = hc->tx_mdev_max;
354 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
355 } else {
356 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
357 m -= (hc->tx_srtt >> 3);
358 hc->tx_srtt += m;
359
360 /* Similarly, update scaled mdev with regard to |m| */
361 if (m < 0) {
362 m = -m;
363 m -= (hc->tx_mdev >> 2);
364 /*
365 * This neutralises RTO increase when RTT < SRTT - mdev
366 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
367 * in Linux TCP", USENIX 2002, pp. 49-62).
368 */
369 if (m > 0)
370 m >>= 3;
371 } else {
372 m -= (hc->tx_mdev >> 2);
373 }
374 hc->tx_mdev += m;
375
376 if (hc->tx_mdev > hc->tx_mdev_max) {
377 hc->tx_mdev_max = hc->tx_mdev;
378 if (hc->tx_mdev_max > hc->tx_rttvar)
379 hc->tx_rttvar = hc->tx_mdev_max;
380 }
381
382 /*
383 * Decay RTTVAR at most once per flight, exploiting that
384 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
385 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
386 * GAR is a useful bound for FlightSize = pipe.
387 * AWL is probably too low here, as it over-estimates pipe.
388 */
389 if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
390 if (hc->tx_mdev_max < hc->tx_rttvar)
391 hc->tx_rttvar -= (hc->tx_rttvar -
392 hc->tx_mdev_max) >> 2;
393 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
394 hc->tx_mdev_max = TCP_RTO_MIN;
395 }
396 }
397
398 /*
399 * Set RTO from SRTT and RTTVAR
400 * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
401 * This agrees with RFC 4341, 5:
402 * "Because DCCP does not retransmit data, DCCP does not require
403 * TCP's recommended minimum timeout of one second".
404 */
405 hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
406
407 if (hc->tx_rto > DCCP_RTO_MAX)
408 hc->tx_rto = DCCP_RTO_MAX;
409}
410
411static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
412 unsigned int *maxincr)
345{ 413{
346 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 414 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
347 415
@@ -355,64 +423,15 @@ static inline void ccid2_new_ack(struct sock *sk,
355 hc->tx_cwnd += 1; 423 hc->tx_cwnd += 1;
356 hc->tx_packets_acked = 0; 424 hc->tx_packets_acked = 0;
357 } 425 }
358 426 /*
359 /* update RTO */ 427 * FIXME: RTT is sampled several times per acknowledgment (for each
360 if (hc->tx_srtt == -1 || 428 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
361 time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { 429 * This causes the RTT to be over-estimated, since the older entries
362 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 430 * in the Ack Vector have earlier sending times.
363 int s; 431 * The cleanest solution is to not use the ccid2s_sent field at all
364 432 * and instead use DCCP timestamps: requires changes in other places.
365 /* first measurement */ 433 */
366 if (hc->tx_srtt == -1) { 434 ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
367 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
368 r, jiffies,
369 (unsigned long long)seqp->ccid2s_seq);
370 ccid2_change_srtt(hc, r);
371 hc->tx_rttvar = r >> 1;
372 } else {
373 /* RTTVAR */
374 long tmp = hc->tx_srtt - r;
375 long srtt;
376
377 if (tmp < 0)
378 tmp *= -1;
379
380 tmp >>= 2;
381 hc->tx_rttvar *= 3;
382 hc->tx_rttvar >>= 2;
383 hc->tx_rttvar += tmp;
384
385 /* SRTT */
386 srtt = hc->tx_srtt;
387 srtt *= 7;
388 srtt >>= 3;
389 tmp = r >> 3;
390 srtt += tmp;
391 ccid2_change_srtt(hc, srtt);
392 }
393 s = hc->tx_rttvar << 2;
394 /* clock granularity is 1 when based on jiffies */
395 if (!s)
396 s = 1;
397 hc->tx_rto = hc->tx_srtt + s;
398
399 /* must be at least a second */
400 s = hc->tx_rto / HZ;
401 /* DCCP doesn't require this [but I like it cuz my code sux] */
402#if 1
403 if (s < 1)
404 hc->tx_rto = HZ;
405#endif
406 /* max 60 seconds */
407 if (s > 60)
408 hc->tx_rto = HZ * 60;
409
410 hc->tx_lastrtt = jiffies;
411
412 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
413 hc->tx_srtt, hc->tx_rttvar,
414 hc->tx_rto, HZ, r);
415 }
416} 435}
417 436
418static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 437static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -662,9 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
662 if (ccid2_hc_tx_alloc_seq(hc)) 681 if (ccid2_hc_tx_alloc_seq(hc))
663 return -ENOMEM; 682 return -ENOMEM;
664 683
665 hc->tx_rto = 3 * HZ; 684 hc->tx_rto = DCCP_TIMEOUT_INIT;
666 ccid2_change_srtt(hc, -1);
667 hc->tx_rttvar = -1;
668 hc->tx_rpdupack = -1; 685 hc->tx_rpdupack = -1;
669 hc->tx_last_cong = jiffies; 686 hc->tx_last_cong = jiffies;
670 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 687 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103bb..b017843ba44d 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,7 +42,12 @@ struct ccid2_seq {
42 * struct ccid2_hc_tx_sock - CCID2 TX half connection 42 * struct ccid2_hc_tx_sock - CCID2 TX half connection
43 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 43 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
44 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 44 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
45 * @tx_lastrtt: time RTT was last measured 45 * @tx_srtt: smoothed RTT estimate, scaled by 2^3
46 * @tx_mdev: smoothed RTT variation, scaled by 2^2
47 * @tx_mdev_max: maximum of @mdev during one flight
48 * @tx_rttvar: moving average/maximum of @mdev_max
49 * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
50 * @tx_rtt_seq: to decay RTTVAR at most once per flight
46 * @tx_rpseq: last consecutive seqno 51 * @tx_rpseq: last consecutive seqno
47 * @tx_rpdupack: dupacks since rpseq 52 * @tx_rpdupack: dupacks since rpseq
48 */ 53 */
@@ -55,11 +60,16 @@ struct ccid2_hc_tx_sock {
55 int tx_seqbufc; 60 int tx_seqbufc;
56 struct ccid2_seq *tx_seqh; 61 struct ccid2_seq *tx_seqh;
57 struct ccid2_seq *tx_seqt; 62 struct ccid2_seq *tx_seqt;
58 long tx_rto; 63
59 long tx_srtt; 64 /* RTT measurement: variables/principles are the same as in TCP */
60 long tx_rttvar; 65 u32 tx_srtt,
61 unsigned long tx_lastrtt; 66 tx_mdev,
67 tx_mdev_max,
68 tx_rttvar,
69 tx_rto;
70 u64 tx_rtt_seq:48;
62 struct timer_list tx_rtotimer; 71 struct timer_list tx_rtotimer;
72
63 u64 tx_rpseq; 73 u64 tx_rpseq;
64 int tx_rpdupack; 74 int tx_rpdupack;
65 unsigned long tx_last_cong; 75 unsigned long tx_last_cong;