diff options
author | Gerrit Renker <gerrit@erg.abdn.ac.uk> | 2010-08-22 15:41:40 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-08-23 23:13:31 -0400 |
commit | 231cc2aaf14bad3b2325be0b19b8385ff5e75485 (patch) | |
tree | 0836d99d5c6fedca5793db99799a41ba35863b38 /net/dccp | |
parent | c38c92a84a9291a3d0eaf6a13650a11961ae964f (diff) |
dccp ccid-2: Replace broken RTT estimator with better algorithm
The current CCID-2 RTT estimator code is in parts broken and lags behind the
suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR.
That code is replaced by the present patch, which reuses the Linux TCP RTT
estimator code.
Further details:
----------------
1. The minimum RTO of previously one second has been replaced with TCP's, since
RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4)
is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's
concept of a default RTT (RFC 4340, 3.4).
2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with
RFC2988, (2.5).
3. De-inlined the function ccid2_new_ack().
4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will
give the wrong estimate. It should be replaced with one sample per Ack.
However, at the moment this can not be resolved easily, since
- it depends on TX history code (which also needs some work),
- the cleanest solution is not to use the `sent' time at all (saves 4 bytes
per entry) and use DCCP timestamps / elapsed time to estimated the RTT,
which however is non-trivial to get right (but needs to be done).
Reasons for reusing the Linux TCP estimator algorithm:
------------------------------------------------------
Some time was spent to find a better alternative, using basic RFC2988 as a first
step. Further analysis and experimentation showed that the Linux TCP RTO
estimator is superior to a basic RFC2988 implementation. A summary is on
http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/
In addition, this estimator fared well in a recent empirical evaluation:
Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith.
A Performance Study of Loss Detection/Recovery in Real-world TCP
Implementations. Proceedings of 15th IEEE International
Conference on Network Protocols (ICNP-07), 2007.
Thus there is significant benefit in reusing the existing TCP code.
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/dccp')
-rw-r--r-- | net/dccp/ccids/ccid2.c | 169 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.h | 20 |
2 files changed, 108 insertions, 81 deletions
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f7f5069b1e84..7af3106c1f94 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c | |||
@@ -113,19 +113,12 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | |||
113 | dp->dccps_l_ack_ratio = val; | 113 | dp->dccps_l_ack_ratio = val; |
114 | } | 114 | } |
115 | 115 | ||
116 | static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val) | ||
117 | { | ||
118 | ccid2_pr_debug("change SRTT to %ld\n", val); | ||
119 | hc->tx_srtt = val; | ||
120 | } | ||
121 | |||
122 | static void ccid2_start_rto_timer(struct sock *sk); | 116 | static void ccid2_start_rto_timer(struct sock *sk); |
123 | 117 | ||
124 | static void ccid2_hc_tx_rto_expire(unsigned long data) | 118 | static void ccid2_hc_tx_rto_expire(unsigned long data) |
125 | { | 119 | { |
126 | struct sock *sk = (struct sock *)data; | 120 | struct sock *sk = (struct sock *)data; |
127 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); | 121 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); |
128 | long s; | ||
129 | 122 | ||
130 | bh_lock_sock(sk); | 123 | bh_lock_sock(sk); |
131 | if (sock_owned_by_user(sk)) { | 124 | if (sock_owned_by_user(sk)) { |
@@ -137,10 +130,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) | |||
137 | 130 | ||
138 | /* back-off timer */ | 131 | /* back-off timer */ |
139 | hc->tx_rto <<= 1; | 132 | hc->tx_rto <<= 1; |
140 | 133 | if (hc->tx_rto > DCCP_RTO_MAX) | |
141 | s = hc->tx_rto / HZ; | 134 | hc->tx_rto = DCCP_RTO_MAX; |
142 | if (s > 60) | ||
143 | hc->tx_rto = 60 * HZ; | ||
144 | 135 | ||
145 | ccid2_start_rto_timer(sk); | 136 | ccid2_start_rto_timer(sk); |
146 | 137 | ||
@@ -168,7 +159,7 @@ static void ccid2_start_rto_timer(struct sock *sk) | |||
168 | { | 159 | { |
169 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); | 160 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); |
170 | 161 | ||
171 | ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); | 162 | ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto); |
172 | 163 | ||
173 | BUG_ON(timer_pending(&hc->tx_rtotimer)); | 164 | BUG_ON(timer_pending(&hc->tx_rtotimer)); |
174 | sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); | 165 | sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); |
@@ -339,9 +330,86 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) | |||
339 | ccid2_pr_debug("deleted RTO timer\n"); | 330 | ccid2_pr_debug("deleted RTO timer\n"); |
340 | } | 331 | } |
341 | 332 | ||
342 | static inline void ccid2_new_ack(struct sock *sk, | 333 | /** |
343 | struct ccid2_seq *seqp, | 334 | * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm |
344 | unsigned int *maxincr) | 335 | * This code is almost identical with TCP's tcp_rtt_estimator(), since |
336 | * - it has a higher sampling frequency (recommended by RFC 1323), | ||
337 | * - the RTO does not collapse into RTT due to RTTVAR going towards zero, | ||
338 | * - it is simple (cf. more complex proposals such as Eifel timer or research | ||
339 | * which suggests that the gain should be set according to window size), | ||
340 | * - in tests it was found to work well with CCID2 [gerrit]. | ||
341 | */ | ||
342 | static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) | ||
343 | { | ||
344 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); | ||
345 | long m = mrtt ? : 1; | ||
346 | |||
347 | if (hc->tx_srtt == 0) { | ||
348 | /* First measurement m */ | ||
349 | hc->tx_srtt = m << 3; | ||
350 | hc->tx_mdev = m << 1; | ||
351 | |||
352 | hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev); | ||
353 | hc->tx_rttvar = hc->tx_mdev_max; | ||
354 | hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; | ||
355 | } else { | ||
356 | /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ | ||
357 | m -= (hc->tx_srtt >> 3); | ||
358 | hc->tx_srtt += m; | ||
359 | |||
360 | /* Similarly, update scaled mdev with regard to |m| */ | ||
361 | if (m < 0) { | ||
362 | m = -m; | ||
363 | m -= (hc->tx_mdev >> 2); | ||
364 | /* | ||
365 | * This neutralises RTO increase when RTT < SRTT - mdev | ||
366 | * (see P. Sarolahti, A. Kuznetsov,"Congestion Control | ||
367 | * in Linux TCP", USENIX 2002, pp. 49-62). | ||
368 | */ | ||
369 | if (m > 0) | ||
370 | m >>= 3; | ||
371 | } else { | ||
372 | m -= (hc->tx_mdev >> 2); | ||
373 | } | ||
374 | hc->tx_mdev += m; | ||
375 | |||
376 | if (hc->tx_mdev > hc->tx_mdev_max) { | ||
377 | hc->tx_mdev_max = hc->tx_mdev; | ||
378 | if (hc->tx_mdev_max > hc->tx_rttvar) | ||
379 | hc->tx_rttvar = hc->tx_mdev_max; | ||
380 | } | ||
381 | |||
382 | /* | ||
383 | * Decay RTTVAR at most once per flight, exploiting that | ||
384 | * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) | ||
385 | * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) | ||
386 | * GAR is a useful bound for FlightSize = pipe. | ||
387 | * AWL is probably too low here, as it over-estimates pipe. | ||
388 | */ | ||
389 | if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { | ||
390 | if (hc->tx_mdev_max < hc->tx_rttvar) | ||
391 | hc->tx_rttvar -= (hc->tx_rttvar - | ||
392 | hc->tx_mdev_max) >> 2; | ||
393 | hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; | ||
394 | hc->tx_mdev_max = TCP_RTO_MIN; | ||
395 | } | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Set RTO from SRTT and RTTVAR | ||
400 | * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. | ||
401 | * This agrees with RFC 4341, 5: | ||
402 | * "Because DCCP does not retransmit data, DCCP does not require | ||
403 | * TCP's recommended minimum timeout of one second". | ||
404 | */ | ||
405 | hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; | ||
406 | |||
407 | if (hc->tx_rto > DCCP_RTO_MAX) | ||
408 | hc->tx_rto = DCCP_RTO_MAX; | ||
409 | } | ||
410 | |||
411 | static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, | ||
412 | unsigned int *maxincr) | ||
345 | { | 413 | { |
346 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); | 414 | struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); |
347 | 415 | ||
@@ -355,64 +423,15 @@ static inline void ccid2_new_ack(struct sock *sk, | |||
355 | hc->tx_cwnd += 1; | 423 | hc->tx_cwnd += 1; |
356 | hc->tx_packets_acked = 0; | 424 | hc->tx_packets_acked = 0; |
357 | } | 425 | } |
358 | 426 | /* | |
359 | /* update RTO */ | 427 | * FIXME: RTT is sampled several times per acknowledgment (for each |
360 | if (hc->tx_srtt == -1 || | 428 | * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). |
361 | time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { | 429 | * This causes the RTT to be over-estimated, since the older entries |
362 | unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; | 430 | * in the Ack Vector have earlier sending times. |
363 | int s; | 431 | * The cleanest solution is to not use the ccid2s_sent field at all |
364 | 432 | * and instead use DCCP timestamps: requires changes in other places. | |
365 | /* first measurement */ | 433 | */ |
366 | if (hc->tx_srtt == -1) { | 434 | ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); |
367 | ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", | ||
368 | r, jiffies, | ||
369 | (unsigned long long)seqp->ccid2s_seq); | ||
370 | ccid2_change_srtt(hc, r); | ||
371 | hc->tx_rttvar = r >> 1; | ||
372 | } else { | ||
373 | /* RTTVAR */ | ||
374 | long tmp = hc->tx_srtt - r; | ||
375 | long srtt; | ||
376 | |||
377 | if (tmp < 0) | ||
378 | tmp *= -1; | ||
379 | |||
380 | tmp >>= 2; | ||
381 | hc->tx_rttvar *= 3; | ||
382 | hc->tx_rttvar >>= 2; | ||
383 | hc->tx_rttvar += tmp; | ||
384 | |||
385 | /* SRTT */ | ||
386 | srtt = hc->tx_srtt; | ||
387 | srtt *= 7; | ||
388 | srtt >>= 3; | ||
389 | tmp = r >> 3; | ||
390 | srtt += tmp; | ||
391 | ccid2_change_srtt(hc, srtt); | ||
392 | } | ||
393 | s = hc->tx_rttvar << 2; | ||
394 | /* clock granularity is 1 when based on jiffies */ | ||
395 | if (!s) | ||
396 | s = 1; | ||
397 | hc->tx_rto = hc->tx_srtt + s; | ||
398 | |||
399 | /* must be at least a second */ | ||
400 | s = hc->tx_rto / HZ; | ||
401 | /* DCCP doesn't require this [but I like it cuz my code sux] */ | ||
402 | #if 1 | ||
403 | if (s < 1) | ||
404 | hc->tx_rto = HZ; | ||
405 | #endif | ||
406 | /* max 60 seconds */ | ||
407 | if (s > 60) | ||
408 | hc->tx_rto = HZ * 60; | ||
409 | |||
410 | hc->tx_lastrtt = jiffies; | ||
411 | |||
412 | ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", | ||
413 | hc->tx_srtt, hc->tx_rttvar, | ||
414 | hc->tx_rto, HZ, r); | ||
415 | } | ||
416 | } | 435 | } |
417 | 436 | ||
418 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) | 437 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) |
@@ -662,9 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | |||
662 | if (ccid2_hc_tx_alloc_seq(hc)) | 681 | if (ccid2_hc_tx_alloc_seq(hc)) |
663 | return -ENOMEM; | 682 | return -ENOMEM; |
664 | 683 | ||
665 | hc->tx_rto = 3 * HZ; | 684 | hc->tx_rto = DCCP_TIMEOUT_INIT; |
666 | ccid2_change_srtt(hc, -1); | ||
667 | hc->tx_rttvar = -1; | ||
668 | hc->tx_rpdupack = -1; | 685 | hc->tx_rpdupack = -1; |
669 | hc->tx_last_cong = jiffies; | 686 | hc->tx_last_cong = jiffies; |
670 | setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, | 687 | setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, |
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 1ec6a30103bb..b017843ba44d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h | |||
@@ -42,7 +42,12 @@ struct ccid2_seq { | |||
42 | * struct ccid2_hc_tx_sock - CCID2 TX half connection | 42 | * struct ccid2_hc_tx_sock - CCID2 TX half connection |
43 | * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 | 43 | * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 |
44 | * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) | 44 | * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) |
45 | * @tx_lastrtt: time RTT was last measured | 45 | * @tx_srtt: smoothed RTT estimate, scaled by 2^3 |
46 | * @tx_mdev: smoothed RTT variation, scaled by 2^2 | ||
47 | * @tx_mdev_max: maximum of @mdev during one flight | ||
48 | * @tx_rttvar: moving average/maximum of @mdev_max | ||
49 | * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) | ||
50 | * @tx_rtt_seq: to decay RTTVAR at most once per flight | ||
46 | * @tx_rpseq: last consecutive seqno | 51 | * @tx_rpseq: last consecutive seqno |
47 | * @tx_rpdupack: dupacks since rpseq | 52 | * @tx_rpdupack: dupacks since rpseq |
48 | */ | 53 | */ |
@@ -55,11 +60,16 @@ struct ccid2_hc_tx_sock { | |||
55 | int tx_seqbufc; | 60 | int tx_seqbufc; |
56 | struct ccid2_seq *tx_seqh; | 61 | struct ccid2_seq *tx_seqh; |
57 | struct ccid2_seq *tx_seqt; | 62 | struct ccid2_seq *tx_seqt; |
58 | long tx_rto; | 63 | |
59 | long tx_srtt; | 64 | /* RTT measurement: variables/principles are the same as in TCP */ |
60 | long tx_rttvar; | 65 | u32 tx_srtt, |
61 | unsigned long tx_lastrtt; | 66 | tx_mdev, |
67 | tx_mdev_max, | ||
68 | tx_rttvar, | ||
69 | tx_rto; | ||
70 | u64 tx_rtt_seq:48; | ||
62 | struct timer_list tx_rtotimer; | 71 | struct timer_list tx_rtotimer; |
72 | |||
63 | u64 tx_rpseq; | 73 | u64 tx_rpseq; |
64 | int tx_rpdupack; | 74 | int tx_rpdupack; |
65 | unsigned long tx_last_cong; | 75 | unsigned long tx_last_cong; |