path: root/net/dccp/ccids
Diffstat (limited to 'net/dccp/ccids')
-rw-r--r--   net/dccp/ccids/Kconfig              |  30
-rw-r--r--   net/dccp/ccids/ccid2.c              | 622
-rw-r--r--   net/dccp/ccids/ccid2.h              |  63
-rw-r--r--   net/dccp/ccids/ccid3.c              | 762
-rw-r--r--   net/dccp/ccids/ccid3.h              | 153
-rw-r--r--   net/dccp/ccids/lib/loss_interval.c  |  30
-rw-r--r--   net/dccp/ccids/lib/loss_interval.h  |   4
-rw-r--r--   net/dccp/ccids/lib/packet_history.c | 282
-rw-r--r--   net/dccp/ccids/lib/packet_history.h |  78
-rw-r--r--   net/dccp/ccids/lib/tfrc.h           |  16
-rw-r--r--   net/dccp/ccids/lib/tfrc_equation.c  |  29
11 files changed, 953 insertions(+), 1116 deletions(-)
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 12275943eab8..fb168be2cb43 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,10 +1,8 @@
 menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
 
 config IP_DCCP_CCID2
-	tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
+	tristate "CCID2 (TCP-Like)"
 	def_tristate IP_DCCP
-	select IP_DCCP_ACKVEC
 	---help---
 	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
 	  Multiplicative Decrease (AIMD) congestion control with behavior
@@ -36,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG
 	  If in doubt, say N.
 
 config IP_DCCP_CCID3
-	tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
+	tristate "CCID3 (TCP-Friendly)"
 	def_tristate IP_DCCP
 	select IP_DCCP_TFRC_LIB
 	---help---
@@ -64,9 +62,9 @@ config IP_DCCP_CCID3
 
 	  If in doubt, say M.
 
+if IP_DCCP_CCID3
 config IP_DCCP_CCID3_DEBUG
 	bool "CCID3 debugging messages"
-	depends on IP_DCCP_CCID3
 	---help---
 	  Enable CCID3-specific debugging messages.
 
@@ -76,10 +74,29 @@ config IP_DCCP_CCID3_DEBUG
 
 	  If in doubt, say N.
 
+choice
+	prompt "Select method for measuring the packet size s"
+	default IP_DCCP_CCID3_MEASURE_S_AS_MPS
+
+config IP_DCCP_CCID3_MEASURE_S_AS_MPS
+	bool "Always use MPS in place of s"
+	---help---
+	  This use is recommended as it is consistent with the initialisation
+	  of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
+config IP_DCCP_CCID3_MEASURE_S_AS_AVG
+	bool "Use moving average"
+	---help---
+	  An alternative way of tracking s, also supported by rfc3448bis.
+	  This used to be the default for CCID-3 in previous kernels.
+config IP_DCCP_CCID3_MEASURE_S_AS_MAX
+	bool "Track the maximum payload length"
+	---help---
+	  An experimental method based on tracking the maximum packet size.
+endchoice
+
 config IP_DCCP_CCID3_RTO
 	int "Use higher bound for nofeedback timer"
 	default 100
-	depends on IP_DCCP_CCID3 && EXPERIMENTAL
 	---help---
 	  Use higher lower bound for nofeedback timer expiration.
 
@@ -106,6 +123,7 @@ config IP_DCCP_CCID3_RTO
 	  The purpose of the nofeedback timer is to slow DCCP down when there
 	  is serious network congestion: experimenting with larger values should
 	  therefore not be performed on WANs.
+endif # IP_DCCP_CCID3
 
 config IP_DCCP_TFRC_LIB
 	tristate
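
The "moving average" choice above is computed with the TFRC library's integer
EWMA helper. As a sketch of the arithmetic behind it (assuming the tfrc_ewma()
definition from net/dccp/ccids/lib/tfrc.h, where the weight is given in
tenths), a weight of 9 lets each new sample contribute one tenth:

	static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
	{
		/* avg' = (weight * avg + (10 - weight) * newval) / 10;
		 * an uninitialised (zero) average is seeded with the sample.
		 */
		return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
	}
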
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9a430734530c..fa713227c66f 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
 /*
  * This implementation should follow RFC 4341
  */
-
+#include "../feat.h"
 #include "../ccid.h"
 #include "../dccp.h"
 #include "ccid2.h"
@@ -34,51 +34,8 @@
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
-{
-	int len = 0;
-	int pipe = 0;
-	struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
-
-	/* there is data in the chain */
-	if (seqp != hctx->ccid2hctx_seqt) {
-		seqp = seqp->ccid2s_prev;
-		len++;
-		if (!seqp->ccid2s_acked)
-			pipe++;
-
-		while (seqp != hctx->ccid2hctx_seqt) {
-			struct ccid2_seq *prev = seqp->ccid2s_prev;
-
-			len++;
-			if (!prev->ccid2s_acked)
-				pipe++;
-
-			/* packets are sent sequentially */
-			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
-						prev->ccid2s_seq) >= 0);
-			BUG_ON(time_before(seqp->ccid2s_sent,
-					   prev->ccid2s_sent));
-
-			seqp = prev;
-		}
-	}
-
-	BUG_ON(pipe != hctx->ccid2hctx_pipe);
-	ccid2_pr_debug("len of chain=%d\n", len);
-
-	do {
-		seqp = seqp->ccid2s_prev;
-		len++;
-	} while (seqp != hctx->ccid2hctx_seqh);
-
-	ccid2_pr_debug("total len=%d\n", len);
-	BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
-}
 #else
 #define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hctx)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -87,8 +44,7 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	int i;
 
 	/* check if we have space to preserve the pointer to the buffer */
-	if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
-					sizeof(struct ccid2_seq *)))
+	if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *))
 		return -ENOMEM;
 
 	/* allocate buffer and initialize linked list */
@@ -104,38 +60,35 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
 
 	/* This is the first allocation.  Initiate the head and tail.  */
-	if (hctx->ccid2hctx_seqbufc == 0)
-		hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
+	if (hctx->seqbufc == 0)
+		hctx->seqh = hctx->seqt = seqp;
 	else {
 		/* link the existing list with the one we just created */
-		hctx->ccid2hctx_seqh->ccid2s_next = seqp;
-		seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
+		hctx->seqh->ccid2s_next = seqp;
+		seqp->ccid2s_prev = hctx->seqh;
 
-		hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
+		hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt;
 	}
 
 	/* store the original pointer to the buffer so we can free it */
-	hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
-	hctx->ccid2hctx_seqbufc++;
+	hctx->seqbuf[hctx->seqbufc] = seqp;
+	hctx->seqbufc++;
 
 	return 0;
 }
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
-		return 0;
-
-	return 1; /* XXX CCID should dequeue when ready instead of polling */
+	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+		return CCID_PACKET_WILL_DEQUEUE_LATER;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
+	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2);
 
 	/*
 	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -147,8 +100,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
 		val = max_ratio;
 	}
-	if (val > 0xFFFF)		/* RFC 4340, 11.3 */
-		val = 0xFFFF;
+	if (val > DCCPF_ACK_RATIO_MAX)
+		val = DCCPF_ACK_RATIO_MAX;
 
 	if (val == dp->dccps_l_ack_ratio)
 		return;
@@ -157,99 +110,77 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
-{
-	ccid2_pr_debug("change SRTT to %ld\n", val);
-	hctx->ccid2hctx_srtt = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	long s;
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
-		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
-			       jiffies + HZ / 5);
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5);
 		goto out;
 	}
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
-	ccid2_hc_tx_check_sanity(hctx);
-
 	/* back-off timer */
-	hctx->ccid2hctx_rto <<= 1;
-
-	s = hctx->ccid2hctx_rto / HZ;
-	if (s > 60)
-		hctx->ccid2hctx_rto = 60 * HZ;
-
-	ccid2_start_rto_timer(sk);
+	hctx->rto <<= 1;
+	if (hctx->rto > DCCP_RTO_MAX)
+		hctx->rto = DCCP_RTO_MAX;
 
 	/* adjust pipe, cwnd etc */
-	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
-	if (hctx->ccid2hctx_ssthresh < 2)
-		hctx->ccid2hctx_ssthresh = 2;
-	hctx->ccid2hctx_cwnd = 1;
-	hctx->ccid2hctx_pipe = 0;
+	hctx->ssthresh = hctx->cwnd / 2;
+	if (hctx->ssthresh < 2)
+		hctx->ssthresh = 2;
+	hctx->cwnd = 1;
+	hctx->pipe = 0;
 
 	/* clear state about stuff we sent */
-	hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
-	hctx->ccid2hctx_packets_acked = 0;
+	hctx->seqt = hctx->seqh;
+	hctx->packets_acked = 0;
 
 	/* clear ack ratio state. */
-	hctx->ccid2hctx_rpseq = 0;
-	hctx->ccid2hctx_rpdupack = -1;
+	hctx->rpseq = 0;
+	hctx->rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-	ccid2_hc_tx_check_sanity(hctx);
+
+	/* if we were blocked before, we may now send cwnd=1 packet */
+	if (sender_was_blocked)
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	/* restart backed-off timer */
+	sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_start_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
-
-	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
-	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
-		       jiffies + hctx->ccid2hctx_rto);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	struct ccid2_seq *next;
 
-	hctx->ccid2hctx_pipe++;
+	hctx->pipe++;
 
-	hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss;
-	hctx->ccid2hctx_seqh->ccid2s_acked = 0;
-	hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
+	hctx->seqh->ccid2s_seq = dp->dccps_gss;
+	hctx->seqh->ccid2s_acked = 0;
+	hctx->seqh->ccid2s_sent = jiffies;
 
-	next = hctx->ccid2hctx_seqh->ccid2s_next;
+	next = hctx->seqh->ccid2s_next;
 	/* check if we need to alloc more space */
-	if (next == hctx->ccid2hctx_seqt) {
+	if (next == hctx->seqt) {
 		if (ccid2_hc_tx_alloc_seq(hctx)) {
 			DCCP_CRIT("packet history - out of memory!");
 			/* FIXME: find a more graceful way to bail out */
 			return;
 		}
-		next = hctx->ccid2hctx_seqh->ccid2s_next;
-		BUG_ON(next == hctx->ccid2hctx_seqt);
+		next = hctx->seqh->ccid2s_next;
+		BUG_ON(next == hctx->seqt);
 	}
-	hctx->ccid2hctx_seqh = next;
+	hctx->seqh = next;
 
-	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
-		       hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe);
 
 	/*
 	 * FIXME: The code below is broken and the variables have been removed
@@ -272,12 +203,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 	 */
 #if 0
 	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
-	hctx->ccid2hctx_arsent++;
+	hctx->arsent++;
 	/* We had an ack loss in this window... */
-	if (hctx->ccid2hctx_ackloss) {
-		if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
-			hctx->ccid2hctx_arsent = 0;
-			hctx->ccid2hctx_ackloss = 0;
+	if (hctx->ackloss) {
+		if (hctx->arsent >= hctx->cwnd) {
+			hctx->arsent = 0;
+			hctx->ackloss = 0;
 		}
 	} else {
 		/* No acks lost up to now... */
@@ -287,28 +218,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 			int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
 				    dp->dccps_l_ack_ratio;
 
-			denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
+			denom = hctx->cwnd * hctx->cwnd / denom;
 
-			if (hctx->ccid2hctx_arsent >= denom) {
+			if (hctx->arsent >= denom) {
 				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
-				hctx->ccid2hctx_arsent = 0;
+				hctx->arsent = 0;
 			}
 		} else {
 			/* we can't increase ack ratio further [1] */
-			hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
+			hctx->arsent = 0; /* or maybe set it to cwnd*/
 		}
 	}
 #endif
 
 	/* setup RTO timer */
-	if (!timer_pending(&hctx->ccid2hctx_rtotimer))
-		ccid2_start_rto_timer(sk);
+	if (!timer_pending(&hctx->rtotimer))
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
-		struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
+		struct ccid2_seq *seqp = hctx->seqt;
 
-		while (seqp != hctx->ccid2hctx_seqh) {
+		while (seqp != hctx->seqh) {
 			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
 				       (unsigned long long)seqp->ccid2s_seq,
 				       seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -316,205 +247,158 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 		}
 	} while (0);
 	ccid2_pr_debug("=========\n");
-	ccid2_hc_tx_check_sanity(hctx);
 #endif
 }
 
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ *   which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
  */
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
-			   unsigned char **vec, unsigned char *veclen)
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
 {
-	const struct dccp_hdr *dh = dccp_hdr(skb);
-	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
-	unsigned char *opt_ptr;
-	const unsigned char *opt_end = (unsigned char *)dh +
-					(dh->dccph_doff * 4);
-	unsigned char opt, len;
-	unsigned char *value;
-
-	BUG_ON(offset < 0);
-	options += offset;
-	opt_ptr = options;
-	if (opt_ptr >= opt_end)
-		return -1;
-
-	while (opt_ptr != opt_end) {
-		opt   = *opt_ptr++;
-		len   = 0;
-		value = NULL;
-
-		/* Check if this isn't a single byte option */
-		if (opt > DCCPO_MAX_RESERVED) {
-			if (opt_ptr == opt_end)
-				goto out_invalid_option;
-
-			len = *opt_ptr++;
-			if (len < 3)
-				goto out_invalid_option;
-			/*
-			 * Remove the type and len fields, leaving
-			 * just the value size
-			 */
-			len	-= 2;
-			value	= opt_ptr;
-			opt_ptr += len;
-
-			if (opt_ptr > opt_end)
-				goto out_invalid_option;
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	long m = mrtt ? : 1;
+
+	if (hctx->srtt == 0) {
+		/* First measurement m */
+		hctx->srtt = m << 3;
+		hctx->mdev = m << 1;
+
+		hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
+		hctx->rttvar   = hctx->mdev_max;
+		hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+	} else {
+		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+		m -= (hctx->srtt >> 3);
+		hctx->srtt += m;
+
+		/* Similarly, update scaled mdev with regard to |m| */
+		if (m < 0) {
+			m = -m;
+			m -= (hctx->mdev >> 2);
+			/*
+			 * This neutralises RTO increase when RTT < SRTT - mdev
+			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (hctx->mdev >> 2);
 		}
+		hctx->mdev += m;
 
-		switch (opt) {
-		case DCCPO_ACK_VECTOR_0:
-		case DCCPO_ACK_VECTOR_1:
-			*vec	= value;
-			*veclen = len;
-			return offset + (opt_ptr - options);
+		if (hctx->mdev > hctx->mdev_max) {
+			hctx->mdev_max = hctx->mdev;
+			if (hctx->mdev_max > hctx->rttvar)
+				hctx->rttvar = hctx->mdev_max;
 		}
+
+		/*
+		 * Decay RTTVAR at most once per flight, exploiting that
+		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
+		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
+		 * GAR is a useful bound for FlightSize = pipe, AWL is probably
+		 * too low as it over-estimates pipe.
+		 */
+		if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
+			if (hctx->mdev_max < hctx->rttvar)
+				hctx->rttvar -= (hctx->rttvar -
+						 hctx->mdev_max) >> 2;
+			hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+			hctx->mdev_max = TCP_RTO_MIN;
+		}
 	}
 
-	return -1;
-
-out_invalid_option:
-	DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
-	return -1;
-}
-
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
-	ccid2_pr_debug("deleted RTO timer\n");
+	/*
+	 * Set RTO from SRTT and RTTVAR
+	 * Clock granularity is ignored since the minimum error for RTTVAR is
+	 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
+	 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
+	 * does not retransmit data, DCCP does not require TCP's recommended
+	 * minimum timeout of one second".
+	 */
+	hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
+
+	if (hctx->rto > DCCP_RTO_MAX)
+		hctx->rto = DCCP_RTO_MAX;
 }
 
-static inline void ccid2_new_ack(struct sock *sk,
-				 struct ccid2_seq *seqp,
-				 unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+			  unsigned int *maxincr)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
-		if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
-			hctx->ccid2hctx_cwnd += 1;
-			*maxincr	     -= 1;
-			hctx->ccid2hctx_packets_acked = 0;
+	if (hctx->cwnd < hctx->ssthresh) {
+		if (*maxincr > 0 && ++hctx->packets_acked == 2) {
+			hctx->cwnd += 1;
+			*maxincr   -= 1;
+			hctx->packets_acked = 0;
 		}
-	} else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
-		hctx->ccid2hctx_cwnd += 1;
-		hctx->ccid2hctx_packets_acked = 0;
-	}
-
-	/* update RTO */
-	if (hctx->ccid2hctx_srtt == -1 ||
-	    time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
-		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
-		int s;
-
-		/* first measurement */
-		if (hctx->ccid2hctx_srtt == -1) {
-			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
-				       r, jiffies,
-				       (unsigned long long)seqp->ccid2s_seq);
-			ccid2_change_srtt(hctx, r);
-			hctx->ccid2hctx_rttvar = r >> 1;
-		} else {
-			/* RTTVAR */
-			long tmp = hctx->ccid2hctx_srtt - r;
-			long srtt;
-
-			if (tmp < 0)
-				tmp *= -1;
-
-			tmp >>= 2;
-			hctx->ccid2hctx_rttvar *= 3;
-			hctx->ccid2hctx_rttvar >>= 2;
-			hctx->ccid2hctx_rttvar += tmp;
-
-			/* SRTT */
-			srtt = hctx->ccid2hctx_srtt;
-			srtt *= 7;
-			srtt >>= 3;
-			tmp = r >> 3;
-			srtt += tmp;
-			ccid2_change_srtt(hctx, srtt);
-		}
-		s = hctx->ccid2hctx_rttvar << 2;
-		/* clock granularity is 1 when based on jiffies */
-		if (!s)
-			s = 1;
-		hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
-
-		/* must be at least a second */
-		s = hctx->ccid2hctx_rto / HZ;
-		/* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
-		if (s < 1)
-			hctx->ccid2hctx_rto = HZ;
-#endif
-		/* max 60 seconds */
-		if (s > 60)
-			hctx->ccid2hctx_rto = HZ * 60;
-
-		hctx->ccid2hctx_lastrtt = jiffies;
-
-		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
-			       hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
-			       hctx->ccid2hctx_rto, HZ, r);
-	}
-
-	/* we got a new ack, so re-start RTO timer */
-	ccid2_hc_tx_kill_rto_timer(sk);
-	ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-	if (hctx->ccid2hctx_pipe == 0)
-		DCCP_BUG("pipe == 0");
-	else
-		hctx->ccid2hctx_pipe--;
-
-	if (hctx->ccid2hctx_pipe == 0)
-		ccid2_hc_tx_kill_rto_timer(sk);
+	} else if (++hctx->packets_acked >= hctx->cwnd) {
+		hctx->cwnd += 1;
+		hctx->packets_acked = 0;
+	}
+	/*
+	 * FIXME: RTT is sampled several times per acknowledgment (for each
+	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+	 * This causes the RTT to be over-estimated, since the older entries
+	 * in the Ack Vector have earlier sending times.
+	 * The cleanest solution is to not use the ccid2s_sent field at all
+	 * and instead use DCCP timestamps - need to be resolved at some time.
+	 */
+	ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
 }
 
 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
+	if (time_before(seqp->ccid2s_sent, hctx->last_cong)) {
 		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
 		return;
 	}
 
-	hctx->ccid2hctx_last_cong = jiffies;
+	hctx->last_cong = jiffies;
 
-	hctx->ccid2hctx_cwnd     = hctx->ccid2hctx_cwnd / 2 ? : 1U;
-	hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
+	hctx->cwnd     = hctx->cwnd / 2 ? : 1U;
+	hctx->ssthresh = max(hctx->cwnd, 2U);
 
 	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
-	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
-		ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
+	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd)
+		ccid2_change_l_ack_ratio(sk, hctx->cwnd);
+}
+
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	switch (option) {
+	case DCCPO_ACK_VECTOR_0:
+	case DCCPO_ACK_VECTOR_1:
+		return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen,
+					      option - DCCPO_ACK_VECTOR_0);
+	}
+	return 0;
 }
 
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
+	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
-	unsigned char *vector;
-	unsigned char veclen;
-	int offset = 0;
 	int done = 0;
 	unsigned int maxincr = 0;
 
-	ccid2_hc_tx_check_sanity(hctx);
 	/* check reverse path congestion */
 	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
 
@@ -523,21 +407,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * -sorbo.
 	 */
 	/* need to bootstrap */
-	if (hctx->ccid2hctx_rpdupack == -1) {
-		hctx->ccid2hctx_rpdupack = 0;
-		hctx->ccid2hctx_rpseq = seqno;
+	if (hctx->rpdupack == -1) {
+		hctx->rpdupack = 0;
+		hctx->rpseq = seqno;
 	} else {
 		/* check if packet is consecutive */
-		if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
-			hctx->ccid2hctx_rpseq = seqno;
+		if (dccp_delta_seqno(hctx->rpseq, seqno) == 1)
+			hctx->rpseq = seqno;
 		/* it's a later packet */
-		else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
-			hctx->ccid2hctx_rpdupack++;
+		else if (after48(seqno, hctx->rpseq)) {
+			hctx->rpdupack++;
 
 			/* check if we got enough dupacks */
-			if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
-				hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
-				hctx->ccid2hctx_rpseq = 0;
+			if (hctx->rpdupack >= NUMDUPACK) {
+				hctx->rpdupack = -1; /* XXX lame */
+				hctx->rpseq = 0;
 
 				ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
 			}
@@ -545,27 +429,22 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
 	/* check forward path congestion */
-	/* still didn't send out new data packets */
-	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
+	if (dccp_packet_without_ack(skb))
 		return;
 
-	switch (DCCP_SKB_CB(skb)->dccpd_type) {
-	case DCCP_PKT_ACK:
-	case DCCP_PKT_DATAACK:
-		break;
-	default:
-		return;
-	}
+	/* still didn't send out new data packets */
+	if (hctx->seqh == hctx->seqt)
+		goto done;
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-	if (after48(ackno, hctx->ccid2hctx_high_ack))
-		hctx->ccid2hctx_high_ack = ackno;
+	if (after48(ackno, hctx->high_ack))
+		hctx->high_ack = ackno;
 
-	seqp = hctx->ccid2hctx_seqt;
+	seqp = hctx->seqt;
 	while (before48(seqp->ccid2s_seq, ackno)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->ccid2hctx_seqh) {
-			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+		if (seqp == hctx->seqh) {
+			seqp = hctx->seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -575,26 +454,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * packets per acknowledgement. Rounding up avoids that cwnd is not
 	 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
 	 */
-	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
+	if (hctx->cwnd < hctx->ssthresh)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
-	while ((offset = ccid2_ackvector(sk, skb, offset,
-					 &vector, &veclen)) != -1) {
+	list_for_each_entry(avp, &hctx->av_chunks, node) {
 		/* go through this ack vector */
-		while (veclen--) {
-			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-			u64 ackno_end_rl = SUB48(ackno, rl);
+		for (; avp->len--; avp->vec++) {
+			u64 ackno_end_rl = SUB48(ackno,
+						 dccp_ackvec_runlen(avp->vec));
 
-			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
 				       (unsigned long long)ackno,
-				       (unsigned long long)ackno_end_rl);
+				       dccp_ackvec_state(avp->vec) >> 6,
+				       dccp_ackvec_runlen(avp->vec));
 			/* if the seqno we are analyzing is larger than the
 			 * current ackno, then move towards the tail of our
 			 * seqnos.
 			 */
 			while (after48(seqp->ccid2s_seq, ackno)) {
-				if (seqp == hctx->ccid2hctx_seqt) {
+				if (seqp == hctx->seqt) {
 					done = 1;
 					break;
 				}
@@ -607,26 +486,24 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = *vector &
-						 DCCP_ACKVEC_STATE_MASK;
+				const u8 state = dccp_ackvec_state(avp->vec);
 
 				/* new packet received or marked */
-				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+				if (state != DCCPAV_NOT_RECEIVED &&
 				    !seqp->ccid2s_acked) {
-					if (state ==
-					    DCCP_ACKVEC_STATE_ECN_MARKED) {
+					if (state == DCCPAV_ECN_MARKED)
 						ccid2_congestion_event(sk,
 								       seqp);
-					} else
+					else
 						ccid2_new_ack(sk, seqp,
 							      &maxincr);
 
 					seqp->ccid2s_acked = 1;
 					ccid2_pr_debug("Got ack for %llu\n",
 						       (unsigned long long)seqp->ccid2s_seq);
-					ccid2_hc_tx_dec_pipe(sk);
+					hctx->pipe--;
 				}
-				if (seqp == hctx->ccid2hctx_seqt) {
+				if (seqp == hctx->seqt) {
 					done = 1;
 					break;
 				}
@@ -636,7 +513,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				break;
 
 			ackno = SUB48(ackno_end_rl, 1);
-			vector++;
 		}
 		if (done)
 			break;
@@ -645,11 +521,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/* The state about what is acked should be correct now
 	 * Check for NUMDUPACK
 	 */
-	seqp = hctx->ccid2hctx_seqt;
-	while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
+	seqp = hctx->seqt;
+	while (before48(seqp->ccid2s_seq, hctx->high_ack)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->ccid2hctx_seqh) {
-			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+		if (seqp == hctx->seqh) {
+			seqp = hctx->seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -660,7 +536,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			if (done == NUMDUPACK)
 				break;
 		}
-		if (seqp == hctx->ccid2hctx_seqt)
+		if (seqp == hctx->seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
@@ -681,25 +557,34 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * one ack vector.
 			 */
 			ccid2_congestion_event(sk, seqp);
-			ccid2_hc_tx_dec_pipe(sk);
+			hctx->pipe--;
 		}
-		if (seqp == hctx->ccid2hctx_seqt)
+		if (seqp == hctx->seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
 
-	hctx->ccid2hctx_seqt = last_acked;
+	hctx->seqt = last_acked;
 	}
 
 	/* trim acked packets in tail */
-	while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
-		if (!hctx->ccid2hctx_seqt->ccid2s_acked)
+	while (hctx->seqt != hctx->seqh) {
+		if (!hctx->seqt->ccid2s_acked)
 			break;
 
-		hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
+		hctx->seqt = hctx->seqt->ccid2s_next;
 	}
 
-	ccid2_hc_tx_check_sanity(hctx);
+	/* restart RTO timer if not all outstanding data has been acked */
+	if (hctx->pipe == 0)
+		sk_stop_timer(sk, &hctx->rtotimer);
+	else
+		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
+done:
+	/* check if incoming Acks allow pending packets to be sent */
+	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -709,17 +594,13 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	u32 max_ratio;
 
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
-	hctx->ccid2hctx_ssthresh = ~0U;
+	hctx->ssthresh = ~0U;
 
-	/*
-	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
-	 * packets for new connections, following the rules from [RFC3390]".
-	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
-	 */
-	hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+	/* Use larger initial windows (RFC 3390, rfc2581bis) */
+	hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
-	max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
+	max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
 	if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
 		dp->dccps_l_ack_ratio = max_ratio;
 
@@ -727,15 +608,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hctx))
 		return -ENOMEM;
 
-	hctx->ccid2hctx_rto = 3 * HZ;
-	ccid2_change_srtt(hctx, -1);
-	hctx->ccid2hctx_rttvar = -1;
-	hctx->ccid2hctx_rpdupack = -1;
-	hctx->ccid2hctx_last_cong = jiffies;
-	setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
-			(unsigned long)sk);
-
-	ccid2_hc_tx_check_sanity(hctx);
+	hctx->rto       = DCCP_TIMEOUT_INIT;
+	hctx->rpdupack  = -1;
+	hctx->last_cong = jiffies;
+	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
+	INIT_LIST_HEAD(&hctx->av_chunks);
 	return 0;
 }
 
@@ -744,11 +621,11 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	int i;
 
-	ccid2_hc_tx_kill_rto_timer(sk);
+	sk_stop_timer(sk, &hctx->rtotimer);
 
-	for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
-		kfree(hctx->ccid2hctx_seqbuf[i]);
-	hctx->ccid2hctx_seqbufc = 0;
+	for (i = 0; i < hctx->seqbufc; i++)
+		kfree(hctx->seqbuf[i]);
+	hctx->seqbufc = 0;
 }
 
 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -759,27 +636,28 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	switch (DCCP_SKB_CB(skb)->dccpd_type) {
 	case DCCP_PKT_DATA:
 	case DCCP_PKT_DATAACK:
-		hcrx->ccid2hcrx_data++;
-		if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
+		hcrx->data++;
+		if (hcrx->data >= dp->dccps_r_ack_ratio) {
 			dccp_send_ack(sk);
-			hcrx->ccid2hcrx_data = 0;
+			hcrx->data = 0;
 		}
 		break;
 	}
 }
 
 static struct ccid_operations ccid2 = {
-	.ccid_id		= DCCPC_CCID2,
-	.ccid_name		= "TCP-like",
-	.ccid_owner		= THIS_MODULE,
-	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
-	.ccid_hc_tx_init	= ccid2_hc_tx_init,
-	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
-	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
-	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
-	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
-	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
-	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
+	.ccid_id		  = DCCPC_CCID2,
+	.ccid_name		  = "TCP-like",
+	.ccid_owner		  = THIS_MODULE,
+	.ccid_hc_tx_obj_size	  = sizeof(struct ccid2_hc_tx_sock),
+	.ccid_hc_tx_init	  = ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	  = ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	  = ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	  = ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+	.ccid_hc_tx_packet_recv	  = ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_obj_size	  = sizeof(struct ccid2_hc_rx_sock),
+	.ccid_hc_rx_packet_recv	  = ccid2_hc_rx_packet_recv,
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
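
The ccid2_rtt_estimator() added above follows RFC 2988 with TCP-style scaled
state: srtt holds 8*SRTT and rttvar holds 4*RTTVAR, so the gains of 1/8 and
1/4 reduce to shifts. A minimal stand-alone sketch of that arithmetic (names
hypothetical; the DCCP_RTO_MAX clamp and per-flight RTTVAR decay are omitted):

	static long rto_rfc2988(long *srtt8, long *rttvar4, long m)
	{
		if (*srtt8 == 0) {                 /* first measurement R   */
			*srtt8   = m << 3;         /* SRTT   <- R           */
			*rttvar4 = m << 1;         /* RTTVAR <- R/2         */
		} else {
			long err = m - (*srtt8 >> 3);      /* m - SRTT      */
			*srtt8 += err;                     /* SRTT += err/8 */
			if (err < 0)
				err = -err;
			/* RTTVAR += (|err| - RTTVAR) / 4 */
			*rttvar4 += err - (*rttvar4 >> 2);
		}
		return (*srtt8 >> 3) + *rttvar4;   /* RTO = SRTT + 4*RTTVAR */
	}
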
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 2c94ca029010..8b7a2dee2f6d 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,34 +42,49 @@ struct ccid2_seq {
 
 /** struct ccid2_hc_tx_sock - CCID2 TX half connection
  *
- * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
- * @ccid2hctx_lastrtt -time RTT was last measured
- * @ccid2hctx_rpseq - last consecutive seqno
- * @ccid2hctx_rpdupack - dupacks since rpseq
-*/
+ * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
+ * @srtt: smoothed RTT estimate, scaled by 2^3
+ * @mdev: smoothed RTT variation, scaled by 2^2
+ * @mdev_max: maximum of @mdev during one flight
+ * @rttvar: moving average/maximum of @mdev_max
+ * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @rtt_seq: to decay RTTVAR at most once per flight
+ * @rpseq: last consecutive seqno
+ * @rpdupack: dupacks since rpseq
+ * @av_chunks: list of Ack Vectors received on current skb
+ */
 struct ccid2_hc_tx_sock {
-	u32			ccid2hctx_cwnd;
-	u32			ccid2hctx_ssthresh;
-	u32			ccid2hctx_pipe;
-	u32			ccid2hctx_packets_acked;
-	struct ccid2_seq	*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
-	int			ccid2hctx_seqbufc;
-	struct ccid2_seq	*ccid2hctx_seqh;
-	struct ccid2_seq	*ccid2hctx_seqt;
-	long			ccid2hctx_rto;
-	long			ccid2hctx_srtt;
-	long			ccid2hctx_rttvar;
-	unsigned long		ccid2hctx_lastrtt;
-	struct timer_list	ccid2hctx_rtotimer;
-	u64			ccid2hctx_rpseq;
-	int			ccid2hctx_rpdupack;
-	unsigned long		ccid2hctx_last_cong;
-	u64			ccid2hctx_high_ack;
+	u32			cwnd;
+	u32			ssthresh;
+	u32			pipe;
+	u32			packets_acked;
+	struct ccid2_seq	*seqbuf[CCID2_SEQBUF_MAX];
+	int			seqbufc;
+	struct ccid2_seq	*seqh;
+	struct ccid2_seq	*seqt;
+	/* RTT measurement: variables/principles are the same as in TCP */
+	u32			srtt,
+				mdev,
+				mdev_max,
+				rttvar,
+				rto;
+	u64			rtt_seq:48;
+	struct timer_list	rtotimer;
+	u64			rpseq;
+	int			rpdupack;
+	unsigned long		last_cong;
+	u64			high_ack;
+	struct list_head	av_chunks;
 };
 
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
+{
+	return (hctx->pipe >= hctx->cwnd);
+}
+
 struct ccid2_hc_rx_sock {
-	int	ccid2hcrx_data;
+	int	data;
 };
 
 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
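
ccid2_hc_tx_init() in the previous file now seeds cwnd with
rfc3390_bytes_to_packets(), whose definition lies outside this diff. A hedged
sketch of the RFC 3390 mapping it is expected to implement, i.e.
min(4*MSS, max(2*MSS, 4380 bytes)) expressed in whole packets:

	static inline int rfc3390_bytes_to_packets(const int smss)
	{
		/* 4 packets for a small MSS, 2 for a large one, otherwise 3 */
		return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
	}
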
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 3b8bd7ca6761..06cfdad84a6a 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,75 +49,41 @@ static int ccid3_debug;
 /*
  * Transmitter Half-Connection Routines
  */
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
-{
-	static char *ccid3_state_names[] = {
-	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
-	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
-	[TFRC_SSTATE_FBACK]    = "FBACK",
-	[TFRC_SSTATE_TERM]     = "TERM",
-	};
-
-	return ccid3_state_names[state];
-}
-#endif
-
-static void ccid3_hc_tx_set_state(struct sock *sk,
-				  enum ccid3_hc_tx_states state)
-{
-	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
-
-	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
-		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
-		       ccid3_tx_state_name(state));
-	WARN_ON(state == oldstate);
-	hctx->ccid3hctx_state = state;
-}
+/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
+static int do_osc_prev = true;
 
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
  *
- *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
+ *	X_init  =  min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
  *
- * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
- * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
  * For consistency with other parts of the code, X_init is scaled by 2^6.
  */
 static inline u64 rfc3390_initial_rate(struct sock *sk)
 {
-	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	const __u32 w_init = clamp_t(__u32, 4380U,
-			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
+	const u32 mps = dccp_sk(sk)->dccps_mss_cache,
+	       w_init = clamp(4380U, 2 * mps, 4 * mps);
 
-	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
+	return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
 }
 
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X
+ * This respects the granularity of X (64 * bytes/second) and enforces the
+ * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
-	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
-	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
-					     hctx->ccid3hctx_x);
-
-	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
-					   TFRC_OPSYS_HALF_TIME_GRAN);
-
-	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
-		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
-		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
-
+	if (unlikely(hctx->x <= hctx->s))
+		hctx->x = hctx->s;
+	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
+	u32 delta = ktime_us_delta(now, hctx->t_last_win_count);
 
-	return delta / hctx->ccid3hctx_rtt;
+	return delta / hctx->rtt;
 }
 
 /**
@@ -133,8 +99,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
-	const __u64 old_x = hctx->ccid3hctx_x;
+	u64 min_rate = 2 * hctx->x_recv;
+	const u64 old_x = hctx->x;
 	ktime_t now = stamp ? *stamp : ktime_get_real();
 
 	/*
@@ -145,50 +111,44 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	 */
 	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
 		min_rate = rfc3390_initial_rate(sk);
-		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
+		min_rate = max(min_rate, 2 * hctx->x_recv);
 	}
 
-	if (hctx->ccid3hctx_p > 0) {
+	if (hctx->p > 0) {
 
-		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
-					min_rate);
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
-					(((__u64)hctx->ccid3hctx_s) << 6) /
-					TFRC_T_MBI);
+		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
 
-	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
-		   - (s64)hctx->ccid3hctx_rtt >= 0) {
+	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
 
-		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
-					scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
-						   hctx->ccid3hctx_rtt));
-		hctx->ccid3hctx_t_ld = now;
+		hctx->x = min(2 * hctx->x, min_rate);
+		hctx->x = max(hctx->x,
+			      scaled_div(((u64)hctx->s) << 6, hctx->rtt));
+		hctx->t_ld = now;
 	}
 
-	if (hctx->ccid3hctx_x != old_x) {
+	if (hctx->x != old_x) {
 		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
 			       "X_recv=%u\n", (unsigned)(old_x >> 6),
-			       (unsigned)(hctx->ccid3hctx_x >> 6),
-			       hctx->ccid3hctx_x_calc,
-			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
+			       (unsigned)(hctx->x >> 6), hctx->x_calc,
+			       (unsigned)(hctx->x_recv >> 6));
 
 		ccid3_update_send_interval(hctx);
 	}
 }
 
 /*
- * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
- * @len: DCCP packet payload size in bytes
+ * ccid3_hc_tx_measure_packet_size  -  Measuring the packet size `s' (sec 4.1)
+ * @new_len: DCCP payload size in bytes (not used by all methods)
  */
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
+static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
 {
-	const u16 old_s = hctx->ccid3hctx_s;
-
-	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
-
-	if (hctx->ccid3hctx_s != old_s)
-		ccid3_update_send_interval(hctx);
+#if   defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
+	return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
+#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
+	return max(ccid3_hc_tx_sk(sk)->s, new_len);
+#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */
+	return dccp_sk(sk)->dccps_mss_cache;
+#endif
 }
 
 /*
@@ -198,13 +158,13 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
 						ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
-	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
+	u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
+	    quarter_rtts = (4 * delta) / hctx->rtt;
 
 	if (quarter_rtts > 0) {
-		hctx->ccid3hctx_t_last_win_count = now;
-		hctx->ccid3hctx_last_win_count  += min(quarter_rtts, 5U);
-		hctx->ccid3hctx_last_win_count	&= 0xF; /* mod 16 */
+		hctx->t_last_win_count = now;
+		hctx->last_win_count  += min(quarter_rtts, 5U);
+		hctx->last_win_count  &= 0xF; /* mod 16 */
 	}
 }
 
@@ -221,25 +181,26 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		goto restart_timer;
 	}
 
-	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
-		       ccid3_tx_state_name(hctx->ccid3hctx_state));
+	ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
+		       hctx->feedback ? "" : "out");
 
-	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
-		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+	/* Ignore and do not restart after leaving the established state */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
 		goto out;
 
+	/* Reset feedback state to "no feedback received" */
+	hctx->feedback = false;
+
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+	 * RTO is 0 if and only if no feedback has been received yet.
 	 */
-	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
-	    hctx->ccid3hctx_p == 0) {
+	if (hctx->t_rto == 0 || hctx->p == 0) {
 
 		/* halve send rate directly */
-		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
-					(((__u64)hctx->ccid3hctx_s) << 6) /
-					TFRC_T_MBI);
+		hctx->x /= 2;
 		ccid3_update_send_interval(hctx);
+
 	} else {
 		/*
 		 * Modify the cached value of X_recv
@@ -251,44 +212,41 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		 *
 		 * Note that X_recv is scaled by 2^6 while X_calc is not
 		 */
-		BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
+		BUG_ON(hctx->p && !hctx->x_calc);
 
-		if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
-			hctx->ccid3hctx_x_recv =
-				max(hctx->ccid3hctx_x_recv / 2,
-				    (((__u64)hctx->ccid3hctx_s) << 6) /
-				    (2 * TFRC_T_MBI));
+		if (hctx->x_calc > (hctx->x_recv >> 5))
+			hctx->x_recv /= 2;
 		else {
-			hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
-			hctx->ccid3hctx_x_recv <<= 4;
+			hctx->x_recv = hctx->x_calc;
+			hctx->x_recv <<= 4;
 		}
 		ccid3_hc_tx_update_x(sk, NULL);
 	}
 	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
-			(unsigned long long)hctx->ccid3hctx_x);
+			(unsigned long long)hctx->x);
 
 	/*
 	 * Set new timeout for the nofeedback timer.
 	 * See comments in packet_recv() regarding the value of t_RTO.
 	 */
-	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
+	if (unlikely(hctx->t_rto == 0))	/* no feedback received yet */
 		t_nfb = TFRC_INITIAL_TIMEOUT;
 	else
-		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
+		t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
 
 restart_timer:
-	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+	sk_reset_timer(sk, &hctx->no_feedback_timer,
 		       jiffies + usecs_to_jiffies(t_nfb));
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-/*
- * returns
- *   > 0: delay (in msecs) that should pass before actually sending
- *   = 0: can send immediately
- *   < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
  */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -305,18 +263,14 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	switch (hctx->ccid3hctx_state) {
-	case TFRC_SSTATE_NO_SENT:
-		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
-			       (jiffies +
+	if (hctx->s == 0) {
+		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
 			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
-		hctx->ccid3hctx_last_win_count	 = 0;
-		hctx->ccid3hctx_t_last_win_count = now;
+		hctx->last_win_count   = 0;
+		hctx->t_last_win_count = now;
 
 		/* Set t_0 for initial packet */
-		hctx->ccid3hctx_t_nom = now;
-
-		hctx->ccid3hctx_s = skb->len;
+		hctx->t_nom = now;
 
 		/*
 		 * Use initial RTT sample when available: recommended by erratum
@@ -325,9 +279,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
325 */ 279 */
326 if (dp->dccps_syn_rtt) { 280 if (dp->dccps_syn_rtt) {
327 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); 281 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
328 hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; 282 hctx->rtt = dp->dccps_syn_rtt;
329 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 283 hctx->x = rfc3390_initial_rate(sk);
330 hctx->ccid3hctx_t_ld = now; 284 hctx->t_ld = now;
331 } else { 285 } else {
332 /* 286 /*
333 * Sender does not have RTT sample: 287 * Sender does not have RTT sample:
@@ -335,17 +289,20 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
335 * is needed in several parts (e.g. window counter); 289 * is needed in several parts (e.g. window counter);
336 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. 290 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
337 */ 291 */
338 hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; 292 hctx->rtt = DCCP_FALLBACK_RTT;
339 hctx->ccid3hctx_x = hctx->ccid3hctx_s; 293 hctx->x = dp->dccps_mss_cache;
340 hctx->ccid3hctx_x <<= 6; 294 hctx->x <<= 6;
341 } 295 }
296
297 /* Compute t_ipi = s / X */
298 hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
342 ccid3_update_send_interval(hctx); 299 ccid3_update_send_interval(hctx);
343 300
344 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 301 /* Seed value for Oscillation Prevention (sec. 4.5) */
345 break; 302 hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);
346 case TFRC_SSTATE_NO_FBACK: 303
347 case TFRC_SSTATE_FBACK: 304 } else {
348 delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); 305 delay = ktime_us_delta(hctx->t_nom, now);
349 ccid3_pr_debug("delay=%ld\n", (long)delay); 306 ccid3_pr_debug("delay=%ld\n", (long)delay);
350 /* 307 /*
351 * Scheduling of packet transmissions [RFC 3448, 4.6] 308 * Scheduling of packet transmissions [RFC 3448, 4.6]
@@ -355,99 +312,80 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
355 * else 312 * else
356 * // send the packet in (t_nom - t_now) milliseconds. 313 * // send the packet in (t_nom - t_now) milliseconds.
357 */ 314 */
358 if (delay - (s64)hctx->ccid3hctx_delta >= 1000) 315 if (delay >= TFRC_T_DELTA)
359 return (u32)delay / 1000L; 316 return (u32)delay / USEC_PER_MSEC;
360 317
361 ccid3_hc_tx_update_win_count(hctx, now); 318 ccid3_hc_tx_update_win_count(hctx, now);
362 break;
363 case TFRC_SSTATE_TERM:
364 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
365 return -EINVAL;
366 } 319 }
367 320
368 /* prepare to send now (add options etc.) */ 321 /* prepare to send now (add options etc.) */
369 dp->dccps_hc_tx_insert_options = 1; 322 dp->dccps_hc_tx_insert_options = 1;
370 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; 323 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count;
371 324
 372 /* set the nominal send time for the following packet */ 325
373 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, 326 hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
374 hctx->ccid3hctx_t_ipi); 327 return CCID_PACKET_SEND_AT_ONCE;
375 return 0;
376} 328}
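
The dequeueing convention above distils to the following userspace sketch: delays below t_delta count as "send now", anything longer is reported back in milliseconds. TFRC_T_DELTA = 1000 usecs is an assumption here, matching the HZ >= 500 case defined in ccid3.h further below:

        #include <stdio.h>
        #include <stdint.h>

        #define TFRC_T_DELTA 1000       /* usecs; assumes the HZ >= 500 case */

        /* Returns 0 to send at once, else the delay in msecs (RFC 3448, 4.6) */
        static long send_decision(int64_t t_nom_us, int64_t now_us)
        {
                int64_t delay = t_nom_us - now_us;

                return delay >= TFRC_T_DELTA ? (long)(delay / 1000) : 0;
        }

        int main(void)
        {
                printf("%ld\n", send_decision(5000, 1000)); /* 4: wait 4 msec */
                printf("%ld\n", send_decision(1500, 1000)); /* 0: within t_delta */
                return 0;
        }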
377 329
378static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 330static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
379 unsigned int len)
380{ 331{
381 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 332 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
382 333
383 ccid3_hc_tx_update_s(hctx, len); 334 /* Changes to s will become effective the next time X is computed */
335 hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
384 336
385 if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) 337 if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
386 DCCP_CRIT("packet history - out of memory!"); 338 DCCP_CRIT("packet history - out of memory!");
387} 339}
388 340
389static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 341static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
390{ 342{
391 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 343 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
392 struct ccid3_options_received *opt_recv; 344 struct tfrc_tx_hist_entry *acked;
393 ktime_t now; 345 ktime_t now;
394 unsigned long t_nfb; 346 unsigned long t_nfb;
395 u32 pinv, r_sample; 347 u32 r_sample;
396 348
397 /* we are only interested in ACKs */ 349 /* we are only interested in ACKs */
398 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 350 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
399 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 351 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
400 return; 352 return;
401 /* ... and only in the established state */ 353 /*
402 if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && 354 * Locate the acknowledged packet in the TX history.
403 hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 355 *
404 return; 356 * Returning "entry not found" here can for instance happen when
405 357 * - the host has not sent out anything (e.g. a passive server),
406 opt_recv = &hctx->ccid3hctx_options_received; 358 * - the Ack is outdated (packet with higher Ack number was received),
407 now = ktime_get_real(); 359 * - it is a bogus Ack (for a packet not sent on this connection).
408 360 */
409 /* Estimate RTT from history if ACK number is valid */ 361 acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb));
410 r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, 362 if (acked == NULL)
411 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
412 if (r_sample == 0) {
413 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
414 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
415 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
416 return; 363 return;
417 } 364 /* For the sake of RTT sampling, ignore/remove all older entries */
365 tfrc_tx_hist_purge(&acked->next);
418 366
419 /* Update receive rate in units of 64 * bytes/second */ 367 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
420 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; 368 now = ktime_get_real();
421 hctx->ccid3hctx_x_recv <<= 6; 369 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
370 hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
422 371
423 /* Update loss event rate (which is scaled by 1e6) */
424 pinv = opt_recv->ccid3or_loss_event_rate;
425 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
426 hctx->ccid3hctx_p = 0;
427 else /* can not exceed 100% */
428 hctx->ccid3hctx_p = scaled_div(1, pinv);
429 /*
430 * Validate new RTT sample and update moving average
431 */
432 r_sample = dccp_sample_rtt(sk, r_sample);
433 hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
434 /* 372 /*
435 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 373 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
436 */ 374 */
437 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { 375 if (!hctx->feedback) {
438 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); 376 hctx->feedback = true;
439 377
440 if (hctx->ccid3hctx_t_rto == 0) { 378 if (hctx->t_rto == 0) {
441 /* 379 /*
442 * Initial feedback packet: Larger Initial Windows (4.2) 380 * Initial feedback packet: Larger Initial Windows (4.2)
443 */ 381 */
444 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 382 hctx->x = rfc3390_initial_rate(sk);
445 hctx->ccid3hctx_t_ld = now; 383 hctx->t_ld = now;
446 384
447 ccid3_update_send_interval(hctx); 385 ccid3_update_send_interval(hctx);
448 386
449 goto done_computing_x; 387 goto done_computing_x;
450 } else if (hctx->ccid3hctx_p == 0) { 388 } else if (hctx->p == 0) {
451 /* 389 /*
452 * First feedback after nofeedback timer expiry (4.3) 390 * First feedback after nofeedback timer expiry (4.3)
453 */ 391 */
@@ -456,25 +394,52 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
456 } 394 }
457 395
458 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 396 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
459 if (hctx->ccid3hctx_p > 0) 397 if (hctx->p > 0)
460 hctx->ccid3hctx_x_calc = 398 hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p);
461 tfrc_calc_x(hctx->ccid3hctx_s,
462 hctx->ccid3hctx_rtt,
463 hctx->ccid3hctx_p);
464 ccid3_hc_tx_update_x(sk, &now); 399 ccid3_hc_tx_update_x(sk, &now);
465 400
466done_computing_x: 401done_computing_x:
467 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 402 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
468 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 403 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
469 dccp_role(sk), 404 dccp_role(sk), sk, hctx->rtt, r_sample,
470 sk, hctx->ccid3hctx_rtt, r_sample, 405 hctx->s, hctx->p, hctx->x_calc,
471 hctx->ccid3hctx_s, hctx->ccid3hctx_p, 406 (unsigned)(hctx->x_recv >> 6),
472 hctx->ccid3hctx_x_calc, 407 (unsigned)(hctx->x >> 6));
473 (unsigned)(hctx->ccid3hctx_x_recv >> 6), 408 /*
474 (unsigned)(hctx->ccid3hctx_x >> 6)); 409 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to
410 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
 411 * can be useful when few connections share a link, as it keeps buffer
 412 * fill levels (and hence the RTT) from oscillating under frequent adjustments to X.
413 * A useful presentation with background information is in
414 * Joerg Widmer, "Equation-Based Congestion Control",
415 * MSc Thesis, University of Mannheim, Germany, 2000
416 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
417 */
418 if (do_osc_prev) {
419 r_sample = tfrc_scaled_sqrt(r_sample);
420 /*
421 * The modulation can work in both ways: increase/decrease t_ipi
422 * according to long-term increases/decreases of the RTT. The
423 * former is a useful measure, since it works against queue
424 * build-up. The latter temporarily increases the sending rate,
425 * so that buffers fill up more quickly. This in turn causes
 426 * the RTT to increase, so that either a later reduction becomes
 427 * necessary or the RTT stays at a very high level. Decreasing
 428 * t_ipi is therefore not supported.
 429 * Furthermore, during the initial slow-start phase the RTT
 430 * naturally increases, and applying the modulation there would
 431 * only add needless delay. Hence it is disabled during slow-start.
432 */
433 if (r_sample > hctx->r_sqmean && hctx->p > 0)
434 hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
435 hctx->r_sqmean);
436 hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
437 /* update R_sqmean _after_ computing the modulation factor */
438 hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
439 }
475 440
476 /* unschedule no feedback timer */ 441 /* unschedule no feedback timer */
477 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 442 sk_stop_timer(sk, &hctx->no_feedback_timer);
478 443
479 /* 444 /*
480 * As we have calculated new ipi, delta, t_nom it is possible 445 * As we have calculated new ipi, delta, t_nom it is possible
@@ -488,95 +453,66 @@ done_computing_x:
488 * This can help avoid triggering the nofeedback timer too 453 * This can help avoid triggering the nofeedback timer too
489 * often ('spinning') on LANs with small RTTs. 454 * often ('spinning') on LANs with small RTTs.
490 */ 455 */
491 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, 456 hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO *
492 (CONFIG_IP_DCCP_CCID3_RTO * 457 (USEC_PER_SEC / 1000)));
493 (USEC_PER_SEC / 1000)));
494 /* 458 /*
495 * Schedule no feedback timer to expire in 459 * Schedule no feedback timer to expire in
496 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 460 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
497 */ 461 */
498 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 462 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
499 463
500 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 464 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
501 "expire in %lu jiffies (%luus)\n", 465 "expire in %lu jiffies (%luus)\n",
502 dccp_role(sk), 466 dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
503 sk, usecs_to_jiffies(t_nfb), t_nfb);
504 467
505 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 468 sk_reset_timer(sk, &hctx->no_feedback_timer,
506 jiffies + usecs_to_jiffies(t_nfb)); 469 jiffies + usecs_to_jiffies(t_nfb));
507} 470}
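
The RTT smoothing and the Oscillation Prevention step of packet_recv() can be illustrated with the sketch below. tfrc_ewma() follows lib/tfrc.h (weight 9 keeps 9/10 of the old average); isqrt() is a plain integer square root used as a stand-in assumption for tfrc_scaled_sqrt(), since any fixed scaling factor cancels in the ratio r_sample/r_sqmean:

        #include <stdio.h>
        #include <stdint.h>

        #define TFRC_T_MBI (64 * 1000000u)      /* t_mbi = 64 seconds, in usecs */

        static uint32_t tfrc_ewma(uint32_t avg, uint32_t newval, uint8_t weight)
        {
                return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
        }

        static uint32_t isqrt(uint32_t x)       /* stand-in for tfrc_scaled_sqrt() */
        {
                uint32_t r = 0;

                while ((uint64_t)(r + 1) * (r + 1) <= x)
                        r++;
                return r;
        }

        int main(void)
        {
                uint32_t t_ipi = 10000, r_sqmean = isqrt(100000); /* RTT 100 msec */
                uint32_t r_sample = isqrt(140000);      /* RTT rose to 140 msec */

                if (r_sample > r_sqmean)        /* stretch t_ipi, never shrink it */
                        t_ipi = (uint32_t)((uint64_t)t_ipi * r_sample / r_sqmean);
                if (t_ipi > TFRC_T_MBI)
                        t_ipi = TFRC_T_MBI;
                r_sqmean = tfrc_ewma(r_sqmean, r_sample, 9); /* update afterwards */

                printf("t_ipi=%u usec, r_sqmean=%u\n", t_ipi, r_sqmean);
                return 0;
        }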
508 471
509static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 472static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
510 unsigned char len, u16 idx, 473 u8 option, u8 *optval, u8 optlen)
511 unsigned char *value)
512{ 474{
513 int rc = 0;
514 const struct dccp_sock *dp = dccp_sk(sk);
515 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 475 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
516 struct ccid3_options_received *opt_recv;
517 __be32 opt_val; 476 __be32 opt_val;
518 477
519 opt_recv = &hctx->ccid3hctx_options_received;
520
521 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
522 opt_recv->ccid3or_seqno = dp->dccps_gsr;
523 opt_recv->ccid3or_loss_event_rate = ~0;
524 opt_recv->ccid3or_loss_intervals_idx = 0;
525 opt_recv->ccid3or_loss_intervals_len = 0;
526 opt_recv->ccid3or_receive_rate = 0;
527 }
528
529 switch (option) { 478 switch (option) {
479 case TFRC_OPT_RECEIVE_RATE:
530 case TFRC_OPT_LOSS_EVENT_RATE: 480 case TFRC_OPT_LOSS_EVENT_RATE:
531 if (unlikely(len != 4)) { 481 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
532 DCCP_WARN("%s(%p), invalid len %d " 482 if (packet_type == DCCP_PKT_DATA)
533 "for TFRC_OPT_LOSS_EVENT_RATE\n", 483 break;
534 dccp_role(sk), sk, len); 484 if (unlikely(optlen != 4)) {
535 rc = -EINVAL; 485 DCCP_WARN("%s(%p), invalid len %d for %u\n",
536 } else { 486 dccp_role(sk), sk, optlen, option);
537 opt_val = get_unaligned((__be32 *)value); 487 return -EINVAL;
538 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
539 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
540 dccp_role(sk), sk,
541 opt_recv->ccid3or_loss_event_rate);
542 } 488 }
543 break; 489 opt_val = ntohl(get_unaligned((__be32 *)optval));
544 case TFRC_OPT_LOSS_INTERVALS: 490
545 opt_recv->ccid3or_loss_intervals_idx = idx; 491 if (option == TFRC_OPT_RECEIVE_RATE) {
546 opt_recv->ccid3or_loss_intervals_len = len; 492 /* Receive Rate is kept in units of 64 bytes/second */
547 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 493 hctx->x_recv = opt_val;
548 dccp_role(sk), sk, 494 hctx->x_recv <<= 6;
549 opt_recv->ccid3or_loss_intervals_idx, 495
550 opt_recv->ccid3or_loss_intervals_len);
551 break;
552 case TFRC_OPT_RECEIVE_RATE:
553 if (unlikely(len != 4)) {
554 DCCP_WARN("%s(%p), invalid len %d "
555 "for TFRC_OPT_RECEIVE_RATE\n",
556 dccp_role(sk), sk, len);
557 rc = -EINVAL;
558 } else {
559 opt_val = get_unaligned((__be32 *)value);
560 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
561 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 496 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
562 dccp_role(sk), sk, 497 dccp_role(sk), sk, opt_val);
563 opt_recv->ccid3or_receive_rate); 498 } else {
499 /* Update the fixpoint Loss Event Rate fraction */
500 hctx->p = tfrc_invert_loss_event_rate(opt_val);
501
502 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
503 dccp_role(sk), sk, opt_val);
564 } 504 }
565 break;
566 } 505 }
567 506 return 0;
568 return rc;
569} 507}
570 508
571static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 509static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
572{ 510{
573 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 511 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
574 512
575 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; 513 hctx->hist = NULL;
576 hctx->ccid3hctx_hist = NULL; 514 setup_timer(&hctx->no_feedback_timer,
577 setup_timer(&hctx->ccid3hctx_no_feedback_timer, 515 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
578 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
579
580 return 0; 516 return 0;
581} 517}
582 518
@@ -584,42 +520,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
584{ 520{
585 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 521 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
586 522
587 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); 523 sk_stop_timer(sk, &hctx->no_feedback_timer);
588 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 524 tfrc_tx_hist_purge(&hctx->hist);
589
590 tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
591} 525}
592 526
593static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 527static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
594{ 528{
595 struct ccid3_hc_tx_sock *hctx; 529 info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto;
596 530 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt;
597 /* Listen socks doesn't have a private CCID block */
598 if (sk->sk_state == DCCP_LISTEN)
599 return;
600
601 hctx = ccid3_hc_tx_sk(sk);
602 info->tcpi_rto = hctx->ccid3hctx_t_rto;
603 info->tcpi_rtt = hctx->ccid3hctx_rtt;
604} 531}
605 532
606static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 533static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
607 u32 __user *optval, int __user *optlen) 534 u32 __user *optval, int __user *optlen)
608{ 535{
609 const struct ccid3_hc_tx_sock *hctx; 536 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
537 struct tfrc_tx_info tfrc;
610 const void *val; 538 const void *val;
611 539
612 /* Listen socks doesn't have a private CCID block */
613 if (sk->sk_state == DCCP_LISTEN)
614 return -EINVAL;
615
616 hctx = ccid3_hc_tx_sk(sk);
617 switch (optname) { 540 switch (optname) {
618 case DCCP_SOCKOPT_CCID_TX_INFO: 541 case DCCP_SOCKOPT_CCID_TX_INFO:
619 if (len < sizeof(hctx->ccid3hctx_tfrc)) 542 if (len < sizeof(tfrc))
620 return -EINVAL; 543 return -EINVAL;
621 len = sizeof(hctx->ccid3hctx_tfrc); 544 tfrc.tfrctx_x = hctx->x;
622 val = &hctx->ccid3hctx_tfrc; 545 tfrc.tfrctx_x_recv = hctx->x_recv;
546 tfrc.tfrctx_x_calc = hctx->x_calc;
547 tfrc.tfrctx_rtt = hctx->rtt;
548 tfrc.tfrctx_p = hctx->p;
549 tfrc.tfrctx_rto = hctx->t_rto;
550 tfrc.tfrctx_ipi = hctx->t_ipi;
551 len = sizeof(tfrc);
552 val = &tfrc;
623 break; 553 break;
624 default: 554 default:
625 return -ENOPROTOOPT; 555 return -ENOPROTOOPT;
@@ -634,112 +564,82 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
634/* 564/*
635 * Receiver Half-Connection Routines 565 * Receiver Half-Connection Routines
636 */ 566 */
637
638/* CCID3 feedback types */
639enum ccid3_fback_type {
640 CCID3_FBACK_NONE = 0,
641 CCID3_FBACK_INITIAL,
642 CCID3_FBACK_PERIODIC,
643 CCID3_FBACK_PARAM_CHANGE
644};
645
646#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
647static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
648{
649 static char *ccid3_rx_state_names[] = {
650 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
651 [TFRC_RSTATE_DATA] = "DATA",
652 [TFRC_RSTATE_TERM] = "TERM",
653 };
654
655 return ccid3_rx_state_names[state];
656}
657#endif
658
659static void ccid3_hc_rx_set_state(struct sock *sk,
660 enum ccid3_hc_rx_states state)
661{
662 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
663 enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
664
665 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
666 dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
667 ccid3_rx_state_name(state));
668 WARN_ON(state == oldstate);
669 hcrx->ccid3hcrx_state = state;
670}
671
672static void ccid3_hc_rx_send_feedback(struct sock *sk, 567static void ccid3_hc_rx_send_feedback(struct sock *sk,
673 const struct sk_buff *skb, 568 const struct sk_buff *skb,
674 enum ccid3_fback_type fbtype) 569 enum ccid3_fback_type fbtype)
675{ 570{
676 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 571 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677 struct dccp_sock *dp = dccp_sk(sk);
678 ktime_t now;
679 s64 delta = 0;
680
681 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
682 return;
683
684 now = ktime_get_real();
685 572
686 switch (fbtype) { 573 switch (fbtype) {
687 case CCID3_FBACK_INITIAL: 574 case CCID3_FBACK_INITIAL:
688 hcrx->ccid3hcrx_x_recv = 0; 575 hcrx->x_recv = 0;
689 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ 576 hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */
690 break; 577 break;
691 case CCID3_FBACK_PARAM_CHANGE: 578 case CCID3_FBACK_PARAM_CHANGE:
579 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
580 /*
581 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
582 * FIXME: in rfc3448bis the receiver returns X_recv=0
583 * here as it normally would in the first feedback packet.
584 * However this is not possible yet, since the code still
585 * uses RFC 3448, i.e.
586 * If (p > 0)
587 * Calculate X_calc using the TCP throughput equation.
588 * X = max(min(X_calc, 2*X_recv), s/t_mbi);
589 * would bring X down to s/t_mbi. That is why we return
590 * X_recv according to rfc3448bis-06 for the moment.
591 */
592 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
593 rtt = tfrc_rx_hist_rtt(&hcrx->hist);
594
595 hcrx->x_recv = scaled_div32(s, 2 * rtt);
596 break;
597 }
692 /* 598 /*
693 * When parameters change (new loss or p > p_prev), we do not 599 * When parameters change (new loss or p > p_prev), we do not
694 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 600 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
695 * need to reuse the previous value of X_recv. However, when 601 * always check whether at least RTT time units were covered.
696 * X_recv was 0 (due to early loss), this would kill X down to
697 * s/t_mbi (i.e. one packet in 64 seconds).
698 * To avoid such drastic reduction, we approximate X_recv as
699 * the number of bytes since last feedback.
700 * This is a safe fallback, since X is bounded above by X_calc.
701 */ 602 */
702 if (hcrx->ccid3hcrx_x_recv > 0) 603 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
703 break; 604 break;
704 /* fall through */
705 case CCID3_FBACK_PERIODIC: 605 case CCID3_FBACK_PERIODIC:
706 delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); 606 /*
707 if (delta <= 0) 607 * Step (2) of rfc3448bis-06, 6.2:
708 DCCP_BUG("delta (%ld) <= 0", (long)delta); 608 * - if no data packets have been received, just restart timer
709 else 609 * - if data packets have been received, re-compute X_recv
710 hcrx->ccid3hcrx_x_recv = 610 */
711 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 611 if (hcrx->hist.bytes_recvd == 0)
612 goto prepare_for_next_time;
613 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
712 break; 614 break;
713 default: 615 default:
714 return; 616 return;
715 } 617 }
716 618
717 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, 619 ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse);
718 hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
719
720 hcrx->ccid3hcrx_tstamp_last_feedback = now;
721 hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
722 hcrx->ccid3hcrx_bytes_recv = 0;
723 620
724 dp->dccps_hc_rx_insert_options = 1; 621 dccp_sk(sk)->dccps_hc_rx_insert_options = 1;
725 dccp_send_ack(sk); 622 dccp_send_ack(sk);
623
624prepare_for_next_time:
625 tfrc_rx_hist_restart_byte_counter(&hcrx->hist);
626 hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
627 hcrx->feedback = fbtype;
726} 628}
727 629
728static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 630static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
729{ 631{
730 const struct ccid3_hc_rx_sock *hcrx; 632 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
731 __be32 x_recv, pinv; 633 __be32 x_recv, pinv;
732 634
733 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 635 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
734 return 0; 636 return 0;
735 637
736 hcrx = ccid3_hc_rx_sk(sk);
737
738 if (dccp_packet_without_ack(skb)) 638 if (dccp_packet_without_ack(skb))
739 return 0; 639 return 0;
740 640
741 x_recv = htonl(hcrx->ccid3hcrx_x_recv); 641 x_recv = htonl(hcrx->x_recv);
742 pinv = htonl(hcrx->ccid3hcrx_pinv); 642 pinv = htonl(hcrx->p_inverse);
743 643
744 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 644 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
745 &pinv, sizeof(pinv)) || 645 &pinv, sizeof(pinv)) ||
@@ -762,171 +662,95 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
762static u32 ccid3_first_li(struct sock *sk) 662static u32 ccid3_first_li(struct sock *sk)
763{ 663{
764 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 664 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
765 u32 x_recv, p, delta; 665 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
666 rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
766 u64 fval; 667 u64 fval;
767 668
768 if (hcrx->ccid3hcrx_rtt == 0) { 669 /*
769 DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 670 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p
770 hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; 671 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
 771 } 672 * is about 20.64%. This yields an interval length of 4.84, rounded up to 5.
673 */
674 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
675 return 5;
772 676
773 delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); 677 x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
774 x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 678 if (x_recv == 0)
775 if (x_recv == 0) { /* would also trigger divide-by-zero */ 679 goto failed;
776 DCCP_WARN("X_recv==0\n");
777 if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
778 DCCP_BUG("stored value of X_recv is zero");
779 return ~0U;
780 }
781 }
782 680
783 fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); 681 fval = scaled_div32(scaled_div(s, rtt), x_recv);
784 fval = scaled_div32(fval, x_recv);
785 p = tfrc_calc_x_reverse_lookup(fval); 682 p = tfrc_calc_x_reverse_lookup(fval);
786 683
787 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 684 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
788 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 685 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
789 686
790 return p == 0 ? ~0U : scaled_div(1, p); 687 if (p > 0)
688 return scaled_div(1, p);
689failed:
690 return UINT_MAX;
791} 691}
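
The 20.64% figure above can be checked against the closed-form TCP throughput equation of RFC 3448, 3.1, assuming the usual TFRC simplifications b = 1 and t_RTO = 4R (the kernel uses a tabulated lookup instead): X = s/(2*R) is equivalent to f(p) = 2, and the resulting interval is ceil(1/p) = 5. Compile with -lm:

        #include <stdio.h>
        #include <math.h>

        int main(void)
        {
                double p = 0.2064;
                /* f(p) = sqrt(2p/3) + 12*sqrt(3p/8)*p*(1 + 32p^2), cf. RFC 3448 */
                double f = sqrt(2 * p / 3) +
                           12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p);

                printf("f(p) = %.3f (target 2), interval = %.0f\n", f, ceil(1 / p));
                return 0;
        }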
792 692
793static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 693static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
794{ 694{
795 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 695 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796 enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
797 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 696 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
798 const bool is_data_packet = dccp_data_packet(skb); 697 const bool is_data_packet = dccp_data_packet(skb);
799 698
800 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
801 if (is_data_packet) {
802 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
803 do_feedback = CCID3_FBACK_INITIAL;
804 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
805 hcrx->ccid3hcrx_s = payload;
806 /*
807 * Not necessary to update ccid3hcrx_bytes_recv here,
808 * since X_recv = 0 for the first feedback packet (cf.
809 * RFC 3448, 6.3) -- gerrit
810 */
811 }
812 goto update_records;
813 }
814
815 if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
816 return; /* done receiving */
817
818 if (is_data_packet) {
819 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
820 /*
821 * Update moving-average of s and the sum of received payload bytes
822 */
823 hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
824 hcrx->ccid3hcrx_bytes_recv += payload;
825 }
826
827 /* 699 /*
828 * Perform loss detection and handle pending losses 700 * Perform loss detection and handle pending losses
829 */ 701 */
830 if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, 702 if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist,
831 skb, ndp, ccid3_first_li, sk)) { 703 skb, ndp, ccid3_first_li, sk))
832 do_feedback = CCID3_FBACK_PARAM_CHANGE; 704 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE);
833 goto done_receiving;
834 }
835
836 if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
837 return; /* done receiving */
838
839 /* 705 /*
840 * Handle data packets: RTT sampling and monitoring p 706 * Feedback for first non-empty data packet (RFC 3448, 6.3)
841 */ 707 */
842 if (unlikely(!is_data_packet)) 708 else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet))
843 goto update_records; 709 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL);
844
845 if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
846 const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
847 /*
848 * Empty loss history: no loss so far, hence p stays 0.
849 * Sample RTT values, since an RTT estimate is required for the
850 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
851 */
852 if (sample != 0)
853 hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
854
855 } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
856 /*
857 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
858 * has decreased (resp. p has increased), send feedback now.
859 */
860 do_feedback = CCID3_FBACK_PARAM_CHANGE;
861 }
862
863 /* 710 /*
864 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 711 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
865 */ 712 */
866 if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) 713 else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet &&
867 do_feedback = CCID3_FBACK_PERIODIC; 714 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
868 715 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC);
869update_records:
870 tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
871
872done_receiving:
873 if (do_feedback)
874 ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
875} 716}
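
A sketch of the once-per-RTT test above: CCVal is a 4-bit window counter that advances every quarter-RTT (RFC 4342, 8.1), so a mod-16 distance greater than 3 means at least one full RTT has passed since the last feedback. SUB16() is assumed to be the mod-16 subtraction from net/dccp/dccp.h:

        #include <stdio.h>

        #define SUB16(a, b) (((a) + 16 - (b)) & 0xF)    /* mod-16 distance */

        int main(void)
        {
                printf("%d\n", SUB16(7, 5) > 3);  /* 0: less than one RTT */
                printf("%d\n", SUB16(2, 13) > 3); /* 1: wrapped, feedback due */
                return 0;
        }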
876 717
877static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 718static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
878{ 719{
879 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 720 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
880 721
881 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; 722 tfrc_lh_init(&hcrx->li_hist);
882 tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); 723 return tfrc_rx_hist_init(&hcrx->hist, sk);
883 return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
884} 724}
885 725
886static void ccid3_hc_rx_exit(struct sock *sk) 726static void ccid3_hc_rx_exit(struct sock *sk)
887{ 727{
888 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 728 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
889 729
890 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); 730 tfrc_rx_hist_purge(&hcrx->hist);
891 731 tfrc_lh_cleanup(&hcrx->li_hist);
892 tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
893 tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
894} 732}
895 733
896static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 734static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
897{ 735{
898 const struct ccid3_hc_rx_sock *hcrx;
899
900 /* Listen socks doesn't have a private CCID block */
901 if (sk->sk_state == DCCP_LISTEN)
902 return;
903
904 hcrx = ccid3_hc_rx_sk(sk);
905 info->tcpi_ca_state = hcrx->ccid3hcrx_state;
906 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 736 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
907 info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; 737 info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist);
908} 738}
909 739
910static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 740static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
911 u32 __user *optval, int __user *optlen) 741 u32 __user *optval, int __user *optlen)
912{ 742{
913 const struct ccid3_hc_rx_sock *hcrx; 743 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
914 struct tfrc_rx_info rx_info; 744 struct tfrc_rx_info rx_info;
915 const void *val; 745 const void *val;
916 746
917 /* Listen socks doesn't have a private CCID block */
918 if (sk->sk_state == DCCP_LISTEN)
919 return -EINVAL;
920
921 hcrx = ccid3_hc_rx_sk(sk);
922 switch (optname) { 747 switch (optname) {
923 case DCCP_SOCKOPT_CCID_RX_INFO: 748 case DCCP_SOCKOPT_CCID_RX_INFO:
924 if (len < sizeof(rx_info)) 749 if (len < sizeof(rx_info))
925 return -EINVAL; 750 return -EINVAL;
926 rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; 751 rx_info.tfrcrx_x_recv = hcrx->x_recv;
927 rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; 752 rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist);
928 rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : 753 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse);
929 scaled_div(1, hcrx->ccid3hcrx_pinv);
930 len = sizeof(rx_info); 754 len = sizeof(rx_info);
931 val = &rx_info; 755 val = &rx_info;
932 break; 756 break;
@@ -962,6 +786,9 @@ static struct ccid_operations ccid3 = {
962 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 786 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
963}; 787};
964 788
789module_param(do_osc_prev, bool, 0644);
790MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
791
965#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 792#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
966module_param(ccid3_debug, bool, 0644); 793module_param(ccid3_debug, bool, 0644);
967MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); 794MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
@@ -969,6 +796,19 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
969 796
970static __init int ccid3_module_init(void) 797static __init int ccid3_module_init(void)
971{ 798{
799 struct timespec tp;
800
801 /*
802 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
803 * correctly and feedback is sent either too early or too late.
804 */
805 hrtimer_get_res(CLOCK_MONOTONIC, &tp);
806 if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
807 printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
808 " resolution - check your clocksource.\n", __func__,
809 tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
810 return -ESOCKTNOSUPPORT;
811 }
972 return ccid_register(&ccid3); 812 return ccid_register(&ccid3);
973} 813}
974module_init(ccid3_module_init); 814module_init(ccid3_module_init);
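
The same resolution test can be reproduced from userspace with clock_getres(); the 10-usec value for DCCP_TIME_RESOLUTION is an illustrative assumption here, not taken from the headers:

        #include <stdio.h>
        #include <time.h>

        #define DCCP_TIME_RESOLUTION 10 /* usecs; illustrative assumption */

        int main(void)
        {
                struct timespec tp;

                if (clock_getres(CLOCK_MONOTONIC, &tp))
                        return 1;
                if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * 1000L)
                        printf("timer too coarse: %ld nsec\n", tp.tv_nsec);
                else
                        printf("resolution ok: %ld nsec\n", tp.tv_nsec);
                return 0;
        }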
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 49ca32bd7e79..af6e1bf937d9 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,11 +47,22 @@
47/* Two seconds as per RFC 3448 4.2 */ 47/* Two seconds as per RFC 3448 4.2 */
48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
49 49
50/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ 50/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */
51#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) 51#define TFRC_T_MBI (64 * USEC_PER_SEC)
52 52
53/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 53/*
54#define TFRC_T_MBI 64 54 * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
55 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
56 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
57 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
58 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
59 */
60#if (HZ >= 500)
61# define TFRC_T_DELTA USEC_PER_MSEC
62#else
63# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
64#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
65#endif
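
As a sketch, the effect of the #if above for a few common CONFIG_HZ values:

        #include <stdio.h>

        static unsigned tfrc_t_delta(unsigned hz)       /* mirrors the #if above */
        {
                return hz >= 500 ? 1000 : 1000000 / (2 * hz);
        }

        int main(void)
        {
                printf("HZ=100:  t_delta = %u usec\n", tfrc_t_delta(100));  /* 5000 */
                printf("HZ=250:  t_delta = %u usec\n", tfrc_t_delta(250));  /* 2000 */
                printf("HZ=1000: t_delta = %u usec\n", tfrc_t_delta(1000)); /* 1000 */
                return 0;
        }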
55 66
56enum ccid3_options { 67enum ccid3_options {
57 TFRC_OPT_LOSS_EVENT_RATE = 192, 68 TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -59,62 +70,43 @@ enum ccid3_options {
59 TFRC_OPT_RECEIVE_RATE = 194, 70 TFRC_OPT_RECEIVE_RATE = 194,
60}; 71};
61 72
62struct ccid3_options_received {
63 u64 ccid3or_seqno:48,
64 ccid3or_loss_intervals_idx:16;
65 u16 ccid3or_loss_intervals_len;
66 u32 ccid3or_loss_event_rate;
67 u32 ccid3or_receive_rate;
68};
69
70/* TFRC sender states */
71enum ccid3_hc_tx_states {
72 TFRC_SSTATE_NO_SENT = 1,
73 TFRC_SSTATE_NO_FBACK,
74 TFRC_SSTATE_FBACK,
75 TFRC_SSTATE_TERM,
76};
77
78/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 73/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
79 * 74 *
80 * @ccid3hctx_x - Current sending rate in 64 * bytes per second 75 * @x - Current sending rate in 64 * bytes per second
81 * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second 76 * @x_recv - Receive rate in 64 * bytes per second
82 * @ccid3hctx_x_calc - Calculated rate in bytes per second 77 * @x_calc - Calculated rate in bytes per second
83 * @ccid3hctx_rtt - Estimate of current round trip time in usecs 78 * @rtt - Estimate of current round trip time in usecs
84 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 79 * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5)
85 * @ccid3hctx_s - Packet size in bytes 80 * @p - Current loss event rate (0-1) scaled by 1000000
86 * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs 81 * @s - Packet size in bytes
87 * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 82 * @t_rto - Nofeedback Timer setting in usecs
88 * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states 83 * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
89 * @ccid3hctx_last_win_count - Last window counter sent 84 * @feedback - Whether feedback has been received or not
90 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet 85 * @last_win_count - Last window counter sent
91 * with last_win_count value sent 86 * @t_last_win_count - Timestamp of earliest packet with
92 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer 87 * last_win_count value sent
93 * @ccid3hctx_t_ld - Time last doubled during slow start 88 * @no_feedback_timer - Handle to no feedback timer
94 * @ccid3hctx_t_nom - Nominal send time of next packet 89 * @t_ld - Time last doubled during slow start
95 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs 90 * @t_nom - Nominal send time of next packet
96 * @ccid3hctx_hist - Packet history 91 * @hist - Packet history
97 * @ccid3hctx_options_received - Parsed set of retrieved options
98 */ 92 */
99struct ccid3_hc_tx_sock { 93struct ccid3_hc_tx_sock {
100 struct tfrc_tx_info ccid3hctx_tfrc; 94 u64 x;
101#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x 95 u64 x_recv;
102#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv 96 u32 x_calc;
103#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc 97 u32 rtt;
104#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt 98 u16 r_sqmean;
105#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p 99 u32 p;
106#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto 100 u32 t_rto;
107#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi 101 u32 t_ipi;
108 u16 ccid3hctx_s; 102 u16 s;
109 enum ccid3_hc_tx_states ccid3hctx_state:8; 103 bool feedback:1;
110 u8 ccid3hctx_last_win_count; 104 u8 last_win_count;
111 ktime_t ccid3hctx_t_last_win_count; 105 ktime_t t_last_win_count;
112 struct timer_list ccid3hctx_no_feedback_timer; 106 struct timer_list no_feedback_timer;
113 ktime_t ccid3hctx_t_ld; 107 ktime_t t_ld;
114 ktime_t ccid3hctx_t_nom; 108 ktime_t t_nom;
115 u32 ccid3hctx_delta; 109 struct tfrc_tx_hist_entry *hist;
116 struct tfrc_tx_hist_entry *ccid3hctx_hist;
117 struct ccid3_options_received ccid3hctx_options_received;
118}; 110};
119 111
120static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 112static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -124,41 +116,32 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
124 return hctx; 116 return hctx;
125} 117}
126 118
127/* TFRC receiver states */ 119
128enum ccid3_hc_rx_states { 120enum ccid3_fback_type {
129 TFRC_RSTATE_NO_DATA = 1, 121 CCID3_FBACK_NONE = 0,
130 TFRC_RSTATE_DATA, 122 CCID3_FBACK_INITIAL,
131 TFRC_RSTATE_TERM = 127, 123 CCID3_FBACK_PERIODIC,
124 CCID3_FBACK_PARAM_CHANGE
132}; 125};
133 126
134/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 127/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
135 * 128 *
136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) 129 * @last_counter - Tracks window counter (RFC 4342, 8.1)
137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) 130 * @feedback - The type of the feedback last sent
138 * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) 131 * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
139 * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) 132 * @tstamp_last_feedback - Time at which last feedback was sent
140 * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states 133 * @hist - Packet history (loss detection + RTT sampling)
141 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes 134 * @li_hist - Loss Interval database
142 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 135 * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
143 * @ccid3hcrx_rtt - Receiver estimate of RTT
144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
146 * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
147 * @ccid3hcrx_li_hist - Loss Interval database
148 * @ccid3hcrx_s - Received packet size in bytes
149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
150 */ 136 */
151struct ccid3_hc_rx_sock { 137struct ccid3_hc_rx_sock {
152 u8 ccid3hcrx_last_counter:4; 138 u8 last_counter:4;
153 enum ccid3_hc_rx_states ccid3hcrx_state:8; 139 enum ccid3_fback_type feedback:4;
154 u32 ccid3hcrx_bytes_recv; 140 u32 x_recv;
155 u32 ccid3hcrx_x_recv; 141 ktime_t tstamp_last_feedback;
156 u32 ccid3hcrx_rtt; 142 struct tfrc_rx_hist hist;
157 ktime_t ccid3hcrx_tstamp_last_feedback; 143 struct tfrc_loss_hist li_hist;
158 struct tfrc_rx_hist ccid3hcrx_hist; 144#define p_inverse li_hist.i_mean
159 struct tfrc_loss_hist ccid3hcrx_li_hist;
160 u16 ccid3hcrx_s;
161#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
162}; 145};
163 146
164static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) 147static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 5b3ce0688c5c..b1ae8f8259e5 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
86 86
87/** 87/**
88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
89 * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev 89 * This updates I_mean as the sequence numbers increase. As a consequence, the
90 * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
91 * decreases, and thus there is no need to send renewed feedback.
90 */ 92 */
91u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 93void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
92{ 94{
93 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 95 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
94 u32 old_i_mean = lh->i_mean;
95 s64 len; 96 s64 len;
96 97
97 if (cur == NULL) /* not initialised */ 98 if (cur == NULL) /* not initialised */
98 return 0; 99 return;
100
101 /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
102 if (!dccp_data_packet(skb))
103 return;
99 104
100 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 105 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
101 106
102 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 107 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
103 return 0; 108 return;
104 109
105 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 110 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
106 /* 111 /*
@@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
114 cur->li_is_closed = 1; 119 cur->li_is_closed = 1;
115 120
116 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 121 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
117 return 0; 122 return;
118 123
119 cur->li_length = len; 124 cur->li_length = len;
120 tfrc_lh_calc_i_mean(lh); 125 tfrc_lh_calc_i_mean(lh);
121
122 return (lh->i_mean < old_i_mean);
123} 126}
124EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
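
Behind the comment above is the weighted average-loss-interval computation of RFC 3448, 5.4, sketched below with made-up interval lengths (i[0] is the open interval I_0): I_mean = max(I_tot0, I_tot1)/W_tot and p = 1/I_mean, so a growing I_0 can only increase I_mean and hence decrease p:

        #include <stdio.h>

        int main(void)
        {
                static const double w[8] = { 1, 1, 1, 1, 0.8, 0.6, 0.4, 0.2 };
                double i[9] = { 30, 10, 12, 8, 15, 9, 11, 10, 13 }; /* i[0] = I_0 */
                double i_tot0 = 0, i_tot1 = 0, w_tot = 0, i_mean;
                int k;

                for (k = 0; k < 8; k++) {
                        i_tot0 += i[k] * w[k];          /* weights I_0 .. I_7 */
                        i_tot1 += i[k + 1] * w[k];      /* weights I_1 .. I_8 */
                        w_tot  += w[k];
                }
                i_mean = (i_tot0 > i_tot1 ? i_tot0 : i_tot1) / w_tot;
                printf("I_mean = %.2f, p = %.4f\n", i_mean, 1 / i_mean);
                return 0;
        }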
125 127
126/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 128/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
127static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, 129static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -138,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
138 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 140 * @sk: Used by @calc_first_li in caller-specific way (subtyping)
139 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 141 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
140 */ 142 */
141int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 143bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
142 u32 (*calc_first_li)(struct sock *), struct sock *sk) 144 u32 (*calc_first_li)(struct sock *), struct sock *sk)
143{ 145{
144 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 146 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
145 147
146 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 148 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
147 return 0; 149 return false;
148 150
149 new = tfrc_lh_demand_next(lh); 151 new = tfrc_lh_demand_next(lh);
150 if (unlikely(new == NULL)) { 152 if (unlikely(new == NULL)) {
151 DCCP_CRIT("Cannot allocate/add loss record."); 153 DCCP_CRIT("Cannot allocate/add loss record.");
152 return 0; 154 return false;
153 } 155 }
154 156
155 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; 157 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -167,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
167 169
168 tfrc_lh_calc_i_mean(lh); 170 tfrc_lh_calc_i_mean(lh);
169 } 171 }
170 return 1; 172 return true;
171} 173}
172EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 174EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
173 175
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 246018a3b269..d08a226db43e 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
67 67
68struct tfrc_rx_hist; 68struct tfrc_rx_hist;
69 69
70extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
71 u32 (*first_li)(struct sock *), struct sock *); 71 u32 (*first_li)(struct sock *), struct sock *);
72extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
74 74
75#endif /* _DCCP_LI_HIST_ */ 75#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 6cc108afdc3b..cce9f03bda3e 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,18 +40,6 @@
40#include "packet_history.h" 40#include "packet_history.h"
41#include "../../dccp.h" 41#include "../../dccp.h"
42 42
43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55/* 43/*
56 * Transmitter History Routines 44 * Transmitter History Routines
57 */ 45 */
@@ -73,15 +61,6 @@ void tfrc_tx_packet_history_exit(void)
73 } 61 }
74} 62}
75 63
76static struct tfrc_tx_hist_entry *
77 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
78{
79 while (head != NULL && head->seqno != seqno)
80 head = head->next;
81
82 return head;
83}
84
85int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 64int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
86{ 65{
87 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 66 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -111,25 +90,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
111} 90}
112EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 91EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
113 92
114u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
115 const ktime_t now)
116{
117 u32 rtt = 0;
118 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
119
120 if (packet != NULL) {
121 rtt = ktime_us_delta(now, packet->stamp);
122 /*
123 * Garbage-collect older (irrelevant) entries:
124 */
125 tfrc_tx_hist_purge(&packet->next);
126 }
127
128 return rtt;
129}
130EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
131
132
133/* 93/*
134 * Receiver History Routines 94 * Receiver History Routines
135 */ 95 */
@@ -191,14 +151,31 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
191} 151}
192EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 152EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
193 153
154
155static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
156{
157 struct tfrc_rx_hist_entry *tmp = h->ring[a];
158
159 h->ring[a] = h->ring[b];
160 h->ring[b] = tmp;
161}
162
194static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 163static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
195{ 164{
196 const u8 idx_a = tfrc_rx_hist_index(h, a), 165 __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a),
197 idx_b = tfrc_rx_hist_index(h, b); 166 tfrc_rx_hist_index(h, b));
198 struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; 167}
199 168
200 h->ring[idx_a] = h->ring[idx_b]; 169/**
201 h->ring[idx_b] = tmp; 170 * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling
171 * This is called after loss detection has finished, when the history entry
172 * with the index of `loss_count' holds the highest-received sequence number.
173 * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
174 */
175static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
176{
177 __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
178 h->loss_count = h->loss_start = 0;
202} 179}
203 180
204/* 181/*
@@ -215,10 +192,8 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 192 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
216 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 193 s1 = DCCP_SKB_CB(skb)->dccpd_seq;
217 194
218 if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ 195 if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */
219 h->loss_count = 1; 196 h->loss_count = 1;
220 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
221 }
222} 197}
223 198
224static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) 199static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
@@ -240,8 +215,7 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
240 215
241 if (dccp_loss_free(s2, s1, n1)) { 216 if (dccp_loss_free(s2, s1, n1)) {
242 /* hole is filled: S0, S2, and S1 are consecutive */ 217 /* hole is filled: S0, S2, and S1 are consecutive */
243 h->loss_count = 0; 218 tfrc_rx_hist_resume_rtt_sampling(h);
244 h->loss_start = tfrc_rx_hist_index(h, 1);
245 } else 219 } else
246 /* gap between S2 and S1: just update loss_prev */ 220 /* gap between S2 and S1: just update loss_prev */
247 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); 221 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -294,8 +268,7 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
294 268
295 if (dccp_loss_free(s1, s2, n2)) { 269 if (dccp_loss_free(s1, s2, n2)) {
296 /* entire hole filled by S0, S3, S1, S2 */ 270 /* entire hole filled by S0, S3, S1, S2 */
297 h->loss_start = tfrc_rx_hist_index(h, 2); 271 tfrc_rx_hist_resume_rtt_sampling(h);
298 h->loss_count = 0;
299 } else { 272 } else {
300 /* gap remains between S1 and S2 */ 273 /* gap remains between S1 and S2 */
301 h->loss_start = tfrc_rx_hist_index(h, 1); 274 h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -339,8 +312,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
339 312
340 if (dccp_loss_free(s2, s3, n3)) { 313 if (dccp_loss_free(s2, s3, n3)) {
341 /* no gap between S2 and S3: entire hole is filled */ 314 /* no gap between S2 and S3: entire hole is filled */
342 h->loss_start = tfrc_rx_hist_index(h, 3); 315 tfrc_rx_hist_resume_rtt_sampling(h);
343 h->loss_count = 0;
344 } else { 316 } else {
345 /* gap between S2 and S3 */ 317 /* gap between S2 and S3 */
346 h->loss_start = tfrc_rx_hist_index(h, 2); 318 h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -354,13 +326,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
354} 326}
355 327
356/** 328/**
357 * tfrc_rx_handle_loss - Loss detection and further processing 329 * tfrc_rx_congestion_event - Loss detection and further processing
358 * @h: The non-empty RX history object 330 * @h: The non-empty RX history object
359 * @lh: Loss Intervals database to update 331 * @lh: Loss Intervals database to update
360 * @skb: Currently received packet 332 * @skb: Currently received packet
361 * @ndp: The NDP count belonging to @skb 333 * @ndp: The NDP count belonging to @skb
362 * @calc_first_li: Caller-dependent computation of first loss interval in @lh 334 * @first_li: Caller-dependent computation of first loss interval in @lh
363 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 335 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
364 * Chooses action according to pending loss, updates LI database when a new 336 * Chooses action according to pending loss, updates LI database when a new
 365 * loss was detected, and does required post-processing. Returns true when 337
 366 * the caller should send feedback, false otherwise. 338
@@ -368,15 +340,20 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
368 * records accordingly, the caller should not perform any more RX history 340 * records accordingly, the caller should not perform any more RX history
369 * operations when loss_count is greater than 0 after calling this function. 341 * operations when loss_count is greater than 0 after calling this function.
370 */ 342 */
371int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 343bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
372 struct tfrc_loss_hist *lh, 344 struct tfrc_loss_hist *lh,
373 struct sk_buff *skb, const u64 ndp, 345 struct sk_buff *skb, const u64 ndp,
374 u32 (*calc_first_li)(struct sock *), struct sock *sk) 346 u32 (*first_li)(struct sock *), struct sock *sk)
375{ 347{
376 int is_new_loss = 0; 348 bool new_event = false;
349
350 if (tfrc_rx_hist_duplicate(h, skb))
351 return 0;
377 352
378 if (h->loss_count == 0) { 353 if (h->loss_count == 0) {
379 __do_track_loss(h, skb, ndp); 354 __do_track_loss(h, skb, ndp);
355 tfrc_rx_hist_sample_rtt(h, skb);
356 tfrc_rx_hist_add_packet(h, skb, ndp);
380 } else if (h->loss_count == 1) { 357 } else if (h->loss_count == 1) {
381 __one_after_loss(h, skb, ndp); 358 __one_after_loss(h, skb, ndp);
382 } else if (h->loss_count != 2) { 359 } else if (h->loss_count != 2) {
@@ -385,34 +362,57 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
385 /* 362 /*
386 * Update Loss Interval database and recycle RX records 363 * Update Loss Interval database and recycle RX records
387 */ 364 */
388 is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); 365 new_event = tfrc_lh_interval_add(lh, h, first_li, sk);
389 __three_after_loss(h); 366 __three_after_loss(h);
390 } 367 }
391 return is_new_loss; 368
369 /*
370 * Update moving-average of `s' and the sum of received payload bytes.
371 */
372 if (dccp_data_packet(skb)) {
373 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
374
375 h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
376 h->bytes_recvd += payload;
377 }
378
379 /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
380 if (!new_event)
381 tfrc_lh_update_i_mean(lh, skb);
382
383 return new_event;
392} 384}
393EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); 385EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event);
394 386
395int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 387/* Compute the receive rate X_recv, measured between feedback intervals */
388u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
396{ 389{
397 int i; 390 u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate;
391 s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
398 392
399 for (i = 0; i <= TFRC_NDUPACK; i++) { 393 WARN_ON(delta <= 0);
400 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 394 /*
401 if (h->ring[i] == NULL) 395 * Ensure that the sampling interval for X_recv is at least one RTT,
402 goto out_free; 396 * by extending the sampling interval backwards in time, over the last
403 } 397 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
398 * To reduce noise (e.g. when the RTT changes often), this is only
399 * done when delta is smaller than RTT/2.
400 */
401 if (last_x_recv > 0 && delta < last_rtt/2) {
402 tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
403 (long)delta, (unsigned)last_rtt);
404 404
405 h->loss_count = h->loss_start = 0; 405 delta = (bytes ? delta : 0) + last_rtt;
406 return 0; 406 bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC);
407 }
407 408
408out_free: 409 if (unlikely(bytes == 0)) {
409 while (i-- != 0) { 410 DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv);
410 kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); 411 return last_x_recv;
411 h->ring[i] = NULL;
412 } 412 }
413 return -ENOBUFS; 413 return scaled_div32(bytes, delta);
414} 414}
415EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); 415EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv);
416 416
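As an illustration, the X_recv arithmetic of tfrc_rx_hist_x_recv() above can be checked in isolation. The sketch below is a userspace stand-in (illustrative names; plain 64-bit divisions replace the kernel's div_u64()/scaled_div32() helpers), not the kernel code itself:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Interval extension per rfc3448bis-06, 6.2: when the time since the
     * last feedback (delta, in microseconds) is shorter than RTT/2, the
     * interval is extended backwards by one RTT, crediting the bytes the
     * previous rate would have delivered during that RTT.
     */
    static uint32_t x_recv_sample(uint64_t bytes, int64_t delta_us,
                                  uint32_t last_x_recv, uint32_t rtt_us)
    {
            if (last_x_recv > 0 && delta_us < rtt_us / 2) {
                    delta_us = (bytes ? delta_us : 0) + rtt_us;
                    bytes   += (uint64_t)last_x_recv * rtt_us / 1000000;
            }
            if (bytes == 0)                 /* keep the old estimate */
                    return last_x_recv;
            return (uint32_t)(bytes * 1000000 / delta_us);  /* bytes/second */
    }

    int main(void)
    {
            /* 1460 B in 20 ms, previous rate 100000 B/s, RTT 100 ms:
             * interval grows to 120 ms, 10000 B credited => 95500 B/s */
            printf("%u B/s\n", (unsigned)x_recv_sample(1460, 20000, 100000, 100000));
            return 0;
    }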
417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) 417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
418{ 418{
@@ -426,73 +426,81 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
426} 426}
427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); 427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
428 428
429/** 429static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
430 * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
431 */
432static inline struct tfrc_rx_hist_entry *
433 tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
434{ 430{
435 return h->ring[0]; 431 int i;
432
433 memset(h, 0, sizeof(*h));
434
435 for (i = 0; i <= TFRC_NDUPACK; i++) {
436 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
437 if (h->ring[i] == NULL) {
438 tfrc_rx_hist_purge(h);
439 return -ENOBUFS;
440 }
441 }
442 return 0;
436} 443}
437 444
438/** 445int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
439 * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
440 */
441static inline struct tfrc_rx_hist_entry *
442 tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
443{ 446{
444 return h->ring[h->rtt_sample_prev]; 447 if (tfrc_rx_hist_alloc(h))
448 return -ENOBUFS;
449 /*
450 * Initialise first entry with GSR to start loss detection as early as
451 * possible. Code using this must not use any other fields. The entry
452 * will be overwritten once the CCID updates its received packets.
453 */
454 tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
455 return 0;
445} 456}
457EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
446 458
447/** 459/**
448 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal 460 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
449 * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able 461 * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss
450 * to compute a sample with given data - calling function should check this. 462 * is pending and uses the following history entries (via rtt_sample_prev):
463 * - h->ring[0] contains the most recent history entry prior to @skb;
464 * - h->ring[1] is an unused `dummy' entry when the CCVal difference is 0;
451 */ 465 */
452u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 466void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
453{ 467{
454 u32 sample = 0, 468 struct tfrc_rx_hist_entry *last = h->ring[0];
455 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, 469 u32 sample, delta_v;
456 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
457
458 if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
459 if (h->rtt_sample_prev == 2) { /* previous candidate stored */
460 sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
461 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
462 if (sample)
463 sample = 4 / sample *
464 ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
465 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
466 else /*
467 * FIXME: This condition is in principle not
468 * possible but occurs when CCID is used for
469 * two-way data traffic. I have tried to trace
470 * it, but the cause does not seem to be here.
471 */
472 DCCP_BUG("please report to dccp@vger.kernel.org"
473 " => prev = %u, last = %u",
474 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
475 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
476 } else if (delta_v < 1) {
477 h->rtt_sample_prev = 1;
478 goto keep_ref_for_next_time;
479 }
480 470
481 } else if (delta_v == 4) /* optimal match */ 471 /*
482 sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); 472 * When not to sample:
483 else { /* suboptimal match */ 473 * - on non-data packets
484 h->rtt_sample_prev = 2; 474 * (RFC 4342, 8.1: CCVal only fully defined for data packets);
485 goto keep_ref_for_next_time; 475 * - when no data packets have been received yet
486 } 476 * (FIXME: using sampled packet size as indicator here);
477 * - as long as there are gaps in the sequence space (pending loss).
478 */
479 if (!dccp_data_packet(skb) || h->packet_size == 0 ||
480 tfrc_rx_hist_loss_pending(h))
481 return;
487 482
488 if (unlikely(sample > DCCP_SANE_RTT_MAX)) { 483 h->rtt_sample_prev = 0; /* reset previous candidate */
489 DCCP_WARN("RTT sample %u too large, using max\n", sample); 484
490 sample = DCCP_SANE_RTT_MAX; 485 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval);
486 if (delta_v == 0) { /* less than RTT/4 difference */
487 h->rtt_sample_prev = 1;
488 return;
491 } 489 }
490 sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
492 491
493 h->rtt_sample_prev = 0; /* use current entry as next reference */ 492 if (delta_v <= 4) /* between RTT/4 and RTT */
494keep_ref_for_next_time: 493 sample = sample * 4 / delta_v;
494 else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
495 /*
496 * Optimisation: the CCVal difference spans more than 1 RTT, yet the
497 * sample is less than the local RTT estimate, which means that
498 * the RTT estimate is too high.
499 * To avoid noise, the update is skipped if the sample is below RTT/2.
500 */
501 return;
495 502
496 return sample; 503 /* Use a lower weight than usual to increase responsiveness */
504 h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
497} 505}
498EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); 506EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
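The window-counter rule implemented by tfrc_rx_hist_sample_rtt() above can be verified by hand: the sender increments CCVal every quarter-RTT (RFC 4342, 8.1), so two packets whose counters differ by delta_v in 1..4 lie delta_v quarter-RTTs apart, and scaling the elapsed time by 4/delta_v yields one full RTT. A self-contained sketch with illustrative names (the kernel additionally accepts delta_v > 4 samples lying between RTT/2 and the current estimate, which is omitted here):

    #include <stdint.h>
    #include <stdio.h>

    /* modulo-16 window-counter difference, as SUB16() in packet_history.h */
    #define SUB16(a, b)     (((a) + 16 - (b)) & 0xF)

    /* RTT sample in microseconds, or 0 when delta_v is unsuitable */
    static uint32_t rtt_from_ccval(uint8_t ccval_new, uint8_t ccval_old,
                                   uint32_t elapsed_us)
    {
            uint8_t delta_v = SUB16(ccval_new, ccval_old);

            if (delta_v == 0 || delta_v > 4)   /* < RTT/4 or > 1 RTT apart */
                    return 0;
            return elapsed_us * 4 / delta_v;   /* scale up to one full RTT */
    }

    int main(void)
    {
            /* counters 2 apart => packets ~RTT/2 apart: 30 ms -> 60 ms RTT */
            printf("%u us\n", (unsigned)rtt_from_ccval(7, 5, 30000));
            return 0;
    }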
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 461cc91cce88..555e65cd73a0 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h" 41#include "tfrc.h"
42 42
43struct tfrc_tx_hist_entry; 43/**
44 * struct tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55static inline struct tfrc_tx_hist_entry *
56 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
57{
58 while (head != NULL && head->seqno != seqno)
59 head = head->next;
60 return head;
61}
44 62
45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 63extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 64extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
49 65
50/* Subtraction a-b modulo-16, respects circular wrap-around */ 66/* Subtraction a-b modulo-16, respects circular wrap-around */
51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) 67#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
@@ -75,12 +91,22 @@ struct tfrc_rx_hist_entry {
75 * @loss_count: Number of entries in circular history 91 * @loss_count: Number of entries in circular history
76 * @loss_start: Movable index (for loss detection) 92 * @loss_start: Movable index (for loss detection)
77 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry 93 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
94 * @rtt_estimate: Receiver RTT estimate
95 * @packet_size: Packet size in bytes (as per RFC 3448, 3.1)
96 * @bytes_recvd: Number of bytes received since @bytes_start
97 * @bytes_start: Start time for counting @bytes_recvd
78 */ 98 */
79struct tfrc_rx_hist { 99struct tfrc_rx_hist {
80 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; 100 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
81 u8 loss_count:2, 101 u8 loss_count:2,
82 loss_start:2; 102 loss_start:2;
103 /* Receiver RTT sampling */
83#define rtt_sample_prev loss_start 104#define rtt_sample_prev loss_start
105 u32 rtt_estimate;
106 /* Receiver sampling of application payload lengths */
107 u32 packet_size,
108 bytes_recvd;
109 ktime_t bytes_start;
84}; 110};
85 111
86/** 112/**
@@ -124,20 +150,50 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
124 return h->loss_count > 0; 150 return h->loss_count > 0;
125} 151}
126 152
153/*
154 * Accessor functions to retrieve parameters sampled by the RX history
155 */
156static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
157{
158 if (h->packet_size == 0) {
159 DCCP_WARN("No sample for s, using fallback\n");
160 return TCP_MIN_RCVMSS;
161 }
162 return h->packet_size;
163
164}
165static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
166{
167 if (h->rtt_estimate == 0) {
168 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
169 return DCCP_FALLBACK_RTT;
170 }
171 return h->rtt_estimate;
172}
173
174static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
175{
176 h->bytes_recvd = 0;
177 h->bytes_start = ktime_get_real();
178}
179
180extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
181
182
127extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, 183extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
128 const struct sk_buff *skb, const u64 ndp); 184 const struct sk_buff *skb, const u64 ndp);
129 185
130extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); 186extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
131 187
132struct tfrc_loss_hist; 188struct tfrc_loss_hist;
133extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 189extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
134 struct tfrc_loss_hist *lh, 190 struct tfrc_loss_hist *lh,
135 struct sk_buff *skb, const u64 ndp, 191 struct sk_buff *skb, const u64 ndp,
136 u32 (*first_li)(struct sock *sk), 192 u32 (*first_li)(struct sock *sk),
137 struct sock *sk); 193 struct sock *sk);
138extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 194extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
139 const struct sk_buff *skb); 195 const struct sk_buff *skb);
140extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); 196extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
141extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); 197extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
142 198
143#endif /* _DCCP_PKT_HIST_ */ 199#endif /* _DCCP_PKT_HIST_ */
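Both @packet_size and @rtt_estimate above are smoothed with tfrc_ewma() (weight 9 for s in tfrc_rx_congestion_event(), the more responsive weight 5 for the RTT). Assuming the helper behaves as its tfrc.h documentation states, with the weight in units of 1/10 and the first sample adopted unchanged, the update is avg' = (weight * avg + (10 - weight) * new) / 10. A self-contained check:

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in mirroring tfrc_ewma(); @weight is damping in units of 1/10 */
    static uint32_t ewma(uint32_t avg, uint32_t newval, uint8_t weight)
    {
            return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
    }

    int main(void)
    {
            uint32_t s = 0;

            s = ewma(s, 1460, 9);   /* first sample is taken as-is: 1460 */
            s = ewma(s, 100, 9);    /* (9 * 1460 + 100) / 10 = 1324      */
            printf("s = %u\n", (unsigned)s);
            return 0;
    }

The high weight makes s move slowly towards new samples, which is why a single small packet barely dents the average.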
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ed9857527acf..ede12f53de5a 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -48,6 +48,21 @@ static inline u32 scaled_div32(u64 a, u64 b)
48} 48}
49 49
50/** 50/**
51 * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1
52 * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
53 * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
54 * clamped RTT samples (dccp_sample_rtt).
55 * It is best used in expressions of the form sqrt(x)/sqrt(y), where the
56 * scaling factor cancels out; to keep such quotients safe, it never returns 0.
57 */
58static inline u16 tfrc_scaled_sqrt(const u32 sample)
59{
60 const unsigned long non_zero_sample = sample ? : 1;
61
62 return int_sqrt(non_zero_sample << 10);
63}
64
65/**
51 * tfrc_ewma - Exponentially weighted moving average 66 * tfrc_ewma - Exponentially weighted moving average
52 * @weight: Weight to be used as damping factor, in units of 1/10 67 * @weight: Weight to be used as damping factor, in units of 1/10
53 */ 68 */
@@ -58,6 +73,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
58 73
59extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 74extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
60extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 75extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
76extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
61 77
62extern int tfrc_tx_packet_history_init(void); 78extern int tfrc_tx_packet_history_init(void);
63extern void tfrc_tx_packet_history_exit(void); 79extern void tfrc_tx_packet_history_exit(void);
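Since each tfrc_scaled_sqrt() result carries the same constant factor (sqrt(x << 10) = sqrt(x) * 2^5), that factor cancels in quotients, which is the sqrt(x)/sqrt(y) use the comment above recommends. A userspace check, with an illustrative stand-in for the kernel's int_sqrt():

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for the kernel's int_sqrt(): floor(sqrt(x)) */
    static unsigned long int_sqrt(unsigned long x)
    {
            unsigned long r = 0;

            while ((r + 1) * (r + 1) <= x)
                    r++;
            return r;
    }

    static uint16_t tfrc_scaled_sqrt(uint32_t sample)
    {
            return int_sqrt((unsigned long)(sample ? sample : 1) << 10);
    }

    int main(void)
    {
            /* prints "64 128": their quotient 0.5 equals sqrt(4/16) */
            printf("%u %u\n", (unsigned)tfrc_scaled_sqrt(4),
                              (unsigned)tfrc_scaled_sqrt(16));
            return 0;
    }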
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 2f20a29cffe4..38239c4d5e14 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,8 +632,16 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
632 632
633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ 633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ 634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
635 DCCP_WARN("Value of p (%d) below resolution. " 635 /*
636 "Substituting %d\n", p, TFRC_SMALLEST_P); 636 * In the congestion-avoidance phase p decays towards 0
637 * when there are no further losses, so this case is
638 * natural. Truncating to p_min = 0.01% means that the
639 * maximum achievable throughput is limited to about
640 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
641 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
642 */
643 tfrc_pr_debug("Value of p (%d) below resolution. "
644 "Substituting %d\n", p, TFRC_SMALLEST_P);
637 index = 0; 645 index = 0;
638 } else /* 0.0001 <= p <= 0.05 */ 646 } else /* 0.0001 <= p <= 0.05 */
639 index = p/TFRC_SMALLEST_P - 1; 647 index = p/TFRC_SMALLEST_P - 1;
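The ceiling quoted in the comment above follows from the TFRC throughput equation X = s / (R * sqrt(2p/3)) with the higher-order loss terms dropped: at p = 0.0001 the factor 1/sqrt(2p/3) is roughly 122.47. A quick floating-point check (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double p = 1e-4, s = 1500, rtt = 0.01;       /* bytes, seconds */
            double x_max = s / (rtt * sqrt(2 * p / 3));  /* bytes/second   */

            /* prints factor ~122.5 and X_calc_max ~147 Mbit/s */
            printf("factor=%.1f X_calc_max=%.0f Mbit/s\n",
                   1 / sqrt(2 * p / 3), x_max * 8 / 1e6);
            return 0;
    }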
@@ -658,7 +666,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
658 result = scaled_div(s, R); 666 result = scaled_div(s, R);
659 return scaled_div32(result, f); 667 return scaled_div32(result, f);
660} 668}
661
662EXPORT_SYMBOL_GPL(tfrc_calc_x); 669EXPORT_SYMBOL_GPL(tfrc_calc_x);
663 670
664/** 671/**
@@ -693,5 +700,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
693 index = tfrc_binsearch(fvalue, 0); 700 index = tfrc_binsearch(fvalue, 0);
694 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 701 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
695} 702}
696
697EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); 703EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
704
705/**
706 * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
707 * When @loss_event_rate is large, there is a chance that p is truncated to 0.
708 * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
709 */
710u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
711{
712 if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
713 return 0;
714 if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
715 return 1000000;
716 return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
717}
718EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
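With the fixed-point convention that 10^6 corresponds to 100%, scaled_div(1, rate) above is simply 10^6 / rate, and TFRC_SMALLEST_P matches the 0.01% resolution discussed in tfrc_calc_x(), i.e. 100 (value inferred from that resolution, not quoted from the header). A self-contained check of all three branches:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TFRC_SMALLEST_P 100U    /* 0.01% when 10^6 == 100% (assumed) */

    static uint32_t invert_loss_event_rate(uint32_t rate)
    {
            uint32_t p;

            if (rate == UINT_MAX)   /* no loss seen: p == 0 (RFC 4342, 8.5) */
                    return 0;
            if (rate == 0)          /* map 1/0 into 100% */
                    return 1000000;
            p = 1000000 / rate;     /* scaled_div(1, rate) */
            return p > TFRC_SMALLEST_P ? p : TFRC_SMALLEST_P;
    }

    int main(void)
    {
            printf("%u\n", (unsigned)invert_loss_event_rate(100));     /* 10000 == 1% */
            printf("%u\n", (unsigned)invert_loss_event_rate(5000000)); /* clamped: 100 */
            return 0;
    }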