author    Gerrit Renker <gerrit@erg.abdn.ac.uk>  2008-09-09 07:27:22 -0400
committer Gerrit Renker <gerrit@erg.abdn.ac.uk>  2008-09-09 07:27:22 -0400
commit    410e27a49bb98bc7fa3ff5fc05cc313817b9f253 (patch)
tree      88bb1fcf84f9ebfa4299c9a8dcd9e6330b358446 /net/dccp/ccids
parent    0a68a20cc3eafa73bb54097c28b921147d7d3685 (diff)
This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp",
as it accidentally contained the wrong set of patches. These will be
submitted separately.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Diffstat (limited to 'net/dccp/ccids')
-rw-r--r--  net/dccp/ccids/Kconfig               |  30
-rw-r--r--  net/dccp/ccids/ccid2.c               | 622
-rw-r--r--  net/dccp/ccids/ccid2.h               |  63
-rw-r--r--  net/dccp/ccids/ccid3.c               | 762
-rw-r--r--  net/dccp/ccids/ccid3.h               | 153
-rw-r--r--  net/dccp/ccids/lib/loss_interval.c   |  30
-rw-r--r--  net/dccp/ccids/lib/loss_interval.h   |   4
-rw-r--r--  net/dccp/ccids/lib/packet_history.c  | 282
-rw-r--r--  net/dccp/ccids/lib/packet_history.h  |  78
-rw-r--r--  net/dccp/ccids/lib/tfrc.h            |  16
-rw-r--r--  net/dccp/ccids/lib/tfrc_equation.c   |  29

11 files changed, 1116 insertions(+), 953 deletions(-)
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index fb168be2cb43..12275943eab8 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,8 +1,10 @@
 menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
 
 config IP_DCCP_CCID2
-	tristate "CCID2 (TCP-Like)"
+	tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
 	def_tristate IP_DCCP
+	select IP_DCCP_ACKVEC
 	---help---
 	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
 	  Multiplicative Decrease (AIMD) congestion control with behavior
@@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG
 	  If in doubt, say N.
 
 config IP_DCCP_CCID3
-	tristate "CCID3 (TCP-Friendly)"
+	tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
 	def_tristate IP_DCCP
 	select IP_DCCP_TFRC_LIB
 	---help---
@@ -62,9 +64,9 @@ config IP_DCCP_CCID3
 
 	  If in doubt, say M.
 
-if IP_DCCP_CCID3
 config IP_DCCP_CCID3_DEBUG
 	bool "CCID3 debugging messages"
+	depends on IP_DCCP_CCID3
 	---help---
 	  Enable CCID3-specific debugging messages.
 
@@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG
 
 	  If in doubt, say N.
 
-choice
-	prompt "Select method for measuring the packet size s"
-	default IP_DCCP_CCID3_MEASURE_S_AS_MPS
-
-config IP_DCCP_CCID3_MEASURE_S_AS_MPS
-	bool "Always use MPS in place of s"
-	---help---
-	  This use is recommended as it is consistent with the initialisation
-	  of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
-config IP_DCCP_CCID3_MEASURE_S_AS_AVG
-	bool "Use moving average"
-	---help---
-	  An alternative way of tracking s, also supported by rfc3448bis.
-	  This used to be the default for CCID-3 in previous kernels.
-config IP_DCCP_CCID3_MEASURE_S_AS_MAX
-	bool "Track the maximum payload length"
-	---help---
-	  An experimental method based on tracking the maximum packet size.
-endchoice
-
 config IP_DCCP_CCID3_RTO
 	int "Use higher bound for nofeedback timer"
 	default 100
+	depends on IP_DCCP_CCID3 && EXPERIMENTAL
 	---help---
 	  Use higher lower bound for nofeedback timer expiration.
 
@@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO
 	  The purpose of the nofeedback timer is to slow DCCP down when there
 	  is serious network congestion: experimenting with larger values should
 	  therefore not be performed on WANs.
-endif # IP_DCCP_CCID3
 
 config IP_DCCP_TFRC_LIB
 	tristate
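For reference, each bool/tristate symbol above surfaces in the C sources as a
CONFIG_* macro, which is how the CCID code below gates its debug paths. A
minimal standalone sketch of that pattern (not part of this patch; the
fprintf stand-in for DCCP_PR_DEBUG is illustrative only):

/* sketch.c - how a Kconfig bool reaches C code (illustrative only) */
#include <stdio.h>

#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
#define ccid2_pr_debug(fmt, ...) fprintf(stderr, "ccid2: " fmt, ##__VA_ARGS__)
#else
#define ccid2_pr_debug(fmt, ...) do {} while (0)	/* compiled out */
#endif

int main(void)
{
	ccid2_pr_debug("cwnd=%d pipe=%d\n", 4, 0);	/* no-op unless enabled */
	return 0;
}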
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index fa713227c66f..9a430734530c 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
 /*
  * This implementation should follow RFC 4341
  */
-#include "../feat.h"
+
 #include "../ccid.h"
 #include "../dccp.h"
 #include "ccid2.h"
@@ -34,8 +34,51 @@
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
+
+static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
+{
+	int len = 0;
+	int pipe = 0;
+	struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
+
+	/* there is data in the chain */
+	if (seqp != hctx->ccid2hctx_seqt) {
+		seqp = seqp->ccid2s_prev;
+		len++;
+		if (!seqp->ccid2s_acked)
+			pipe++;
+
+		while (seqp != hctx->ccid2hctx_seqt) {
+			struct ccid2_seq *prev = seqp->ccid2s_prev;
+
+			len++;
+			if (!prev->ccid2s_acked)
+				pipe++;
+
+			/* packets are sent sequentially */
+			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
+						prev->ccid2s_seq ) >= 0);
+			BUG_ON(time_before(seqp->ccid2s_sent,
+					   prev->ccid2s_sent));
+
+			seqp = prev;
+		}
+	}
+
+	BUG_ON(pipe != hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("len of chain=%d\n", len);
+
+	do {
+		seqp = seqp->ccid2s_prev;
+		len++;
+	} while (seqp != hctx->ccid2hctx_seqh);
+
+	ccid2_pr_debug("total len=%d\n", len);
+	BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
+}
 #else
 #define ccid2_pr_debug(format, a...)
+#define ccid2_hc_tx_check_sanity(hctx)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	int i;
 
 	/* check if we have space to preserve the pointer to the buffer */
-	if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *))
+	if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
+					sizeof(struct ccid2_seq*)))
 		return -ENOMEM;
 
 	/* allocate buffer and initialize linked list */
@@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
 	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
 
 	/* This is the first allocation.  Initiate the head and tail.  */
-	if (hctx->seqbufc == 0)
-		hctx->seqh = hctx->seqt = seqp;
+	if (hctx->ccid2hctx_seqbufc == 0)
+		hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
 	else {
 		/* link the existing list with the one we just created */
-		hctx->seqh->ccid2s_next = seqp;
-		seqp->ccid2s_prev = hctx->seqh;
+		hctx->ccid2hctx_seqh->ccid2s_next = seqp;
+		seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
 
-		hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt;
+		hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
 	}
 
 	/* store the original pointer to the buffer so we can free it */
-	hctx->seqbuf[hctx->seqbufc] = seqp;
-	hctx->seqbufc++;
+	hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
+	hctx->ccid2hctx_seqbufc++;
 
 	return 0;
 }
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
-		return CCID_PACKET_WILL_DEQUEUE_LATER;
-	return CCID_PACKET_SEND_AT_ONCE;
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
+		return 0;
+
+	return 1; /* XXX CCID should dequeue when ready instead of polling */
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2);
+	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
 
 	/*
 	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
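The ccid2_hc_tx_alloc_seq() changes above keep the same data structure on both
sides of the revert: the packet history is a circular doubly-linked list that
grows in fixed-size chunks, each chunk pre-linked as a ring before being
spliced in behind the head. A standalone sketch of that allocation step, with
illustrative names and a smaller chunk size than the kernel's:

/* Sketch of the chunked circular history list (illustration only). */
#include <stdlib.h>

#define SEQBUF_LEN 8	/* kernel uses CCID2_SEQBUF_LEN */

struct seq_node {
	struct seq_node *next, *prev;
};

/* Allocate one chunk and pre-link it as a self-contained ring. */
static struct seq_node *alloc_chunk(void)
{
	struct seq_node *chunk = malloc(SEQBUF_LEN * sizeof(*chunk));
	int i;

	if (chunk == NULL)
		return NULL;
	for (i = 0; i < SEQBUF_LEN - 1; i++) {
		chunk[i].next = &chunk[i + 1];
		chunk[i + 1].prev = &chunk[i];
	}
	chunk[SEQBUF_LEN - 1].next = &chunk[0];	/* close the ring */
	chunk[0].prev = &chunk[SEQBUF_LEN - 1];
	return chunk;
}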
@@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
 		val = max_ratio;
 	}
-	if (val > DCCPF_ACK_RATIO_MAX)
-		val = DCCPF_ACK_RATIO_MAX;
+	if (val > 0xFFFF)		/* RFC 4340, 11.3 */
+		val = 0xFFFF;
 
 	if (val == dp->dccps_l_ack_ratio)
 		return;
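Both bounds applied here are normative: Ack Ratio may not exceed ceil(cwnd/2)
(the "(2)" cited in the comment above), and as a two-byte feature value it may
not exceed 0xFFFF (RFC 4340, 11.3) — the revert merely swaps a named constant
for the literal. The same clamping as a standalone helper, for illustration:

#include <stdint.h>

static uint32_t bound_ack_ratio(uint32_t val, uint32_t cwnd)
{
	uint32_t max_ratio = (cwnd + 1) / 2;	/* DIV_ROUND_UP(cwnd, 2) */

	if (val > max_ratio)
		val = max_ratio;
	if (val > 0xFFFF)			/* RFC 4340, 11.3 */
		val = 0xFFFF;
	return val;
}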
@@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
+static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
+{
+	ccid2_pr_debug("change SRTT to %ld\n", val);
+	hctx->ccid2hctx_srtt = val;
+}
+
+static void ccid2_start_rto_timer(struct sock *sk);
+
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
+	long s;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5);
+		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+			       jiffies + HZ / 5);
 		goto out;
 	}
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
+	ccid2_hc_tx_check_sanity(hctx);
+
 	/* back-off timer */
-	hctx->rto <<= 1;
-	if (hctx->rto > DCCP_RTO_MAX)
-		hctx->rto = DCCP_RTO_MAX;
+	hctx->ccid2hctx_rto <<= 1;
+
+	s = hctx->ccid2hctx_rto / HZ;
+	if (s > 60)
+		hctx->ccid2hctx_rto = 60 * HZ;
+
+	ccid2_start_rto_timer(sk);
 
 	/* adjust pipe, cwnd etc */
-	hctx->ssthresh = hctx->cwnd / 2;
-	if (hctx->ssthresh < 2)
-		hctx->ssthresh = 2;
-	hctx->cwnd = 1;
-	hctx->pipe = 0;
+	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
+	if (hctx->ccid2hctx_ssthresh < 2)
+		hctx->ccid2hctx_ssthresh = 2;
+	hctx->ccid2hctx_cwnd = 1;
+	hctx->ccid2hctx_pipe = 0;
 
 	/* clear state about stuff we sent */
-	hctx->seqt = hctx->seqh;
-	hctx->packets_acked = 0;
+	hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
+	hctx->ccid2hctx_packets_acked = 0;
 
 	/* clear ack ratio state. */
-	hctx->rpseq = 0;
-	hctx->rpdupack = -1;
+	hctx->ccid2hctx_rpseq = 0;
+	hctx->ccid2hctx_rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-
-	/* if we were blocked before, we may now send cwnd=1 packet */
-	if (sender_was_blocked)
-		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-	/* restart backed-off timer */
-	sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
+	ccid2_hc_tx_check_sanity(hctx);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+static void ccid2_start_rto_timer(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
+
+	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
+	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+		       jiffies + hctx->ccid2hctx_rto);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	struct ccid2_seq *next;
 
-	hctx->pipe++;
+	hctx->ccid2hctx_pipe++;
 
-	hctx->seqh->ccid2s_seq   = dp->dccps_gss;
-	hctx->seqh->ccid2s_acked = 0;
-	hctx->seqh->ccid2s_sent  = jiffies;
+	hctx->ccid2hctx_seqh->ccid2s_seq   = dp->dccps_gss;
+	hctx->ccid2hctx_seqh->ccid2s_acked = 0;
+	hctx->ccid2hctx_seqh->ccid2s_sent  = jiffies;
 
-	next = hctx->seqh->ccid2s_next;
+	next = hctx->ccid2hctx_seqh->ccid2s_next;
 	/* check if we need to alloc more space */
-	if (next == hctx->seqt) {
+	if (next == hctx->ccid2hctx_seqt) {
 		if (ccid2_hc_tx_alloc_seq(hctx)) {
 			DCCP_CRIT("packet history - out of memory!");
 			/* FIXME: find a more graceful way to bail out */
 			return;
 		}
-		next = hctx->seqh->ccid2s_next;
-		BUG_ON(next == hctx->seqt);
+		next = hctx->ccid2hctx_seqh->ccid2s_next;
+		BUG_ON(next == hctx->ccid2hctx_seqt);
 	}
-	hctx->seqh = next;
+	hctx->ccid2hctx_seqh = next;
 
-	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe);
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
+		       hctx->ccid2hctx_pipe);
 
 	/*
 	 * FIXME: The code below is broken and the variables have been removed
@@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 	 */
 #if 0
 	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
-	hctx->arsent++;
+	hctx->ccid2hctx_arsent++;
 	/* We had an ack loss in this window... */
-	if (hctx->ackloss) {
-		if (hctx->arsent >= hctx->cwnd) {
-			hctx->arsent  = 0;
-			hctx->ackloss = 0;
+	if (hctx->ccid2hctx_ackloss) {
+		if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
+			hctx->ccid2hctx_arsent	= 0;
+			hctx->ccid2hctx_ackloss	= 0;
 		}
 	} else {
 		/* No acks lost up to now... */
@@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 			int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
 				    dp->dccps_l_ack_ratio;
 
-			denom = hctx->cwnd * hctx->cwnd / denom;
+			denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
 
-			if (hctx->arsent >= denom) {
+			if (hctx->ccid2hctx_arsent >= denom) {
 				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
-				hctx->arsent = 0;
+				hctx->ccid2hctx_arsent = 0;
 			}
 		} else {
 			/* we can't increase ack ratio further [1] */
-			hctx->arsent = 0; /* or maybe set it to cwnd*/
+			hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
 		}
 	}
 #endif
 
 	/* setup RTO timer */
-	if (!timer_pending(&hctx->rtotimer))
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
+	if (!timer_pending(&hctx->ccid2hctx_rtotimer))
+		ccid2_start_rto_timer(sk);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
-		struct ccid2_seq *seqp = hctx->seqt;
+		struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
 
-		while (seqp != hctx->seqh) {
+		while (seqp != hctx->ccid2hctx_seqh) {
 			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
 				       (unsigned long long)seqp->ccid2s_seq,
 				       seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 		}
 	} while (0);
 	ccid2_pr_debug("=========\n");
+	ccid2_hc_tx_check_sanity(hctx);
 #endif
 }
 
-/**
- * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
- * This code is almost identical with TCP's tcp_rtt_estimator(), since
- * - it has a higher sampling frequency (recommended by RFC 1323),
- * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
- * - it is simple (cf. more complex proposals such as Eifel timer or research
- *   which suggests that the gain should be set according to window size),
- * - in tests it was found to work well with CCID2 [gerrit].
- */
-static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+/* XXX Lame code duplication!
+ * returns -1 if none was found.
+ * else returns the next offset to use in the function call.
+ */
+static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
+			   unsigned char **vec, unsigned char *veclen)
 {
-	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	long m = mrtt ? : 1;
-
-	if (hctx->srtt == 0) {
-		/* First measurement m */
-		hctx->srtt = m << 3;
-		hctx->mdev = m << 1;
-
-		hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
-		hctx->rttvar   = hctx->mdev_max;
-		hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
-	} else {
-		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
-		m -= (hctx->srtt >> 3);
-		hctx->srtt += m;
-
-		/* Similarly, update scaled mdev with regard to |m| */
-		if (m < 0) {
-			m = -m;
-			m -= (hctx->mdev >> 2);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+	unsigned char *opt_ptr;
+	const unsigned char *opt_end = (unsigned char *)dh +
+				       (dh->dccph_doff * 4);
+	unsigned char opt, len;
+	unsigned char *value;
+
+	BUG_ON(offset < 0);
+	options += offset;
+	opt_ptr = options;
+	if (opt_ptr >= opt_end)
+		return -1;
+
+	while (opt_ptr != opt_end) {
+		opt   = *opt_ptr++;
+		len   = 0;
+		value = NULL;
+
+		/* Check if this isn't a single byte option */
+		if (opt > DCCPO_MAX_RESERVED) {
+			if (opt_ptr == opt_end)
+				goto out_invalid_option;
+
+			len = *opt_ptr++;
+			if (len < 3)
+				goto out_invalid_option;
 			/*
-			 * This neutralises RTO increase when RTT < SRTT - mdev
-			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
-			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 * Remove the type and len fields, leaving
+			 * just the value size
 			 */
-			if (m > 0)
-				m >>= 3;
-		} else {
-			m -= (hctx->mdev >> 2);
-		}
-		hctx->mdev += m;
+			len	-= 2;
+			value	= opt_ptr;
+			opt_ptr += len;
 
-		if (hctx->mdev > hctx->mdev_max) {
-			hctx->mdev_max = hctx->mdev;
-			if (hctx->mdev_max > hctx->rttvar)
-				hctx->rttvar = hctx->mdev_max;
+			if (opt_ptr > opt_end)
+				goto out_invalid_option;
 		}
 
-		/*
-		 * Decay RTTVAR at most once per flight, exploiting that
-		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
-		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
-		 * GAR is a useful bound for FlightSize = pipe, AWL is probably
-		 * too low as it over-estimates pipe.
-		 */
-		if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
-			if (hctx->mdev_max < hctx->rttvar)
-				hctx->rttvar -= (hctx->rttvar -
-						 hctx->mdev_max) >> 2;
-			hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
-			hctx->mdev_max = TCP_RTO_MIN;
+		switch (opt) {
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			*vec	= value;
+			*veclen = len;
+			return offset + (opt_ptr - options);
 		}
 	}
 
-	/*
-	 * Set RTO from SRTT and RTTVAR
-	 * Clock granularity is ignored since the minimum error for RTTVAR is
-	 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
-	 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
-	 * does not retransmit data, DCCP does not require TCP's recommended
-	 * minimum timeout of one second".
-	 */
-	hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
+	return -1;
 
-	if (hctx->rto > DCCP_RTO_MAX)
-		hctx->rto = DCCP_RTO_MAX;
+out_invalid_option:
+	DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
+	return -1;
 }
 
-static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
-			  unsigned int *maxincr)
+static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (hctx->cwnd < hctx->ssthresh) {
-		if (*maxincr > 0 && ++hctx->packets_acked == 2) {
-			hctx->cwnd += 1;
-			*maxincr   -= 1;
-			hctx->packets_acked = 0;
-		}
-	} else if (++hctx->packets_acked >= hctx->cwnd) {
-			hctx->cwnd += 1;
-			hctx->packets_acked = 0;
-	}
-	/*
-	 * FIXME: RTT is sampled several times per acknowledgment (for each
-	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
-	 * This causes the RTT to be over-estimated, since the older entries
-	 * in the Ack Vector have earlier sending times.
-	 * The cleanest solution is to not use the ccid2s_sent field at all
-	 * and instead use DCCP timestamps - need to be resolved at some time.
-	 */
-	ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
+	sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
+	ccid2_pr_debug("deleted RTO timer\n");
 }
 
-static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+static inline void ccid2_new_ack(struct sock *sk,
+				 struct ccid2_seq *seqp,
+				 unsigned int *maxincr)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	if (time_before(seqp->ccid2s_sent, hctx->last_cong)) {
-		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
-		return;
+	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
+		if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
+			hctx->ccid2hctx_cwnd += 1;
+			*maxincr	     -= 1;
+			hctx->ccid2hctx_packets_acked = 0;
+		}
+	} else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
+			hctx->ccid2hctx_cwnd += 1;
+			hctx->ccid2hctx_packets_acked = 0;
 	}
 
-	hctx->last_cong = jiffies;
+	/* update RTO */
+	if (hctx->ccid2hctx_srtt == -1 ||
+	    time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
+		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
+		int s;
+
+		/* first measurement */
+		if (hctx->ccid2hctx_srtt == -1) {
+			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
+				       r, jiffies,
+				       (unsigned long long)seqp->ccid2s_seq);
+			ccid2_change_srtt(hctx, r);
+			hctx->ccid2hctx_rttvar = r >> 1;
+		} else {
+			/* RTTVAR */
+			long tmp = hctx->ccid2hctx_srtt - r;
+			long srtt;
+
+			if (tmp < 0)
+				tmp *= -1;
+
+			tmp >>= 2;
+			hctx->ccid2hctx_rttvar *= 3;
+			hctx->ccid2hctx_rttvar >>= 2;
+			hctx->ccid2hctx_rttvar += tmp;
+
+			/* SRTT */
+			srtt = hctx->ccid2hctx_srtt;
+			srtt *= 7;
+			srtt >>= 3;
+			tmp = r >> 3;
+			srtt += tmp;
+			ccid2_change_srtt(hctx, srtt);
+		}
+		s = hctx->ccid2hctx_rttvar << 2;
+		/* clock granularity is 1 when based on jiffies */
+		if (!s)
+			s = 1;
+		hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
+
+		/* must be at least a second */
+		s = hctx->ccid2hctx_rto / HZ;
+		/* DCCP doesn't require this [but I like it cuz my code sux] */
+#if 1
+		if (s < 1)
+			hctx->ccid2hctx_rto = HZ;
+#endif
+		/* max 60 seconds */
+		if (s > 60)
+			hctx->ccid2hctx_rto = HZ * 60;
 
-	hctx->cwnd     = hctx->cwnd / 2 ? : 1U;
-	hctx->ssthresh = max(hctx->cwnd, 2U);
+		hctx->ccid2hctx_lastrtt = jiffies;
 
-	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
-	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd)
-		ccid2_change_l_ack_ratio(sk, hctx->cwnd);
+		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
+			       hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
+			       hctx->ccid2hctx_rto, HZ, r);
+	}
+
+	/* we got a new ack, so re-start RTO timer */
+	ccid2_hc_tx_kill_rto_timer(sk);
+	ccid2_start_rto_timer(sk);
 }
 
-static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
-				     u8 option, u8 *optval, u8 optlen)
+static void ccid2_hc_tx_dec_pipe(struct sock *sk)
 {
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
-	switch (option) {
-	case DCCPO_ACK_VECTOR_0:
-	case DCCPO_ACK_VECTOR_1:
-		return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen,
-					      option - DCCPO_ACK_VECTOR_0);
+	if (hctx->ccid2hctx_pipe == 0)
+		DCCP_BUG("pipe == 0");
+	else
+		hctx->ccid2hctx_pipe--;
+
+	if (hctx->ccid2hctx_pipe == 0)
+		ccid2_hc_tx_kill_rto_timer(sk);
+}
+
+static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
+		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
+		return;
 	}
-	return 0;
+
+	hctx->ccid2hctx_last_cong = jiffies;
+
+	hctx->ccid2hctx_cwnd	  = hctx->ccid2hctx_cwnd / 2 ? : 1U;
+	hctx->ccid2hctx_ssthresh  = max(hctx->ccid2hctx_cwnd, 2U);
+
+	/* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
+	if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
+		ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
 }
 
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-	const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
-	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
+	unsigned char *vector;
+	unsigned char veclen;
+	int offset = 0;
 	int done = 0;
 	unsigned int maxincr = 0;
 
+	ccid2_hc_tx_check_sanity(hctx);
 	/* check reverse path congestion */
 	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
 
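The hunk above trades the RFC 2988-style estimator (left column) for the older
jiffies-based one restored in ccid2_new_ack(): rttvar <- 3/4 rttvar +
1/4 |srtt - r|, srtt <- 7/8 srtt + 1/8 r, rto = srtt + 4*rttvar, clamped to
[1 s, 60 s]. A standalone sketch of the restored arithmetic (the HZ value is
an assumption here; srtt == -1 marks "no sample yet", as in the patch):

#define HZ 1000

struct rtt_state {
	long srtt, rttvar, rto;		/* all in jiffies */
};

static void update_rto(struct rtt_state *st, long r)
{
	long s;

	if (st->srtt == -1) {		/* first measurement */
		st->srtt = r;
		st->rttvar = r >> 1;
	} else {
		long err = st->srtt - r;

		if (err < 0)
			err = -err;
		st->rttvar = (3 * st->rttvar) / 4 + (err >> 2);
		st->srtt = (7 * st->srtt) / 8 + (r >> 3);
	}
	s = st->rttvar << 2;
	if (s == 0)			/* clock granularity */
		s = 1;
	st->rto = st->srtt + s;
	if (st->rto < HZ)		/* at least one second */
		st->rto = HZ;
	if (st->rto > 60 * HZ)		/* at most 60 seconds */
		st->rto = 60 * HZ;
}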
@@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * -sorbo.
 	 */
 	/* need to bootstrap */
-	if (hctx->rpdupack == -1) {
-		hctx->rpdupack = 0;
-		hctx->rpseq = seqno;
+	if (hctx->ccid2hctx_rpdupack == -1) {
+		hctx->ccid2hctx_rpdupack = 0;
+		hctx->ccid2hctx_rpseq = seqno;
 	} else {
 		/* check if packet is consecutive */
-		if (dccp_delta_seqno(hctx->rpseq, seqno) == 1)
-			hctx->rpseq = seqno;
+		if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
+			hctx->ccid2hctx_rpseq = seqno;
 		/* it's a later packet */
-		else if (after48(seqno, hctx->rpseq)) {
-			hctx->rpdupack++;
+		else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
+			hctx->ccid2hctx_rpdupack++;
 
 			/* check if we got enough dupacks */
-			if (hctx->rpdupack >= NUMDUPACK) {
-				hctx->rpdupack = -1; /* XXX lame */
-				hctx->rpseq = 0;
+			if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
+				hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
+				hctx->ccid2hctx_rpseq = 0;
 
 				ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
 			}
@@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
 	/* check forward path congestion */
-	if (dccp_packet_without_ack(skb))
+	/* still didn't send out new data packets */
+	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
 		return;
 
-	/* still didn't send out new data packets */
-	if (hctx->seqh == hctx->seqt)
-		goto done;
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:
+	case DCCP_PKT_DATAACK:
+		break;
+	default:
+		return;
+	}
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-	if (after48(ackno, hctx->high_ack))
-		hctx->high_ack = ackno;
+	if (after48(ackno, hctx->ccid2hctx_high_ack))
+		hctx->ccid2hctx_high_ack = ackno;
 
-	seqp = hctx->seqt;
+	seqp = hctx->ccid2hctx_seqt;
 	while (before48(seqp->ccid2s_seq, ackno)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->seqh) {
-			seqp = hctx->seqh->ccid2s_prev;
+		if (seqp == hctx->ccid2hctx_seqh) {
+			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	 * packets per acknowledgement. Rounding up avoids that cwnd is not
 	 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
 	 */
-	if (hctx->cwnd < hctx->ssthresh)
+	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
-	list_for_each_entry(avp, &hctx->av_chunks, node) {
+	while ((offset = ccid2_ackvector(sk, skb, offset,
+					 &vector, &veclen)) != -1) {
 		/* go through this ack vector */
-		for (; avp->len--; avp->vec++) {
-			u64 ackno_end_rl = SUB48(ackno,
-						 dccp_ackvec_runlen(avp->vec));
+		while (veclen--) {
+			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+			u64 ackno_end_rl = SUB48(ackno, rl);
 
-			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
+			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
 				       (unsigned long long)ackno,
-				       dccp_ackvec_state(avp->vec) >> 6,
-				       dccp_ackvec_runlen(avp->vec));
+				       (unsigned long long)ackno_end_rl);
 			/* if the seqno we are analyzing is larger than the
 			 * current ackno, then move towards the tail of our
 			 * seqnos.
 			 */
 			while (after48(seqp->ccid2s_seq, ackno)) {
-				if (seqp == hctx->seqt) {
+				if (seqp == hctx->ccid2hctx_seqt) {
 					done = 1;
 					break;
 				}
@@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = dccp_ackvec_state(avp->vec);
+				const u8 state = *vector &
+						 DCCP_ACKVEC_STATE_MASK;
 
 				/* new packet received or marked */
-				if (state != DCCPAV_NOT_RECEIVED &&
+				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
 				    !seqp->ccid2s_acked) {
-					if (state == DCCPAV_ECN_MARKED)
+					if (state ==
+					    DCCP_ACKVEC_STATE_ECN_MARKED) {
 						ccid2_congestion_event(sk,
 								       seqp);
-					else
+					} else
 						ccid2_new_ack(sk, seqp,
 							      &maxincr);
 
 					seqp->ccid2s_acked = 1;
 					ccid2_pr_debug("Got ack for %llu\n",
 						       (unsigned long long)seqp->ccid2s_seq);
-					hctx->pipe--;
+					ccid2_hc_tx_dec_pipe(sk);
 				}
-				if (seqp == hctx->seqt) {
+				if (seqp == hctx->ccid2hctx_seqt) {
 					done = 1;
 					break;
 				}
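The loop above walks raw Ack Vector bytes: each byte packs a 2-bit receive
state in its top bits and a 6-bit run length below, so a single byte can
acknowledge up to 64 consecutive sequence numbers. A standalone decoding
sketch (the mask and state names mirror, but are not, the kernel's):

#include <stdint.h>

#define AV_STATE_MASK		0xC0	/* top 2 bits */
#define AV_LEN_MASK		0x3F	/* low 6 bits */
#define AV_STATE_RECEIVED	0x00
#define AV_STATE_ECN_MARKED	0x40
#define AV_STATE_NOT_RECEIVED	0xC0

struct av_run {
	uint8_t state;
	uint8_t run_len;	/* cell covers run_len + 1 packets */
};

static struct av_run decode_av_byte(uint8_t cell)
{
	struct av_run r = {
		.state   = cell & AV_STATE_MASK,
		.run_len = cell & AV_LEN_MASK,
	};
	return r;
}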
@@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				break;
 
 			ackno = SUB48(ackno_end_rl, 1);
+			vector++;
 		}
 		if (done)
 			break;
@@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	/* The state about what is acked should be correct now
 	 * Check for NUMDUPACK
 	 */
-	seqp = hctx->seqt;
-	while (before48(seqp->ccid2s_seq, hctx->high_ack)) {
+	seqp = hctx->ccid2hctx_seqt;
+	while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
 		seqp = seqp->ccid2s_next;
-		if (seqp == hctx->seqh) {
-			seqp = hctx->seqh->ccid2s_prev;
+		if (seqp == hctx->ccid2hctx_seqh) {
+			seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
 			break;
 		}
 	}
@@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			if (done == NUMDUPACK)
 				break;
 		}
-		if (seqp == hctx->seqt)
+		if (seqp == hctx->ccid2hctx_seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
@@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * one ack vector.
 			 */
 			ccid2_congestion_event(sk, seqp);
-			hctx->pipe--;
+			ccid2_hc_tx_dec_pipe(sk);
 		}
-		if (seqp == hctx->seqt)
+		if (seqp == hctx->ccid2hctx_seqt)
 			break;
 		seqp = seqp->ccid2s_prev;
 	}
 
-	hctx->seqt = last_acked;
+	hctx->ccid2hctx_seqt = last_acked;
 	}
 
 	/* trim acked packets in tail */
-	while (hctx->seqt != hctx->seqh) {
-		if (!hctx->seqt->ccid2s_acked)
+	while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
+		if (!hctx->ccid2hctx_seqt->ccid2s_acked)
 			break;
 
-		hctx->seqt = hctx->seqt->ccid2s_next;
+		hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
 	}
 
-	/* restart RTO timer if not all outstanding data has been acked */
-	if (hctx->pipe == 0)
-		sk_stop_timer(sk, &hctx->rtotimer);
-	else
-		sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
-done:
-	/* check if incoming Acks allow pending packets to be sent */
-	if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
-		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-	dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
+	ccid2_hc_tx_check_sanity(hctx);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	u32 max_ratio;
 
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
-	hctx->ssthresh = ~0U;
+	hctx->ccid2hctx_ssthresh  = ~0U;
 
-	/* Use larger initial windows (RFC 3390, rfc2581bis) */
-	hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+	/*
+	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
+	 * packets for new connections, following the rules from [RFC3390]".
+	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
+	 */
+	hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
-	max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
+	max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
 	if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
 		dp->dccps_l_ack_ratio = max_ratio;
 
@@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hctx))
 		return -ENOMEM;
 
-	hctx->rto	 = DCCP_TIMEOUT_INIT;
-	hctx->rpdupack	 = -1;
-	hctx->last_cong	 = jiffies;
-	setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
-	INIT_LIST_HEAD(&hctx->av_chunks);
+	hctx->ccid2hctx_rto	  = 3 * HZ;
+	ccid2_change_srtt(hctx, -1);
+	hctx->ccid2hctx_rttvar	  = -1;
+	hctx->ccid2hctx_rpdupack  = -1;
+	hctx->ccid2hctx_last_cong = jiffies;
+	setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
+			(unsigned long)sk);
+
+	ccid2_hc_tx_check_sanity(hctx);
 	return 0;
 }
 
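The initialisation above converts RFC 3390's byte-based initial window into
RFC 4341's packet count: clamp(4380/MSS, 2, 4). For example, MSS 1460 yields
3 packets, MSS 536 yields 4, and MSS 4000 yields 2. The same rule as a
standalone helper, for illustration:

#include <stdint.h>

static uint32_t initial_cwnd_packets(uint32_t mss)
{
	uint32_t cwnd = 4380U / mss;	/* bytes -> whole packets */

	if (cwnd < 2U)
		cwnd = 2U;
	if (cwnd > 4U)
		cwnd = 4U;
	return cwnd;
}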
@@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 	int i;
 
-	sk_stop_timer(sk, &hctx->rtotimer);
+	ccid2_hc_tx_kill_rto_timer(sk);
 
-	for (i = 0; i < hctx->seqbufc; i++)
-		kfree(hctx->seqbuf[i]);
-	hctx->seqbufc = 0;
+	for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
+		kfree(hctx->ccid2hctx_seqbuf[i]);
+	hctx->ccid2hctx_seqbufc = 0;
 }
 
 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	switch (DCCP_SKB_CB(skb)->dccpd_type) {
 	case DCCP_PKT_DATA:
 	case DCCP_PKT_DATAACK:
-		hcrx->data++;
-		if (hcrx->data >= dp->dccps_r_ack_ratio) {
+		hcrx->ccid2hcrx_data++;
+		if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
 			dccp_send_ack(sk);
-			hcrx->data = 0;
+			hcrx->ccid2hcrx_data = 0;
 		}
 		break;
 	}
 }
 
 static struct ccid_operations ccid2 = {
 	.ccid_id		= DCCPC_CCID2,
 	.ccid_name		= "TCP-like",
 	.ccid_owner		= THIS_MODULE,
 	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
 	.ccid_hc_tx_init	= ccid2_hc_tx_init,
 	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
 	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
 	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
-	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
 	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
 	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
 	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 8b7a2dee2f6d..2c94ca029010 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,49 +42,34 @@ struct ccid2_seq {
 
 /** struct ccid2_hc_tx_sock - CCID2 TX half connection
  *
- * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @srtt: smoothed RTT estimate, scaled by 2^3
- * @mdev: smoothed RTT variation, scaled by 2^2
- * @mdev_max: maximum of @mdev during one flight
- * @rttvar: moving average/maximum of @mdev_max
- * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
- * @rtt_seq: to decay RTTVAR at most once per flight
- * @rpseq: last consecutive seqno
- * @rpdupack: dupacks since rpseq
- * @av_chunks: list of Ack Vectors received on current skb
- */
+ * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
+ * @ccid2hctx_lastrtt -time RTT was last measured
+ * @ccid2hctx_rpseq - last consecutive seqno
+ * @ccid2hctx_rpdupack - dupacks since rpseq
+*/
 struct ccid2_hc_tx_sock {
-	u32			cwnd;
-	u32			ssthresh;
-	u32			pipe;
-	u32			packets_acked;
-	struct ccid2_seq	*seqbuf[CCID2_SEQBUF_MAX];
-	int			seqbufc;
-	struct ccid2_seq	*seqh;
-	struct ccid2_seq	*seqt;
-	/* RTT measurement: variables/principles are the same as in TCP */
-	u32			srtt,
-				mdev,
-				mdev_max,
-				rttvar,
-				rto;
-	u64			rtt_seq:48;
-	struct timer_list	rtotimer;
-	u64			rpseq;
-	int			rpdupack;
-	unsigned long		last_cong;
-	u64			high_ack;
-	struct list_head	av_chunks;
+	u32			ccid2hctx_cwnd;
+	u32			ccid2hctx_ssthresh;
+	u32			ccid2hctx_pipe;
+	u32			ccid2hctx_packets_acked;
+	struct ccid2_seq	*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
+	int			ccid2hctx_seqbufc;
+	struct ccid2_seq	*ccid2hctx_seqh;
+	struct ccid2_seq	*ccid2hctx_seqt;
+	long			ccid2hctx_rto;
+	long			ccid2hctx_srtt;
+	long			ccid2hctx_rttvar;
+	unsigned long		ccid2hctx_lastrtt;
+	struct timer_list	ccid2hctx_rtotimer;
+	u64			ccid2hctx_rpseq;
+	int			ccid2hctx_rpdupack;
+	unsigned long		ccid2hctx_last_cong;
+	u64			ccid2hctx_high_ack;
 };
 
-static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
-{
-	return (hctx->pipe >= hctx->cwnd);
-}
-
 struct ccid2_hc_rx_sock {
-	int	data;
+	int	ccid2hcrx_data;
 };
 
 static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
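Note that the removed ccid2_cwnd_network_limited() helper and the restored
ccid2_hc_tx_send_packet() test encode the same invariant: transmission is
allowed only while pipe (packets in flight) is below cwnd. A standalone
sketch of that gate, for illustration:

#include <stdbool.h>
#include <stdint.h>

struct tx_state {
	uint32_t cwnd;	/* congestion window, in packets */
	uint32_t pipe;	/* packets sent but not yet acked */
};

static bool can_send_now(const struct tx_state *tx)
{
	return tx->pipe < tx->cwnd;	/* else: wait for acks or RTO */
}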
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 06cfdad84a6a..3b8bd7ca6761 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,41 +49,75 @@ static int ccid3_debug;
 /*
  * Transmitter Half-Connection Routines
  */
-/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
-static int do_osc_prev = true;
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static char *ccid3_state_names[] = {
+	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+	[TFRC_SSTATE_FBACK]    = "FBACK",
+	[TFRC_SSTATE_TERM]     = "TERM",
+	};
+
+	return ccid3_state_names[state];
+}
+#endif
+
+static void ccid3_hc_tx_set_state(struct sock *sk,
+				  enum ccid3_hc_tx_states state)
+{
+	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+		       ccid3_tx_state_name(state));
+	WARN_ON(state == oldstate);
+	hctx->ccid3hctx_state = state;
+}
 
 /*
  * Compute the initial sending rate X_init in the manner of RFC 3390:
  *
- *	X_init  =  min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
+ *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
  *
+ * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
+ * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
  * For consistency with other parts of the code, X_init is scaled by 2^6.
  */
 static inline u64 rfc3390_initial_rate(struct sock *sk)
 {
-	const u32 mps = dccp_sk(sk)->dccps_mss_cache,
-	       w_init = clamp(4380U, 2 * mps, 4 * mps);
+	const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+	const __u32 w_init = clamp_t(__u32, 4380U,
+			2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
 
-	return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
+	return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
 }
 
-/**
- * ccid3_update_send_interval  -  Calculate new t_ipi = s / X
- * This respects the granularity of X (64 * bytes/second) and enforces the
- * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
+/*
+ * Recalculate t_ipi and delta (should be called whenever X changes)
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
 {
-	if (unlikely(hctx->x <= hctx->s))
-		hctx->x = hctx->s;
-	hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
+	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
+	hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
+					     hctx->ccid3hctx_x);
+
+	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
+					   TFRC_OPSYS_HALF_TIME_GRAN);
+
+	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
+		       hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
+		       hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
+
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->t_last_win_count);
+	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
 
-	return delta / hctx->rtt;
+	return delta / hctx->ccid3hctx_rtt;
 }
 
 /**
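Both columns above compute the same RFC 3390 rate, differing only in whether
MPS or the tracked mean packet size s feeds the formula: X_init =
min(4s, max(2s, 4380 bytes)) / RTT, scaled by 2^6. A standalone sketch,
assuming scaled_div() multiplies by 10^6 to convert the microsecond RTT into
a per-second rate (rtt_us must be non-zero):

#include <stdint.h>

static uint64_t rfc3390_x_init(uint32_t s, uint32_t rtt_us)
{
	uint32_t w_init = 4380U;

	/* clamp 4380 bytes into [2*s, 4*s] */
	if (w_init < 2 * s)
		w_init = 2 * s;
	if (w_init > 4 * s)
		w_init = 4 * s;

	/* result in (bytes/second) << 6 fixed point */
	return (((uint64_t)w_init << 6) * 1000000u) / rtt_us;
}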
@@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
 static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 {
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-	u64 min_rate = 2 * hctx->x_recv;
-	const u64 old_x = hctx->x;
+	__u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
+	const __u64 old_x = hctx->ccid3hctx_x;
 	ktime_t now = stamp ? *stamp : ktime_get_real();
 
 	/*
@@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	 */
 	if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
 		min_rate = rfc3390_initial_rate(sk);
-		min_rate = max(min_rate, 2 * hctx->x_recv);
+		min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
 	}
 
-	if (hctx->p > 0) {
+	if (hctx->ccid3hctx_p > 0) {
 
-		hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
+		hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
+					min_rate);
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
+					(((__u64)hctx->ccid3hctx_s) << 6) /
+								TFRC_T_MBI);
 
-	} else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
+	} else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
+				- (s64)hctx->ccid3hctx_rtt >= 0) {
 
-		hctx->x = min(2 * hctx->x, min_rate);
-		hctx->x = max(hctx->x,
-			      scaled_div(((u64)hctx->s) << 6, hctx->rtt));
-		hctx->t_ld = now;
+		hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
+			scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
+				   hctx->ccid3hctx_rtt));
+		hctx->ccid3hctx_t_ld = now;
 	}
 
-	if (hctx->x != old_x) {
+	if (hctx->ccid3hctx_x != old_x) {
 		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
 			       "X_recv=%u\n", (unsigned)(old_x >> 6),
-			       (unsigned)(hctx->x >> 6), hctx->x_calc,
-			       (unsigned)(hctx->x_recv >> 6));
+			       (unsigned)(hctx->ccid3hctx_x >> 6),
+			       hctx->ccid3hctx_x_calc,
+			       (unsigned)(hctx->ccid3hctx_x_recv >> 6));
 
 		ccid3_update_send_interval(hctx);
 	}
 }
 
 /*
- * ccid3_hc_tx_measure_packet_size  -  Measuring the packet size `s' (sec 4.1)
- * @new_len: DCCP payload size in bytes (not used by all methods)
+ * Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
+ * @len: DCCP packet payload size in bytes
  */
-static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
 {
-#if   defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
-	return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
-#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
-	return max(ccid3_hc_tx_sk(sk)->s, new_len);
-#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS	*/
-	return dccp_sk(sk)->dccps_mss_cache;
-#endif
+	const u16 old_s = hctx->ccid3hctx_s;
+
+	hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
+
+	if (hctx->ccid3hctx_s != old_s)
+		ccid3_update_send_interval(hctx);
 }
 
@@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
 static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
 						ktime_t now)
 {
-	u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
-	    quarter_rtts = (4 * delta) / hctx->rtt;
+	u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
+	    quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
 
 	if (quarter_rtts > 0) {
-		hctx->t_last_win_count = now;
-		hctx->last_win_count  += min(quarter_rtts, 5U);
-		hctx->last_win_count  &= 0xF; /* mod 16 */
+		hctx->ccid3hctx_t_last_win_count = now;
+		hctx->ccid3hctx_last_win_count	+= min(quarter_rtts, 5U);
+		hctx->ccid3hctx_last_win_count	&= 0xF; /* mod 16 */
 	}
 }
 
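The window counter above advances by one per quarter-RTT elapsed since the
last update, by at most 5 at a time, and wraps modulo 16 because it travels
in the 4-bit CCVAL header field. A standalone equivalent:

#include <stdint.h>

static uint8_t advance_win_count(uint8_t count, uint32_t delta_us,
				 uint32_t rtt_us)
{
	uint32_t quarter_rtts = (4 * delta_us) / rtt_us;

	if (quarter_rtts > 0) {
		count += quarter_rtts < 5 ? quarter_rtts : 5;
		count &= 0xF;	/* mod 16 */
	}
	return count;
}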
@@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		goto restart_timer;
 	}
 
-	ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
-		       hctx->feedback ? "" : "out");
+	ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
+		       ccid3_tx_state_name(hctx->ccid3hctx_state));
 
-	/* Ignore and do not restart after leaving the established state */
-	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+	else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
 		goto out;
 
-	/* Reset feedback state to "no feedback received" */
-	hctx->feedback = false;
-
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
-	 * RTO is 0 if and only if no feedback has been received yet.
 	 */
-	if (hctx->t_rto == 0 || hctx->p == 0) {
+	if (hctx->ccid3hctx_t_rto == 0 ||	/* no feedback received yet */
+	    hctx->ccid3hctx_p == 0) {
 
 		/* halve send rate directly */
-		hctx->x /= 2;
+		hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
+					(((__u64)hctx->ccid3hctx_s) << 6) /
+								    TFRC_T_MBI);
 		ccid3_update_send_interval(hctx);
-
 	} else {
 		/*
 		 * Modify the cached value of X_recv
@@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	 *
 	 *  Note that X_recv is scaled by 2^6 while X_calc is not
 	 */
-	BUG_ON(hctx->p && !hctx->x_calc);
+	BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
 
-	if (hctx->x_calc > (hctx->x_recv >> 5))
-		hctx->x_recv /= 2;
+	if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
+		hctx->ccid3hctx_x_recv =
+			max(hctx->ccid3hctx_x_recv / 2,
+			    (((__u64)hctx->ccid3hctx_s) << 6) /
+						      (2 * TFRC_T_MBI));
 	else {
-		hctx->x_recv = hctx->x_calc;
-		hctx->x_recv <<= 4;
+		hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
+		hctx->ccid3hctx_x_recv <<= 4;
 	}
 	ccid3_hc_tx_update_x(sk, NULL);
 	}
 	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
-		       (unsigned long long)hctx->x);
+		       (unsigned long long)hctx->ccid3hctx_x);
 
 	/*
 	 * Set new timeout for the nofeedback timer.
 	 * See comments in packet_recv() regarding the value of t_RTO.
 	 */
-	if (unlikely(hctx->t_rto == 0))	/* no feedback received yet */
+	if (unlikely(hctx->ccid3hctx_t_rto == 0))	/* no feedback yet */
 		t_nfb = TFRC_INITIAL_TIMEOUT;
 	else
-		t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
+		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
 
 restart_timer:
-	sk_reset_timer(sk, &hctx->no_feedback_timer,
+	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
 		       jiffies + usecs_to_jiffies(t_nfb));
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-/**
- * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
- * @skb: next packet candidate to send on @sk
- * This function uses the convention of ccid_packet_dequeue_eval() and
- * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
- */
+/*
+ * returns
+ *   > 0: delay (in msecs) that should pass before actually sending
+ *   = 0: can send immediately
+ *   < 0: error condition; do not send packet
+ */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	if (hctx->s == 0) {
-		sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       (jiffies +
 			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
-		hctx->last_win_count   = 0;
-		hctx->t_last_win_count = now;
+		hctx->ccid3hctx_last_win_count	 = 0;
+		hctx->ccid3hctx_t_last_win_count = now;
 
 		/* Set t_0 for initial packet */
-		hctx->t_nom = now;
+		hctx->ccid3hctx_t_nom = now;
+
+		hctx->ccid3hctx_s = skb->len;
 
 		/*
 		 * Use initial RTT sample when available: recommended by erratum
@@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (dp->dccps_syn_rtt) {
 			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
-			hctx->rtt = dp->dccps_syn_rtt;
-			hctx->x   = rfc3390_initial_rate(sk);
-			hctx->t_ld = now;
+			hctx->ccid3hctx_rtt  = dp->dccps_syn_rtt;
+			hctx->ccid3hctx_x    = rfc3390_initial_rate(sk);
+			hctx->ccid3hctx_t_ld = now;
 		} else {
 			/*
 			 * Sender does not have RTT sample:
@@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 			 *   is needed in several parts (e.g. window counter);
 			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
 			 */
-			hctx->rtt = DCCP_FALLBACK_RTT;
-			hctx->x   = dp->dccps_mss_cache;
-			hctx->x <<= 6;
+			hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
+			hctx->ccid3hctx_x   = hctx->ccid3hctx_s;
+			hctx->ccid3hctx_x <<= 6;
 		}
-
-		/* Compute t_ipi = s / X */
-		hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
 		ccid3_update_send_interval(hctx);
 
-		/* Seed value for Oscillation Prevention (sec. 4.5) */
-		hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);
-
-	} else {
-		delay = ktime_us_delta(hctx->t_nom, now);
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+		break;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
 		/*
@@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 *       else
 		 *             // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay >= TFRC_T_DELTA)
-			return (u32)delay / USEC_PER_MSEC;
+		if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
+			return (u32)delay / 1000L;
 
 		ccid3_hc_tx_update_win_count(hctx, now);
+		break;
+	case TFRC_SSTATE_TERM:
+		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
+		return -EINVAL;
 	}
 
 	/* prepare to send now (add options etc.) */
 	dp->dccps_hc_tx_insert_options = 1;
-	DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count;
+	DCCP_SKB_CB(skb)->dccpd_ccval  = hctx->ccid3hctx_last_win_count;
324 371
325 /* set the nominal send time for the next following packet */ 372 /* set the nominal send time for the next following packet */
326 hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); 373 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
327 return CCID_PACKET_SEND_AT_ONCE; 374 hctx->ccid3hctx_t_ipi);
375 return 0;
328} 376}
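
For reference: ccid3_hc_tx_send_packet() implements the RFC 3448, 4.6 scheduling rule quoted in the comment above, returning 0 when the nominal send time t_nom is within the scheduling tolerance of now, and otherwise the remaining delay in whole milliseconds. A standalone sketch of just that decision (hypothetical names; the tolerance is ccid3hctx_delta on the right-hand side and the compile-time TFRC_T_DELTA on the removed side):

    #include <stdint.h>
    #include <stdio.h>

    /* RFC 3448, 4.6: send now if t_nom - now is within the tolerance,
     * otherwise report the remaining delay in whole milliseconds.
     * All arguments are in microseconds. */
    static long tfrc_send_delay(int64_t t_nom, int64_t now, int64_t delta)
    {
    	int64_t wait = t_nom - now;

    	if (wait - delta >= 1000)	/* at least 1 ms beyond tolerance */
    		return (long)(wait / 1000);
    	return 0;			/* send immediately */
    }

    int main(void)
    {
    	printf("%ld\n", tfrc_send_delay(5000, 0, 1000));	/* 5 */
    	printf("%ld\n", tfrc_send_delay(1500, 0, 1000));	/* 0 */
    	return 0;
    }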
329 377
330static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) 378static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
379 unsigned int len)
331{ 380{
332 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 381 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
333 382
334 /* Changes to s will become effective the next time X is computed */ 383 ccid3_hc_tx_update_s(hctx, len);
335 hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
336 384
337 if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) 385 if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
338 DCCP_CRIT("packet history - out of memory!"); 386 DCCP_CRIT("packet history - out of memory!");
339} 387}
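
ccid3_hc_tx_update_s() above tracks the average packet size with tfrc_ewma(). Assuming the tfrc.h helper of this tree, which computes a weight-in-tenths moving average (weight 9 gives new = 0.9 * avg + 0.1 * sample) and seeds with the first sample, a runnable sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Moving average as in tfrc.h (assumption): weight is in tenths;
     * an avg of 0 means "uninitialised" and is seeded directly. */
    static uint32_t tfrc_ewma(uint32_t avg, uint32_t newval, uint8_t weight)
    {
    	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
    }

    int main(void)
    {
    	uint32_t s = 0, sizes[] = { 1460, 1460, 512, 1460 };
    	unsigned int i;

    	for (i = 0; i < 4; i++) {
    		s = tfrc_ewma(s, sizes[i], 9);
    		printf("s = %u\n", s);	/* 1460, 1460, 1365, 1374 */
    	}
    	return 0;
    }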
340 388
341static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 389static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
342{ 390{
343 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 391 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
344 struct tfrc_tx_hist_entry *acked; 392 struct ccid3_options_received *opt_recv;
345 ktime_t now; 393 ktime_t now;
346 unsigned long t_nfb; 394 unsigned long t_nfb;
347 u32 r_sample; 395 u32 pinv, r_sample;
348 396
349 /* we are only interested in ACKs */ 397 /* we are only interested in ACKs */
350 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 398 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
351 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 399 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
352 return; 400 return;
353 /* 401 /* ... and only in the established state */
354 * Locate the acknowledged packet in the TX history. 402 if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
355 * 403 hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
356 * Returning "entry not found" here can for instance happen when 404 return;
357 * - the host has not sent out anything (e.g. a passive server), 405
358 * - the Ack is outdated (packet with higher Ack number was received), 406 opt_recv = &hctx->ccid3hctx_options_received;
359 * - it is a bogus Ack (for a packet not sent on this connection). 407 now = ktime_get_real();
360 */ 408
361 acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); 409 /* Estimate RTT from history if ACK number is valid */
362 if (acked == NULL) 410 r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
411 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
412 if (r_sample == 0) {
413 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
414 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
415 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
363 return; 416 return;
364 /* For the sake of RTT sampling, ignore/remove all older entries */ 417 }
365 tfrc_tx_hist_purge(&acked->next);
366 418
367 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ 419 /* Update receive rate in units of 64 * bytes/second */
368 now = ktime_get_real(); 420 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
369 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); 421 hctx->ccid3hctx_x_recv <<= 6;
370 hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
371 422
423 /* Update loss event rate (which is scaled by 1e6) */
424 pinv = opt_recv->ccid3or_loss_event_rate;
425 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
426 hctx->ccid3hctx_p = 0;
427 else /* cannot exceed 100% */
428 hctx->ccid3hctx_p = scaled_div(1, pinv);
429 /*
430 * Validate new RTT sample and update moving average
431 */
432 r_sample = dccp_sample_rtt(sk, r_sample);
433 hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
372 /* 434 /*
373 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 435 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
374 */ 436 */
375 if (!hctx->feedback) { 437 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
376 hctx->feedback = true; 438 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
377 439
378 if (hctx->t_rto == 0) { 440 if (hctx->ccid3hctx_t_rto == 0) {
379 /* 441 /*
380 * Initial feedback packet: Larger Initial Windows (4.2) 442 * Initial feedback packet: Larger Initial Windows (4.2)
381 */ 443 */
382 hctx->x = rfc3390_initial_rate(sk); 444 hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
383 hctx->t_ld = now; 445 hctx->ccid3hctx_t_ld = now;
384 446
385 ccid3_update_send_interval(hctx); 447 ccid3_update_send_interval(hctx);
386 448
387 goto done_computing_x; 449 goto done_computing_x;
388 } else if (hctx->p == 0) { 450 } else if (hctx->ccid3hctx_p == 0) {
389 /* 451 /*
390 * First feedback after nofeedback timer expiry (4.3) 452 * First feedback after nofeedback timer expiry (4.3)
391 */ 453 */
@@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
394 } 456 }
395 457
396 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 458 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
397 if (hctx->p > 0) 459 if (hctx->ccid3hctx_p > 0)
398 hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); 460 hctx->ccid3hctx_x_calc =
461 tfrc_calc_x(hctx->ccid3hctx_s,
462 hctx->ccid3hctx_rtt,
463 hctx->ccid3hctx_p);
399 ccid3_hc_tx_update_x(sk, &now); 464 ccid3_hc_tx_update_x(sk, &now);
400 465
401done_computing_x: 466done_computing_x:
402 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 467 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
403 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 468 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
404 dccp_role(sk), sk, hctx->rtt, r_sample, 469 dccp_role(sk),
405 hctx->s, hctx->p, hctx->x_calc, 470 sk, hctx->ccid3hctx_rtt, r_sample,
406 (unsigned)(hctx->x_recv >> 6), 471 hctx->ccid3hctx_s, hctx->ccid3hctx_p,
407 (unsigned)(hctx->x >> 6)); 472 hctx->ccid3hctx_x_calc,
408 /* 473 (unsigned)(hctx->ccid3hctx_x_recv >> 6),
409 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to 474 (unsigned)(hctx->ccid3hctx_x >> 6));
410 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
411 * can be useful if few connections share a link, avoiding that buffer
412 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
413 * A useful presentation with background information is in
414 * Joerg Widmer, "Equation-Based Congestion Control",
415 * MSc Thesis, University of Mannheim, Germany, 2000
416 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
417 */
418 if (do_osc_prev) {
419 r_sample = tfrc_scaled_sqrt(r_sample);
420 /*
421 * The modulation can work in both ways: increase/decrease t_ipi
422 * according to long-term increases/decreases of the RTT. The
423 * former is a useful measure, since it works against queue
424 * build-up. The latter temporarily increases the sending rate,
425 * so that buffers fill up more quickly. This in turn causes
426 * the RTT to increase, so that either later reduction becomes
427 * necessary or the RTT stays at a very high level. Decreasing
428 * t_ipi is therefore not supported.
429 * Furthermore, during the initial slow-start phase the RTT
430 * naturally increases, where using the algorithm would cause
431 * delays. Hence it is disabled during the initial slow-start.
432 */
433 if (r_sample > hctx->r_sqmean && hctx->p > 0)
434 hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
435 hctx->r_sqmean);
436 hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
437 /* update R_sqmean _after_ computing the modulation factor */
438 hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
439 }
440 475
441 /* unschedule no feedback timer */ 476 /* unschedule no feedback timer */
442 sk_stop_timer(sk, &hctx->no_feedback_timer); 477 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
443 478
444 /* 479 /*
445 * As we have calculated new ipi, delta, t_nom it is possible 480 * As we have calculated new ipi, delta, t_nom it is possible
@@ -453,66 +488,95 @@ done_computing_x:
453 * This can help avoid triggering the nofeedback timer too 488 * This can help avoid triggering the nofeedback timer too
454 * often ('spinning') on LANs with small RTTs. 489 * often ('spinning') on LANs with small RTTs.
455 */ 490 */
456 hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * 491 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
457 (USEC_PER_SEC / 1000))); 492 (CONFIG_IP_DCCP_CCID3_RTO *
493 (USEC_PER_SEC / 1000)));
458 /* 494 /*
459 * Schedule no feedback timer to expire in 495 * Schedule no feedback timer to expire in
460 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 496 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
461 */ 497 */
462 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 498 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
463 499
464 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 500 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
465 "expire in %lu jiffies (%luus)\n", 501 "expire in %lu jiffies (%luus)\n",
466 dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); 502 dccp_role(sk),
503 sk, usecs_to_jiffies(t_nfb), t_nfb);
467 504
468 sk_reset_timer(sk, &hctx->no_feedback_timer, 505 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
469 jiffies + usecs_to_jiffies(t_nfb)); 506 jiffies + usecs_to_jiffies(t_nfb));
470} 507}
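
For reference, the timer arithmetic at the end of this function: t_RTO is max(4 * RTT, a configurable floor of CONFIG_IP_DCCP_CCID3_RTO milliseconds), and the nofeedback timer is then set to max(t_RTO, 2 * t_ipi). A sketch with a hypothetical 100 ms floor standing in for the Kconfig value:

    #include <stdint.h>
    #include <stdio.h>

    #define CCID3_RTO_MS	100	/* stand-in for CONFIG_IP_DCCP_CCID3_RTO */

    /* Nofeedback timeout in usec: t_nfb = max(t_RTO, 2 * t_ipi), where
     * t_RTO = max(4 * RTT, configured floor) as in the hunk above. */
    static uint32_t t_nofeedback(uint32_t rtt_us, uint32_t t_ipi_us)
    {
    	uint32_t t_rto = 4 * rtt_us;

    	if (t_rto < CCID3_RTO_MS * 1000)
    		t_rto = CCID3_RTO_MS * 1000;	/* avoid timer 'spinning' */
    	return t_rto > 2 * t_ipi_us ? t_rto : 2 * t_ipi_us;
    }

    int main(void)
    {
    	/* 200 us LAN RTT: the floor dominates */
    	printf("t_nfb = %u us\n", t_nofeedback(200, 5000));	/* 100000 */
    	return 0;
    }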
471 508
472static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, 509static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
473 u8 option, u8 *optval, u8 optlen) 510 unsigned char len, u16 idx,
511 unsigned char *value)
474{ 512{
513 int rc = 0;
514 const struct dccp_sock *dp = dccp_sk(sk);
475 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 515 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
516 struct ccid3_options_received *opt_recv;
476 __be32 opt_val; 517 __be32 opt_val;
477 518
478 switch (option) { 519 opt_recv = &hctx->ccid3hctx_options_received;
479 case TFRC_OPT_RECEIVE_RATE:
480 case TFRC_OPT_LOSS_EVENT_RATE:
481 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
482 if (packet_type == DCCP_PKT_DATA)
483 break;
484 if (unlikely(optlen != 4)) {
485 DCCP_WARN("%s(%p), invalid len %d for %u\n",
486 dccp_role(sk), sk, optlen, option);
487 return -EINVAL;
488 }
489 opt_val = ntohl(get_unaligned((__be32 *)optval));
490 520
491 if (option == TFRC_OPT_RECEIVE_RATE) { 521 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
492 /* Receive Rate is kept in units of 64 bytes/second */ 522 opt_recv->ccid3or_seqno = dp->dccps_gsr;
493 hctx->x_recv = opt_val; 523 opt_recv->ccid3or_loss_event_rate = ~0;
494 hctx->x_recv <<= 6; 524 opt_recv->ccid3or_loss_intervals_idx = 0;
525 opt_recv->ccid3or_loss_intervals_len = 0;
526 opt_recv->ccid3or_receive_rate = 0;
527 }
495 528
496 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 529 switch (option) {
497 dccp_role(sk), sk, opt_val); 530 case TFRC_OPT_LOSS_EVENT_RATE:
531 if (unlikely(len != 4)) {
532 DCCP_WARN("%s(%p), invalid len %d "
533 "for TFRC_OPT_LOSS_EVENT_RATE\n",
534 dccp_role(sk), sk, len);
535 rc = -EINVAL;
498 } else { 536 } else {
499 /* Update the fixpoint Loss Event Rate fraction */ 537 opt_val = get_unaligned((__be32 *)value);
500 hctx->p = tfrc_invert_loss_event_rate(opt_val); 538 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
501
502 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 539 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
503 dccp_role(sk), sk, opt_val); 540 dccp_role(sk), sk,
541 opt_recv->ccid3or_loss_event_rate);
504 } 542 }
543 break;
544 case TFRC_OPT_LOSS_INTERVALS:
545 opt_recv->ccid3or_loss_intervals_idx = idx;
546 opt_recv->ccid3or_loss_intervals_len = len;
547 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
548 dccp_role(sk), sk,
549 opt_recv->ccid3or_loss_intervals_idx,
550 opt_recv->ccid3or_loss_intervals_len);
551 break;
552 case TFRC_OPT_RECEIVE_RATE:
553 if (unlikely(len != 4)) {
554 DCCP_WARN("%s(%p), invalid len %d "
555 "for TFRC_OPT_RECEIVE_RATE\n",
556 dccp_role(sk), sk, len);
557 rc = -EINVAL;
558 } else {
559 opt_val = get_unaligned((__be32 *)value);
560 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
561 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
562 dccp_role(sk), sk,
563 opt_recv->ccid3or_receive_rate);
564 }
565 break;
505 } 566 }
506 return 0; 567
568 return rc;
507} 569}
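
Both TFRC_OPT_LOSS_EVENT_RATE and TFRC_OPT_RECEIVE_RATE carry a 4-byte value in network byte order, and get_unaligned() is needed because DCCP option bodies come with no alignment guarantee. A userspace sketch of the same parse, using memcpy() as the portable stand-in for get_unaligned():

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Read a 4-byte network-order option body that may be unaligned.
     * Returns 0 and sets *val, or -1 for a bad length (cf. DCCP_WARN). */
    static int parse_be32_option(const unsigned char *body, unsigned int len,
    			     uint32_t *val)
    {
    	uint32_t be;

    	if (len != 4)
    		return -1;
    	memcpy(&be, body, 4);	/* portable stand-in for get_unaligned() */
    	*val = ntohl(be);
    	return 0;
    }

    int main(void)
    {
    	const unsigned char opt[] = { 0x00, 0x01, 0x86, 0xa0 };
    	uint32_t rate;

    	if (parse_be32_option(opt, sizeof(opt), &rate) == 0)
    		printf("Receive Rate = %u bytes/s\n", rate);	/* 100000 */
    	return 0;
    }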
508 570
509static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 571static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
510{ 572{
511 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 573 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
512 574
513 hctx->hist = NULL; 575 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
514 setup_timer(&hctx->no_feedback_timer, 576 hctx->ccid3hctx_hist = NULL;
515 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 577 setup_timer(&hctx->ccid3hctx_no_feedback_timer,
578 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
579
516 return 0; 580 return 0;
517} 581}
518 582
@@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk)
520{ 584{
521 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 585 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
522 586
523 sk_stop_timer(sk, &hctx->no_feedback_timer); 587 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
524 tfrc_tx_hist_purge(&hctx->hist); 588 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
589
590 tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
525} 591}
526 592
527static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 593static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
528{ 594{
529 info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; 595 struct ccid3_hc_tx_sock *hctx;
530 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; 596
597 /* Listen socks don't have a private CCID block */
598 if (sk->sk_state == DCCP_LISTEN)
599 return;
600
601 hctx = ccid3_hc_tx_sk(sk);
602 info->tcpi_rto = hctx->ccid3hctx_t_rto;
603 info->tcpi_rtt = hctx->ccid3hctx_rtt;
531} 604}
532 605
533static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 606static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
534 u32 __user *optval, int __user *optlen) 607 u32 __user *optval, int __user *optlen)
535{ 608{
536 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 609 const struct ccid3_hc_tx_sock *hctx;
537 struct tfrc_tx_info tfrc;
538 const void *val; 610 const void *val;
539 611
612 /* Listen socks don't have a private CCID block */
613 if (sk->sk_state == DCCP_LISTEN)
614 return -EINVAL;
615
616 hctx = ccid3_hc_tx_sk(sk);
540 switch (optname) { 617 switch (optname) {
541 case DCCP_SOCKOPT_CCID_TX_INFO: 618 case DCCP_SOCKOPT_CCID_TX_INFO:
542 if (len < sizeof(tfrc)) 619 if (len < sizeof(hctx->ccid3hctx_tfrc))
543 return -EINVAL; 620 return -EINVAL;
544 tfrc.tfrctx_x = hctx->x; 621 len = sizeof(hctx->ccid3hctx_tfrc);
545 tfrc.tfrctx_x_recv = hctx->x_recv; 622 val = &hctx->ccid3hctx_tfrc;
546 tfrc.tfrctx_x_calc = hctx->x_calc;
547 tfrc.tfrctx_rtt = hctx->rtt;
548 tfrc.tfrctx_p = hctx->p;
549 tfrc.tfrctx_rto = hctx->t_rto;
550 tfrc.tfrctx_ipi = hctx->t_ipi;
551 len = sizeof(tfrc);
552 val = &tfrc;
553 break; 623 break;
554 default: 624 default:
555 return -ENOPROTOOPT; 625 return -ENOPROTOOPT;
@@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
564/* 634/*
565 * Receiver Half-Connection Routines 635 * Receiver Half-Connection Routines
566 */ 636 */
637
638/* CCID3 feedback types */
639enum ccid3_fback_type {
640 CCID3_FBACK_NONE = 0,
641 CCID3_FBACK_INITIAL,
642 CCID3_FBACK_PERIODIC,
643 CCID3_FBACK_PARAM_CHANGE
644};
645
646#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
647static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
648{
649 static char *ccid3_rx_state_names[] = {
650 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
651 [TFRC_RSTATE_DATA] = "DATA",
652 [TFRC_RSTATE_TERM] = "TERM",
653 };
654
655 return ccid3_rx_state_names[state];
656}
657#endif
658
659static void ccid3_hc_rx_set_state(struct sock *sk,
660 enum ccid3_hc_rx_states state)
661{
662 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
663 enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
664
665 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
666 dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
667 ccid3_rx_state_name(state));
668 WARN_ON(state == oldstate);
669 hcrx->ccid3hcrx_state = state;
670}
671
567static void ccid3_hc_rx_send_feedback(struct sock *sk, 672static void ccid3_hc_rx_send_feedback(struct sock *sk,
568 const struct sk_buff *skb, 673 const struct sk_buff *skb,
569 enum ccid3_fback_type fbtype) 674 enum ccid3_fback_type fbtype)
570{ 675{
571 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 676 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677 struct dccp_sock *dp = dccp_sk(sk);
678 ktime_t now;
679 s64 delta = 0;
680
681 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
682 return;
683
684 now = ktime_get_real();
572 685
573 switch (fbtype) { 686 switch (fbtype) {
574 case CCID3_FBACK_INITIAL: 687 case CCID3_FBACK_INITIAL:
575 hcrx->x_recv = 0; 688 hcrx->ccid3hcrx_x_recv = 0;
576 hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ 689 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */
577 break; 690 break;
578 case CCID3_FBACK_PARAM_CHANGE: 691 case CCID3_FBACK_PARAM_CHANGE:
579 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
580 /*
581 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
582 * FIXME: in rfc3448bis the receiver returns X_recv=0
583 * here as it normally would in the first feedback packet.
584 * However this is not possible yet, since the code still
585 * uses RFC 3448, i.e.
586 * If (p > 0)
587 * Calculate X_calc using the TCP throughput equation.
588 * X = max(min(X_calc, 2*X_recv), s/t_mbi);
589 * would bring X down to s/t_mbi. That is why we return
590 * X_recv according to rfc3448bis-06 for the moment.
591 */
592 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
593 rtt = tfrc_rx_hist_rtt(&hcrx->hist);
594
595 hcrx->x_recv = scaled_div32(s, 2 * rtt);
596 break;
597 }
598 /* 692 /*
599 * When parameters change (new loss or p > p_prev), we do not 693 * When parameters change (new loss or p > p_prev), we do not
600 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 694 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
601 * always check whether at least RTT time units were covered. 695 * need to reuse the previous value of X_recv. However, when
696 * X_recv was 0 (due to early loss), this would kill X down to
697 * s/t_mbi (i.e. one packet in 64 seconds).
698 * To avoid such drastic reduction, we approximate X_recv as
699 * the number of bytes since last feedback.
700 * This is a safe fallback, since X is bounded above by X_calc.
602 */ 701 */
603 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 702 if (hcrx->ccid3hcrx_x_recv > 0)
604 break; 703 break;
704 /* fall through */
605 case CCID3_FBACK_PERIODIC: 705 case CCID3_FBACK_PERIODIC:
606 /* 706 delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
607 * Step (2) of rfc3448bis-06, 6.2: 707 if (delta <= 0)
608 * - if no data packets have been received, just restart timer 708 DCCP_BUG("delta (%ld) <= 0", (long)delta);
609 * - if data packets have been received, re-compute X_recv 709 else
610 */ 710 hcrx->ccid3hcrx_x_recv =
611 if (hcrx->hist.bytes_recvd == 0) 711 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
612 goto prepare_for_next_time;
613 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
614 break; 712 break;
615 default: 713 default:
616 return; 714 return;
617 } 715 }
618 716
619 ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); 717 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
718 hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
620 719
621 dccp_sk(sk)->dccps_hc_rx_insert_options = 1; 720 hcrx->ccid3hcrx_tstamp_last_feedback = now;
622 dccp_send_ack(sk); 721 hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
722 hcrx->ccid3hcrx_bytes_recv = 0;
623 723
624prepare_for_next_time: 724 dp->dccps_hc_rx_insert_options = 1;
625 tfrc_rx_hist_restart_byte_counter(&hcrx->hist); 725 dccp_send_ack(sk);
626 hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
627 hcrx->feedback = fbtype;
628} 726}
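
In the CCID3_FBACK_PERIODIC branch above, X_recv is recomputed as the payload bytes received since the last feedback divided by the elapsed interval. Assuming scaled_div32(a, b) evaluates a * 10^6 / b in this tree's tfrc.h, a microsecond delta yields bytes per second directly. A sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* X_recv = bytes received over the feedback interval / interval.
     * delta is in microseconds; the 10^6 factor mirrors scaled_div32(). */
    static uint32_t x_recv(uint32_t bytes, int64_t delta_us)
    {
    	if (delta_us <= 0)	/* guarded via DCCP_BUG() in the hunk above */
    		return 0;
    	return (uint32_t)(((uint64_t)bytes * 1000000) / delta_us);
    }

    int main(void)
    {
    	/* 14600 payload bytes over a 100 ms feedback interval */
    	printf("X_recv = %u bytes/s\n", x_recv(14600, 100000)); /* 146000 */
    	return 0;
    }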
629 727
630static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 728static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
631{ 729{
632 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 730 const struct ccid3_hc_rx_sock *hcrx;
633 __be32 x_recv, pinv; 731 __be32 x_recv, pinv;
634 732
635 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 733 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
636 return 0; 734 return 0;
637 735
736 hcrx = ccid3_hc_rx_sk(sk);
737
638 if (dccp_packet_without_ack(skb)) 738 if (dccp_packet_without_ack(skb))
639 return 0; 739 return 0;
640 740
641 x_recv = htonl(hcrx->x_recv); 741 x_recv = htonl(hcrx->ccid3hcrx_x_recv);
642 pinv = htonl(hcrx->p_inverse); 742 pinv = htonl(hcrx->ccid3hcrx_pinv);
643 743
644 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 744 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
645 &pinv, sizeof(pinv)) || 745 &pinv, sizeof(pinv)) ||
@@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
662static u32 ccid3_first_li(struct sock *sk) 762static u32 ccid3_first_li(struct sock *sk)
663{ 763{
664 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 764 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
665 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 765 u32 x_recv, p, delta;
666 rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
667 u64 fval; 766 u64 fval;
668 767
669 /* 768 if (hcrx->ccid3hcrx_rtt == 0) {
670 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p 769 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
671 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p 770 hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
672 * is about 20.64%. This yields an interval length of 4.84 (rounded up). 771 }
673 */
674 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
675 return 5;
676 772
677 x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 773 delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
678 if (x_recv == 0) 774 x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
679 goto failed; 775 if (x_recv == 0) { /* would also trigger divide-by-zero */
776 DCCP_WARN("X_recv==0\n");
777 if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
778 DCCP_BUG("stored value of X_recv is zero");
779 return ~0U;
780 }
781 }
680 782
681 fval = scaled_div32(scaled_div(s, rtt), x_recv); 783 fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
784 fval = scaled_div32(fval, x_recv);
682 p = tfrc_calc_x_reverse_lookup(fval); 785 p = tfrc_calc_x_reverse_lookup(fval);
683 786
684 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 787 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
685 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 788 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
686 789
687 if (p > 0) 790 return p == 0 ? ~0U : scaled_div(1, p);
688 return scaled_div(1, p);
689failed:
690 return UINT_MAX;
691} 791}
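
ccid3_first_li() inverts the TCP throughput equation: fval is built from s, R and X_recv with the library's 10^6 fixed-point scaling, tfrc_calc_x_reverse_lookup() maps it back to a loss event rate p (also scaled by 10^6), and the first loss interval is its reciprocal. A sketch of the final inversion step only, since the lookup table itself lives in tfrc_equation.c (the ~0U case mirrors RFC 4342, 8.5):

    #include <stdint.h>
    #include <stdio.h>

    /* First loss interval length 1/p from a loss event rate p scaled by
     * 10^6; p == 0 ("no loss") maps to ~0U as per RFC 4342, 8.5. */
    static uint32_t first_loss_interval(uint32_t p_scaled)
    {
    	return p_scaled == 0 ? ~0U : 1000000u / p_scaled;
    }

    int main(void)
    {
    	printf("1/p = %u\n", first_loss_interval(10000)); /* p = 1%: 100 */
    	printf("1/p = %u\n", first_loss_interval(0));	  /* 4294967295 */
    	return 0;
    }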
692 792
693static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 793static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
694{ 794{
695 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 795 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796 enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
696 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 797 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
697 const bool is_data_packet = dccp_data_packet(skb); 798 const bool is_data_packet = dccp_data_packet(skb);
698 799
800 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
801 if (is_data_packet) {
802 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
803 do_feedback = CCID3_FBACK_INITIAL;
804 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
805 hcrx->ccid3hcrx_s = payload;
806 /*
807 * Not necessary to update ccid3hcrx_bytes_recv here,
808 * since X_recv = 0 for the first feedback packet (cf.
809 * RFC 3448, 6.3) -- gerrit
810 */
811 }
812 goto update_records;
813 }
814
815 if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
816 return; /* done receiving */
817
818 if (is_data_packet) {
819 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
820 /*
821 * Update moving-average of s and the sum of received payload bytes
822 */
823 hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
824 hcrx->ccid3hcrx_bytes_recv += payload;
825 }
826
699 /* 827 /*
700 * Perform loss detection and handle pending losses 828 * Perform loss detection and handle pending losses
701 */ 829 */
702 if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, 830 if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist,
703 skb, ndp, ccid3_first_li, sk)) 831 skb, ndp, ccid3_first_li, sk)) {
704 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); 832 do_feedback = CCID3_FBACK_PARAM_CHANGE;
833 goto done_receiving;
834 }
835
836 if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
837 return; /* done receiving */
838
705 /* 839 /*
706 * Feedback for first non-empty data packet (RFC 3448, 6.3) 840 * Handle data packets: RTT sampling and monitoring p
707 */ 841 */
708 else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) 842 if (unlikely(!is_data_packet))
709 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); 843 goto update_records;
844
845 if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
846 const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
847 /*
848 * Empty loss history: no loss so far, hence p stays 0.
849 * Sample RTT values, since an RTT estimate is required for the
850 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
851 */
852 if (sample != 0)
853 hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
854
855 } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
856 /*
857 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
858 * has decreased (resp. p has increased), send feedback now.
859 */
860 do_feedback = CCID3_FBACK_PARAM_CHANGE;
861 }
862
710 /* 863 /*
711 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 864 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
712 */ 865 */
713 else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && 866 if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
714 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) 867 do_feedback = CCID3_FBACK_PERIODIC;
715 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); 868
869update_records:
870 tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
871
872done_receiving:
873 if (do_feedback)
874 ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
716} 875}
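
The periodic-feedback test above relies on the 4-bit CCVal window counter, which the sender advances once per quarter RTT (RFC 4342, 8.1): a counter distance greater than 3 therefore means at least one RTT has passed since the last feedback. Assuming SUB16() is the usual modulo-16 subtraction from dccp.h, a sketch:

    #include <stdio.h>

    /* Modulo-16 counter distance, as SUB16() in dccp.h (assumption) */
    #define SUB16(a, b)	(((a) + 16 - (b)) & 0xF)

    int main(void)
    {
    	unsigned int last = 14, ccval;

    	/* counter wraps 15 -> 0; feedback becomes due at distance 4 */
    	for (ccval = 15; ccval != 3; ccval = (ccval + 1) & 0xF)
    		printf("ccval=%2u distance=%u feedback=%s\n", ccval,
    		       SUB16(ccval, last),
    		       SUB16(ccval, last) > 3 ? "yes" : "no");
    	return 0;
    }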
717 876
718static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 877static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
719{ 878{
720 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 879 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
721 880
722 tfrc_lh_init(&hcrx->li_hist); 881 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
723 return tfrc_rx_hist_init(&hcrx->hist, sk); 882 tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
883 return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
724} 884}
725 885
726static void ccid3_hc_rx_exit(struct sock *sk) 886static void ccid3_hc_rx_exit(struct sock *sk)
727{ 887{
728 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 888 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
729 889
730 tfrc_rx_hist_purge(&hcrx->hist); 890 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
731 tfrc_lh_cleanup(&hcrx->li_hist); 891
892 tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
893 tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
732} 894}
733 895
734static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 896static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
735{ 897{
898 const struct ccid3_hc_rx_sock *hcrx;
899
900 /* Listen socks don't have a private CCID block */
901 if (sk->sk_state == DCCP_LISTEN)
902 return;
903
904 hcrx = ccid3_hc_rx_sk(sk);
905 info->tcpi_ca_state = hcrx->ccid3hcrx_state;
736 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 906 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
737 info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); 907 info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
738} 908}
739 909
740static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 910static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
741 u32 __user *optval, int __user *optlen) 911 u32 __user *optval, int __user *optlen)
742{ 912{
743 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 913 const struct ccid3_hc_rx_sock *hcrx;
744 struct tfrc_rx_info rx_info; 914 struct tfrc_rx_info rx_info;
745 const void *val; 915 const void *val;
746 916
917 /* Listen socks don't have a private CCID block */
918 if (sk->sk_state == DCCP_LISTEN)
919 return -EINVAL;
920
921 hcrx = ccid3_hc_rx_sk(sk);
747 switch (optname) { 922 switch (optname) {
748 case DCCP_SOCKOPT_CCID_RX_INFO: 923 case DCCP_SOCKOPT_CCID_RX_INFO:
749 if (len < sizeof(rx_info)) 924 if (len < sizeof(rx_info))
750 return -EINVAL; 925 return -EINVAL;
751 rx_info.tfrcrx_x_recv = hcrx->x_recv; 926 rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
752 rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); 927 rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt;
753 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); 928 rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
929 scaled_div(1, hcrx->ccid3hcrx_pinv);
754 len = sizeof(rx_info); 930 len = sizeof(rx_info);
755 val = &rx_info; 931 val = &rx_info;
756 break; 932 break;
@@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = {
786 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 962 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
787}; 963};
788 964
789module_param(do_osc_prev, bool, 0644);
790MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
791
792#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 965#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
793module_param(ccid3_debug, bool, 0644); 966module_param(ccid3_debug, bool, 0644);
794MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); 967MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
@@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
796 969
797static __init int ccid3_module_init(void) 970static __init int ccid3_module_init(void)
798{ 971{
799 struct timespec tp;
800
801 /*
802 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
803 * correctly and feedback is sent either too early or too late.
804 */
805 hrtimer_get_res(CLOCK_MONOTONIC, &tp);
806 if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
807 printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
808 " resolution - check your clocksource.\n", __func__,
809 tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
810 return -ESOCKTNOSUPPORT;
811 }
812 return ccid_register(&ccid3); 972 return ccid_register(&ccid3);
813} 973}
814module_init(ccid3_module_init); 974module_init(ccid3_module_init);
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index af6e1bf937d9..49ca32bd7e79 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,22 +47,11 @@
47/* Two seconds as per RFC 3448 4.2 */ 47/* Two seconds as per RFC 3448 4.2 */
48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
49 49
50/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ 50/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
51#define TFRC_T_MBI (64 * USEC_PER_SEC) 51#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
52 52
53/* 53/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
54 * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are 54#define TFRC_T_MBI 64
55 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
56 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
57 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
58 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
59 */
60#if (HZ >= 500)
61# define TFRC_T_DELTA USEC_PER_MSEC
62#else
63# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
64#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
65#endif
66 55
67enum ccid3_options { 56enum ccid3_options {
68 TFRC_OPT_LOSS_EVENT_RATE = 192, 57 TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -70,43 +59,62 @@ enum ccid3_options {
70 TFRC_OPT_RECEIVE_RATE = 194, 59 TFRC_OPT_RECEIVE_RATE = 194,
71}; 60};
72 61
62struct ccid3_options_received {
63 u64 ccid3or_seqno:48,
64 ccid3or_loss_intervals_idx:16;
65 u16 ccid3or_loss_intervals_len;
66 u32 ccid3or_loss_event_rate;
67 u32 ccid3or_receive_rate;
68};
69
70/* TFRC sender states */
71enum ccid3_hc_tx_states {
72 TFRC_SSTATE_NO_SENT = 1,
73 TFRC_SSTATE_NO_FBACK,
74 TFRC_SSTATE_FBACK,
75 TFRC_SSTATE_TERM,
76};
77
73/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 78/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
74 * 79 *
75 * @x - Current sending rate in 64 * bytes per second 80 * @ccid3hctx_x - Current sending rate in 64 * bytes per second
76 * @x_recv - Receive rate in 64 * bytes per second 81 * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second
77 * @x_calc - Calculated rate in bytes per second 82 * @ccid3hctx_x_calc - Calculated rate in bytes per second
78 * @rtt - Estimate of current round trip time in usecs 83 * @ccid3hctx_rtt - Estimate of current round trip time in usecs
79 * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) 84 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
80 * @p - Current loss event rate (0-1) scaled by 1000000 85 * @ccid3hctx_s - Packet size in bytes
81 * @s - Packet size in bytes 86 * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
82 * @t_rto - Nofeedback Timer setting in usecs 87 * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
83 * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 88 * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
84 * @feedback - Whether feedback has been received or not 89 * @ccid3hctx_last_win_count - Last window counter sent
85 * @last_win_count - Last window counter sent 90 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
86 * @t_last_win_count - Timestamp of earliest packet with 91 * with last_win_count value sent
87 * last_win_count value sent 92 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
88 * @no_feedback_timer - Handle to no feedback timer 93 * @ccid3hctx_t_ld - Time last doubled during slow start
89 * @t_ld - Time last doubled during slow start 94 * @ccid3hctx_t_nom - Nominal send time of next packet
90 * @t_nom - Nominal send time of next packet 95 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
91 * @hist - Packet history 96 * @ccid3hctx_hist - Packet history
97 * @ccid3hctx_options_received - Parsed set of retrieved options
92 */ 98 */
93struct ccid3_hc_tx_sock { 99struct ccid3_hc_tx_sock {
94 u64 x; 100 struct tfrc_tx_info ccid3hctx_tfrc;
95 u64 x_recv; 101#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
96 u32 x_calc; 102#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
97 u32 rtt; 103#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
98 u16 r_sqmean; 104#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
99 u32 p; 105#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
100 u32 t_rto; 106#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
101 u32 t_ipi; 107#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
102 u16 s; 108 u16 ccid3hctx_s;
103 bool feedback:1; 109 enum ccid3_hc_tx_states ccid3hctx_state:8;
104 u8 last_win_count; 110 u8 ccid3hctx_last_win_count;
105 ktime_t t_last_win_count; 111 ktime_t ccid3hctx_t_last_win_count;
106 struct timer_list no_feedback_timer; 112 struct timer_list ccid3hctx_no_feedback_timer;
107 ktime_t t_ld; 113 ktime_t ccid3hctx_t_ld;
108 ktime_t t_nom; 114 ktime_t ccid3hctx_t_nom;
109 struct tfrc_tx_hist_entry *hist; 115 u32 ccid3hctx_delta;
116 struct tfrc_tx_hist_entry *ccid3hctx_hist;
117 struct ccid3_options_received ccid3hctx_options_received;
110}; 118};
111 119
112static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 120static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
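
As the struct comments note, X and X_recv are stored scaled by 2^6 (units of 64 bytes/second); the extra bits preserve precision on slow links. The inter-packet interval then follows as t_ipi = s/X which, assuming the 10^6 time base of scaled_div32(), becomes ((s << 6) * 10^6) / X in microseconds. A sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* t_ipi = s / X in usec, with X scaled by 2^6 and a 10^6 time base:
     * t_ipi = ((s << 6) * 10^6) / X_scaled. */
    static uint32_t t_ipi_us(uint16_t s, uint64_t x_scaled)
    {
    	return (uint32_t)((((uint64_t)s << 6) * 1000000) / x_scaled);
    }

    int main(void)
    {
    	/* 1460-byte packets at 1 Mbyte/s: one packet every 1460 us */
    	printf("t_ipi = %u us\n", t_ipi_us(1460, 1000000ull << 6));
    	return 0;
    }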
@@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
116 return hctx; 124 return hctx;
117} 125}
118 126
119 127/* TFRC receiver states */
120enum ccid3_fback_type { 128enum ccid3_hc_rx_states {
121 CCID3_FBACK_NONE = 0, 129 TFRC_RSTATE_NO_DATA = 1,
122 CCID3_FBACK_INITIAL, 130 TFRC_RSTATE_DATA,
123 CCID3_FBACK_PERIODIC, 131 TFRC_RSTATE_TERM = 127,
124 CCID3_FBACK_PARAM_CHANGE
125}; 132};
126 133
127/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 134/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
128 * 135 *
129 * @last_counter - Tracks window counter (RFC 4342, 8.1) 136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
130 * @feedback - The type of the feedback last sent 137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
131 * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 138 * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4)
132 * @tstamp_last_feedback - Time at which last feedback was sent 139 * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1)
133 * @hist - Packet history (loss detection + RTT sampling) 140 * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states
134 * @li_hist - Loss Interval database 141 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
135 * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) 142 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
143 * @ccid3hcrx_rtt - Receiver estimate of RTT
144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
146 * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
147 * @ccid3hcrx_li_hist - Loss Interval database
148 * @ccid3hcrx_s - Received packet size in bytes
149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
136 */ 150 */
137struct ccid3_hc_rx_sock { 151struct ccid3_hc_rx_sock {
138 u8 last_counter:4; 152 u8 ccid3hcrx_last_counter:4;
139 enum ccid3_fback_type feedback:4; 153 enum ccid3_hc_rx_states ccid3hcrx_state:8;
140 u32 x_recv; 154 u32 ccid3hcrx_bytes_recv;
141 ktime_t tstamp_last_feedback; 155 u32 ccid3hcrx_x_recv;
142 struct tfrc_rx_hist hist; 156 u32 ccid3hcrx_rtt;
143 struct tfrc_loss_hist li_hist; 157 ktime_t ccid3hcrx_tstamp_last_feedback;
144#define p_inverse li_hist.i_mean 158 struct tfrc_rx_hist ccid3hcrx_hist;
159 struct tfrc_loss_hist ccid3hcrx_li_hist;
160 u16 ccid3hcrx_s;
161#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
145}; 162};
146 163
147static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) 164static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index b1ae8f8259e5..5b3ce0688c5c 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
86 86
87/** 87/**
88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
89 * This updates I_mean as the sequence numbers increase. As a consequence, the 89 * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
90 * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
91 * decreases, and thus there is no need to send renewed feedback.
92 */ 90 */
93void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 91u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
94{ 92{
95 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 93 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
94 u32 old_i_mean = lh->i_mean;
96 s64 len; 95 s64 len;
97 96
98 if (cur == NULL) /* not initialised */ 97 if (cur == NULL) /* not initialised */
99 return; 98 return 0;
100
101 /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
102 if (!dccp_data_packet(skb))
103 return;
104 99
105 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 100 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
106 101
107 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 102 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
108 return; 103 return 0;
109 104
110 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 105 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
111 /* 106 /*
@@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
119 cur->li_is_closed = 1; 114 cur->li_is_closed = 1;
120 115
121 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 116 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
122 return; 117 return 0;
123 118
124 cur->li_length = len; 119 cur->li_length = len;
125 tfrc_lh_calc_i_mean(lh); 120 tfrc_lh_calc_i_mean(lh);
121
122 return (lh->i_mean < old_i_mean);
126} 123}
124EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
127 125
128/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 126/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
129static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, 127static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
140 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 138 * @sk: Used by @calc_first_li in caller-specific way (subtyping)
141 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 139 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
142 */ 140 */
143bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 141int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
144 u32 (*calc_first_li)(struct sock *), struct sock *sk) 142 u32 (*calc_first_li)(struct sock *), struct sock *sk)
145{ 143{
146 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 144 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
147 145
148 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 146 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
149 return false; 147 return 0;
150 148
151 new = tfrc_lh_demand_next(lh); 149 new = tfrc_lh_demand_next(lh);
152 if (unlikely(new == NULL)) { 150 if (unlikely(new == NULL)) {
153 DCCP_CRIT("Cannot allocate/add loss record."); 151 DCCP_CRIT("Cannot allocate/add loss record.");
154 return false; 152 return 0;
155 } 153 }
156 154
157 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; 155 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
169 167
170 tfrc_lh_calc_i_mean(lh); 168 tfrc_lh_calc_i_mean(lh);
171 } 169 }
172 return true; 170 return 1;
173} 171}
174EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 172EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
175 173
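
tfrc_lh_calc_i_mean(), called by both functions in this hunk, implements the RFC 3448, 5.4 average loss interval: weights 1, 1, 1, 1, 0.8, 0.6, 0.4, 0.2 over the eight most recent intervals, computed once including and once excluding the open interval I_0, taking the larger total so that a growing loss-free tail can only decrease p = 1/I_mean. A userspace sketch with the weights scaled by 10 (hypothetical names):

    #include <stdint.h>
    #include <stdio.h>

    static const uint32_t w[8] = { 10, 10, 10, 10, 8, 6, 4, 2 };

    /* len[0] is the open interval I_0, len[1..n] the closed ones, n <= 8 */
    static uint32_t i_mean(const uint32_t *len, int n)
    {
    	uint64_t i_tot0 = 0, i_tot1 = 0, w_tot = 0;
    	int i;

    	for (i = 0; i < n; i++) {
    		i_tot0 += (uint64_t)len[i] * w[i];	/* including I_0 */
    		i_tot1 += (uint64_t)len[i + 1] * w[i];	/* excluding I_0 */
    		w_tot  += w[i];
    	}
    	return (uint32_t)((i_tot0 > i_tot1 ? i_tot0 : i_tot1) / w_tot);
    }

    int main(void)
    {
    	uint32_t len[] = { 350, 100, 120, 90, 110 };	/* newest first */

    	/* the long open interval wins: I_mean = 165, not 105 */
    	printf("I_mean = %u\n", i_mean(len, 4));
    	return 0;
    }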
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index d08a226db43e..246018a3b269 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
67 67
68struct tfrc_rx_hist; 68struct tfrc_rx_hist;
69 69
70extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
71 u32 (*first_li)(struct sock *), struct sock *); 71 u32 (*first_li)(struct sock *), struct sock *);
72extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
74 74
75#endif /* _DCCP_LI_HIST_ */ 75#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index cce9f03bda3e..6cc108afdc3b 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,6 +40,18 @@
40#include "packet_history.h" 40#include "packet_history.h"
41#include "../../dccp.h" 41#include "../../dccp.h"
42 42
43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
43/* 55/*
44 * Transmitter History Routines 56 * Transmitter History Routines
45 */ 57 */
@@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void)
61 } 73 }
62} 74}
63 75
76static struct tfrc_tx_hist_entry *
77 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
78{
79 while (head != NULL && head->seqno != seqno)
80 head = head->next;
81
82 return head;
83}
84
64int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 85int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
65{ 86{
66 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 87 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
90} 111}
91EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 112EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
92 113
114u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
115 const ktime_t now)
116{
117 u32 rtt = 0;
118 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
119
120 if (packet != NULL) {
121 rtt = ktime_us_delta(now, packet->stamp);
122 /*
123 * Garbage-collect older (irrelevant) entries:
124 */
125 tfrc_tx_hist_purge(&packet->next);
126 }
127
128 return rtt;
129}
130EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
131
132
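
tfrc_tx_hist_rtt() above samples the RTT by locating the acknowledged sequence number in the LIFO send history and then discarding every older entry, since a later ACK can only refer to a newer packet. The same list discipline in a runnable userspace sketch:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct hist {
    	struct hist *next;	/* next-oldest entry (LIFO order) */
    	uint64_t seqno;
    	uint64_t stamp_us;	/* send time */
    };

    /* RTT sample for seqno; frees every entry older than the match */
    static uint64_t hist_rtt(struct hist *head, uint64_t seqno, uint64_t now)
    {
    	struct hist *p = head, *old;

    	while (p != NULL && p->seqno != seqno)
    		p = p->next;
    	if (p == NULL)
    		return 0;	/* bogus or outdated ACK */

    	for (old = p->next, p->next = NULL; old != NULL; ) {
    		struct hist *next = old->next;

    		free(old);	/* garbage-collect older entries */
    		old = next;
    	}
    	return now - p->stamp_us;
    }

    int main(void)
    {
    	struct hist *h = NULL;
    	uint64_t s;

    	for (s = 1; s <= 3; s++) {	/* send packets 1..3 */
    		struct hist *e = malloc(sizeof(*e));

    		if (e == NULL)
    			return 1;
    		e->next = h;
    		e->seqno = s;
    		e->stamp_us = s * 1000;
    		h = e;
    	}
    	/* ACK for packet 2 arrives at t = 10000 us */
    	printf("rtt = %llu us\n",
    	       (unsigned long long)hist_rtt(h, 2, 10000));	/* 8000 */
    	return 0;
    }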
93/* 133/*
94 * Receiver History Routines 134 * Receiver History Routines
95 */ 135 */
@@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
151} 191}
152EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 192EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
153 193
154
155static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
156{
157 struct tfrc_rx_hist_entry *tmp = h->ring[a];
158
159 h->ring[a] = h->ring[b];
160 h->ring[b] = tmp;
161}
162
163static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 194static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
164{ 195{
165 __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), 196 const u8 idx_a = tfrc_rx_hist_index(h, a),
166 tfrc_rx_hist_index(h, b)); 197 idx_b = tfrc_rx_hist_index(h, b);
167} 198 struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
168 199
169/** 200 h->ring[idx_a] = h->ring[idx_b];
170 * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling 201 h->ring[idx_b] = tmp;
171 * This is called after loss detection has finished, when the history entry
172 * with the index of `loss_count' holds the highest-received sequence number.
173 * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
174 */
175static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
176{
177 __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
178 h->loss_count = h->loss_start = 0;
179} 202}
180 203
181/* 204/*
@@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
192 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
193 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 216 s1 = DCCP_SKB_CB(skb)->dccpd_seq;
194 217
195 if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ 218 if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
196 h->loss_count = 1; 219 h->loss_count = 1;
220 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
221 }
197} 222}
198 223
199static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) 224static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
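
__do_track_loss() declares the hole between s0 and s1 to be potential loss only when the NDP (non-data packet) count cannot account for it: dccp_loss_free(s1, s2, ndp) is true iff delta_seqno(s1, s2) <= ndp + 1. Sequence numbers are 48 bits wide, so the distance is taken circularly. A sketch assuming the dccp.h helpers behave as described:

    #include <stdint.h>
    #include <stdio.h>

    /* Signed circular distance between 48-bit sequence numbers,
     * sign-extended via a 16-bit shift (as dccp_delta_seqno, assumption) */
    static int64_t delta_seqno(uint64_t s1, uint64_t s2)
    {
    	return (int64_t)((s2 << 16) - (s1 << 16)) >> 16;
    }

    /* No data lost between s1 and s2 if NDP covers the whole gap */
    static int loss_free(uint64_t s1, uint64_t s2, uint64_t ndp)
    {
    	return delta_seqno(s1, s2) <= (int64_t)ndp + 1;
    }

    int main(void)
    {
    	/* gap of one packet, announced as non-data: nothing was lost */
    	printf("%d\n", loss_free(100, 102, 1));	/* 1 */
    	/* same gap with ndp == 0: a data packet is missing */
    	printf("%d\n", loss_free(100, 102, 0));	/* 0 */
    	return 0;
    }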
@@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
215 240
216 if (dccp_loss_free(s2, s1, n1)) { 241 if (dccp_loss_free(s2, s1, n1)) {
217 /* hole is filled: S0, S2, and S1 are consecutive */ 242 /* hole is filled: S0, S2, and S1 are consecutive */
218 tfrc_rx_hist_resume_rtt_sampling(h); 243 h->loss_count = 0;
244 h->loss_start = tfrc_rx_hist_index(h, 1);
219 } else 245 } else
220 /* gap between S2 and S1: just update loss_prev */ 246 /* gap between S2 and S1: just update loss_prev */
221 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); 247 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
268 294
269 if (dccp_loss_free(s1, s2, n2)) { 295 if (dccp_loss_free(s1, s2, n2)) {
270 /* entire hole filled by S0, S3, S1, S2 */ 296 /* entire hole filled by S0, S3, S1, S2 */
271 tfrc_rx_hist_resume_rtt_sampling(h); 297 h->loss_start = tfrc_rx_hist_index(h, 2);
298 h->loss_count = 0;
272 } else { 299 } else {
273 /* gap remains between S1 and S2 */ 300 /* gap remains between S1 and S2 */
274 h->loss_start = tfrc_rx_hist_index(h, 1); 301 h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
312 339
313 if (dccp_loss_free(s2, s3, n3)) { 340 if (dccp_loss_free(s2, s3, n3)) {
314 /* no gap between S2 and S3: entire hole is filled */ 341 /* no gap between S2 and S3: entire hole is filled */
315 tfrc_rx_hist_resume_rtt_sampling(h); 342 h->loss_start = tfrc_rx_hist_index(h, 3);
343 h->loss_count = 0;
316 } else { 344 } else {
317 /* gap between S2 and S3 */ 345 /* gap between S2 and S3 */
318 h->loss_start = tfrc_rx_hist_index(h, 2); 346 h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
326} 354}
327 355
328/** 356/**
329 * tfrc_rx_congestion_event - Loss detection and further processing 357 * tfrc_rx_handle_loss - Loss detection and further processing
330 * @h: The non-empty RX history object 358 * @h: The non-empty RX history object
331 * @lh: Loss Intervals database to update 359 * @lh: Loss Intervals database to update
332 * @skb: Currently received packet 360 * @skb: Currently received packet
333 * @ndp: The NDP count belonging to @skb 361 * @ndp: The NDP count belonging to @skb
334 * @first_li: Caller-dependent computation of first loss interval in @lh 362 * @calc_first_li: Caller-dependent computation of first loss interval in @lh
335 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 363 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
336 * Chooses action according to pending loss, updates LI database when a new 364 * Chooses action according to pending loss, updates LI database when a new
337 * loss was detected, and does required post-processing. Returns 1 when caller 365 * loss was detected, and does required post-processing. Returns 1 when caller
338 * should send feedback, 0 otherwise. 366 * should send feedback, 0 otherwise.
@@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
  * records accordingly, the caller should not perform any more RX history
  * operations when loss_count is greater than 0 after calling this function.
  */
-bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
-			      struct tfrc_loss_hist *lh,
-			      struct sk_buff *skb, const u64 ndp,
-			      u32 (*first_li)(struct sock *), struct sock *sk)
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+			struct tfrc_loss_hist *lh,
+			struct sk_buff *skb, const u64 ndp,
+			u32 (*calc_first_li)(struct sock *), struct sock *sk)
 {
-	bool new_event = false;
-
-	if (tfrc_rx_hist_duplicate(h, skb))
-		return 0;
+	int is_new_loss = 0;
 
 	if (h->loss_count == 0) {
 		__do_track_loss(h, skb, ndp);
-		tfrc_rx_hist_sample_rtt(h, skb);
-		tfrc_rx_hist_add_packet(h, skb, ndp);
 	} else if (h->loss_count == 1) {
 		__one_after_loss(h, skb, ndp);
 	} else if (h->loss_count != 2) {
@@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
 		/*
 		 * Update Loss Interval database and recycle RX records
 		 */
-		new_event = tfrc_lh_interval_add(lh, h, first_li, sk);
+		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
 		__three_after_loss(h);
 	}
-
-	/*
-	 * Update moving-average of `s' and the sum of received payload bytes.
-	 */
-	if (dccp_data_packet(skb)) {
-		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
-
-		h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
-		h->bytes_recvd += payload;
-	}
-
-	/* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
-	if (!new_event)
-		tfrc_lh_update_i_mean(lh, skb);
-
-	return new_event;
+	return is_new_loss;
 }
-EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event);
+EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
 
-/* Compute the sending rate X_recv measured between feedback intervals */
-u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
-{
-	u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate;
-	s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
-
-	WARN_ON(delta <= 0);
-	/*
-	 * Ensure that the sampling interval for X_recv is at least one RTT,
-	 * by extending the sampling interval backwards in time, over the last
-	 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
-	 * To reduce noise (e.g. when the RTT changes often), this is only
-	 * done when delta is smaller than RTT/2.
-	 */
-	if (last_x_recv > 0 && delta < last_rtt/2) {
-		tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
-			      (long)delta, (unsigned)last_rtt);
-
-		delta = (bytes ? delta : 0) + last_rtt;
-		bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC);
-	}
-
-	if (unlikely(bytes == 0)) {
-		DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv);
-		return last_x_recv;
-	}
-	return scaled_div32(bytes, delta);
-}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv);
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
+{
+	int i;
+
+	for (i = 0; i <= TFRC_NDUPACK; i++) {
+		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+		if (h->ring[i] == NULL)
+			goto out_free;
+	}
+
+	h->loss_count = h->loss_start = 0;
+	return 0;
+
+out_free:
+	while (i-- != 0) {
+		kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+		h->ring[i] = NULL;
+	}
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
 
 void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 {
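The removed tfrc_rx_hist_x_recv() above implements rfc3448bis-06, 6.2: X_recv is the byte count over the sampling interval, and when that interval is shorter than half the RTT it is stretched backwards by one RTT and credited with last_x_recv worth of bytes. A userspace model of the arithmetic, assuming microsecond time units (scaled_div32(a, b) in the kernel is (a * 10^6) / b, so the result is bytes per second):

#include <stdint.h>

static uint32_t x_recv(uint64_t bytes, int64_t delta_us,
		       uint32_t last_x_recv, uint32_t rtt_us)
{
	if (last_x_recv > 0 && delta_us < rtt_us / 2) {
		/* extend window by one RTT, credit last rate's bytes */
		delta_us = (bytes ? delta_us : 0) + rtt_us;
		bytes += (uint64_t)last_x_recv * rtt_us / 1000000;
	}
	if (bytes == 0)
		return last_x_recv;	/* keep the previous estimate */
	return (uint32_t)(bytes * 1000000 / delta_us);	/* bytes/second */
}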
@@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
 
-static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
-{
-	int i;
-
-	memset(h, 0, sizeof(*h));
-
-	for (i = 0; i <= TFRC_NDUPACK; i++) {
-		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
-		if (h->ring[i] == NULL) {
-			tfrc_rx_hist_purge(h);
-			return -ENOBUFS;
-		}
-	}
-	return 0;
-}
-
-int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
-{
-	if (tfrc_rx_hist_alloc(h))
-		return -ENOBUFS;
-	/*
-	 * Initialise first entry with GSR to start loss detection as early as
-	 * possible. Code using this must not use any other fields. The entry
-	 * will be overwritten once the CCID updates its received packets.
-	 */
-	tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
+/**
+ * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
+{
+	return h->ring[0];
+}
+
+/**
+ * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
+{
+	return h->ring[h->rtt_sample_prev];
+}
 
 /**
  * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
- * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss
- * is pending and uses the following history entries (via rtt_sample_prev):
- * - h->ring[0] contains the most recent history entry prior to @skb;
- * - h->ring[1] is an unused `dummy' entry when the current difference is 0;
+ * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
+ * to compute a sample with given data - calling function should check this.
  */
-void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
 {
-	struct tfrc_rx_hist_entry *last = h->ring[0];
-	u32 sample, delta_v;
-
-	/*
-	 * When not to sample:
-	 * - on non-data packets
-	 *   (RFC 4342, 8.1: CCVal only fully defined for data packets);
-	 * - when no data packets have been received yet
-	 *   (FIXME: using sampled packet size as indicator here);
-	 * - as long as there are gaps in the sequence space (pending loss).
-	 */
-	if (!dccp_data_packet(skb) || h->packet_size == 0 ||
-	    tfrc_rx_hist_loss_pending(h))
-		return;
-
-	h->rtt_sample_prev = 0;	/* reset previous candidate */
-
-	delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval);
-	if (delta_v == 0) {	/* less than RTT/4 difference */
-		h->rtt_sample_prev = 1;
-		return;
-	}
-	sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
-
-	if (delta_v <= 4)	/* between RTT/4 and RTT */
-		sample *= 4 / delta_v;
-	else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
-		/*
-		 * Optimisation: CCVal difference is greater than 1 RTT, yet the
-		 * sample is less than the local RTT estimate; which means that
-		 * the RTT estimate is too high.
-		 * To avoid noise, it is not done if the sample is below RTT/2.
-		 */
-		return;
-
-	/* Use a lower weight than usual to increase responsiveness */
-	h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
+	u32 sample = 0,
+	    delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+			    tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+
+	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */
+		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
+			sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+				       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+			if (sample)
+				sample = 4 / sample *
+					 ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
+							tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
+			else	/*
+				 * FIXME: This condition is in principle not
+				 * possible but occurs when CCID is used for
+				 * two-way data traffic. I have tried to trace
+				 * it, but the cause does not seem to be here.
+				 */
+				DCCP_BUG("please report to dccp@vger.kernel.org"
+					 " => prev = %u, last = %u",
+					 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+					 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+		} else if (delta_v < 1) {
+			h->rtt_sample_prev = 1;
+			goto keep_ref_for_next_time;
+		}
+
+	} else if (delta_v == 4) /* optimal match */
+		sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
+	else {			 /* suboptimal match */
+		h->rtt_sample_prev = 2;
+		goto keep_ref_for_next_time;
+	}
+
+	if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
+		DCCP_WARN("RTT sample %u too large, using max\n", sample);
+		sample = DCCP_SANE_RTT_MAX;
+	}
+
+	h->rtt_sample_prev = 0;	/* use current entry as next reference */
+keep_ref_for_next_time:
+
+	return sample;
 }
 EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
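Both variants of the sampler build on the same RFC 4342, 8.1 idea: the sender increments a 4-bit window counter (CCVal) every quarter-RTT, so a CCVal difference of 4 between two data packets means they were sent roughly one RTT apart, and smaller differences can be scaled up. A toy model of that scaling, with illustrative microsecond timestamps in place of the kernel's ktime handling:

#include <stdint.h>

#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)

static uint32_t rtt_from_ccval(uint8_t ccval_new, uint8_t ccval_old,
			       uint32_t t_new_us, uint32_t t_old_us)
{
	uint32_t delta_v = SUB16(ccval_new, ccval_old);

	if (delta_v < 1 || delta_v > 4)
		return 0;		/* unsuitable delta: no sample */
	/* elapsed time spans delta_v quarter-RTTs; scale to a full RTT */
	return (t_new_us - t_old_us) * 4 / delta_v;
}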
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 555e65cd73a0..461cc91cce88 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,28 +40,12 @@
 #include <linux/slab.h>
 #include "tfrc.h"
 
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
-	struct tfrc_tx_hist_entry *next;
-	u64 seqno;
-	ktime_t stamp;
-};
-
-static inline struct tfrc_tx_hist_entry *
-	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
-	while (head != NULL && head->seqno != seqno)
-		head = head->next;
-	return head;
-}
+struct tfrc_tx_hist_entry;
 
 extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
+extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
+			    const u64 seqno, const ktime_t now);
 
 /* Subtraction a-b modulo-16, respects circular wrap-around */
 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
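SUB16 is what makes the 4-bit CCVal arithmetic safe across counter wrap. A couple of spot checks of the macro as defined above:

#include <assert.h>

#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)

int main(void)
{
	assert(SUB16(5, 3) == 2);	/* no wrap                   */
	assert(SUB16(2, 14) == 4);	/* wrapped: 14->15->0->1->2  */
	assert(SUB16(7, 7) == 0);	/* identical CCVals          */
	return 0;
}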
@@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry {
  * @loss_count: Number of entries in circular history
  * @loss_start: Movable index (for loss detection)
  * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
- * @rtt_estimate: Receiver RTT estimate
- * @packet_size: Packet size in bytes (as per RFC 3448, 3.1)
- * @bytes_recvd: Number of bytes received since @bytes_start
- * @bytes_start: Start time for counting @bytes_recvd
  */
 struct tfrc_rx_hist {
 	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
 	u8 loss_count:2,
 	   loss_start:2;
-	/* Receiver RTT sampling */
 #define rtt_sample_prev loss_start
-	u32 rtt_estimate;
-	/* Receiver sampling of application payload lengths */
-	u32 packet_size,
-	    bytes_recvd;
-	ktime_t bytes_start;
 };
 
 /**
@@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
 	return h->loss_count > 0;
 }
 
-/*
- * Accessor functions to retrieve parameters sampled by the RX history
- */
-static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
-{
-	if (h->packet_size == 0) {
-		DCCP_WARN("No sample for s, using fallback\n");
-		return TCP_MIN_RCVMSS;
-	}
-	return h->packet_size;
-
-}
-static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
-{
-	if (h->rtt_estimate == 0) {
-		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
-		return DCCP_FALLBACK_RTT;
-	}
-	return h->rtt_estimate;
-}
-
-static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
-{
-	h->bytes_recvd = 0;
-	h->bytes_start = ktime_get_real();
-}
-
-extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
-
-
 extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
 				    const struct sk_buff *skb, const u64 ndp);
 
 extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
 
 struct tfrc_loss_hist;
-extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
-				     struct tfrc_loss_hist *lh,
-				     struct sk_buff *skb, const u64 ndp,
-				     u32 (*first_li)(struct sock *sk),
-				     struct sock *sk);
-extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
-				    const struct sk_buff *skb);
-extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
+extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+			       struct tfrc_loss_hist *lh,
+			       struct sk_buff *skb, const u64 ndp,
+			       u32 (*first_li)(struct sock *sk),
+			       struct sock *sk);
+extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
+				   const struct sk_buff *skb);
+extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
 extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
 
 #endif /* _DCCP_PKT_HIST_ */
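The two 2-bit fields loss_start/loss_count index the TFRC_NDUPACK + 1 = 4-entry ring, so a power-of-two mask doubles as the modulo. A sketch of the indexing helpers, as they appear elsewhere in packet_history.h (outside the hunks shown here):

static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
{
	return (h->loss_start + n) & TFRC_NDUPACK;	/* (start + n) % 4 */
}

/* entry n packets after the start of the current loss window */
static inline struct tfrc_rx_hist_entry *
			tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
{
	return h->ring[tfrc_rx_hist_index(h, n)];
}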
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ede12f53de5a..ed9857527acf 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -48,21 +48,6 @@ static inline u32 scaled_div32(u64 a, u64 b)
 }
 
 /**
- * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1
- * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
- * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
- * clamped RTT samples (dccp_sample_rtt).
- * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
- * scaling factor is neutralised. For this purpose, it avoids returning zero.
- */
-static inline u16 tfrc_scaled_sqrt(const u32 sample)
-{
-	const unsigned long non_zero_sample = sample ? : 1;
-
-	return int_sqrt(non_zero_sample << 10);
-}
-
-/**
  * tfrc_ewma - Exponentially weighted moving average
  * @weight: Weight to be used as damping factor, in units of 1/10
  */
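The tfrc_ewma() body sits between these two hunks; its in-tree definition keeps weight/10 of the old average and gives the new sample the remaining share. A small worked model (with weight = 9, as the removed RX code used for the packet-size average, each sample moves the average by only 10%):

#include <stdint.h>

/* model of tfrc_ewma(): first sample seeds the average directly */
static uint32_t ewma(uint32_t avg, uint32_t newval, uint8_t weight)
{
	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}

/* ewma(1000, 1500, 9) == (9 * 1000 + 1 * 1500) / 10 == 1050 */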
@@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
 
 extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
 extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
-extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
 
 extern int tfrc_tx_packet_history_init(void);
 extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 38239c4d5e14..2f20a29cffe4 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 
 	if (p <= TFRC_CALC_X_SPLIT) {		/* 0.0000 < p <= 0.05 */
 		if (p < TFRC_SMALLEST_P) {	/* 0.0000 < p < 0.0001 */
-			/*
-			 * In the congestion-avoidance phase p decays towards 0
-			 * when there are no further losses, so this case is
-			 * natural. Truncating to p_min = 0.01% means that the
-			 * maximum achievable throughput is limited to about
-			 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
-			 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
-			 */
-			tfrc_pr_debug("Value of p (%d) below resolution. "
-				      "Substituting %d\n", p, TFRC_SMALLEST_P);
+			DCCP_WARN("Value of p (%d) below resolution. "
+				  "Substituting %d\n", p, TFRC_SMALLEST_P);
 			index = 0;
 		} else				/* 0.0001 <= p <= 0.05 */
 			index = p/TFRC_SMALLEST_P - 1;
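The 122.4 figure in the removed comment follows from the TFRC throughput equation (RFC 3448, 3.1) with t_RTO = 4R and p clamped at p_min = 10^-4; a worked check, not kernel code:

\[
X_{calc} = \frac{s}{R\,\sqrt{2p/3} + t_{RTO}\,\bigl(3\sqrt{3p/8}\bigr)\,p\,(1+32p^2)}
\approx \frac{s}{R\,(0.008165 + 0.0000073)} \approx 122.4\,\frac{s}{R}
\quad\text{for } p = 10^{-4},\ t_{RTO} = 4R.
\]

With s = 1500 bytes and R = 0.01 s this gives 122.4 * 1500 / 0.01 = 18.4 MB/s, i.e. about 147 Mbit/s, matching the comment's example.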
@@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 	result = scaled_div(s, R);
 	return scaled_div32(result, f);
 }
+
 EXPORT_SYMBOL_GPL(tfrc_calc_x);
 
 /**
@@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 	index = tfrc_binsearch(fvalue, 0);
 	return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
-EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
 
-/**
- * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
- * When @loss_event_rate is large, there is a chance that p is truncated to 0.
- * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
- */
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
-{
-	if (loss_event_rate == UINT_MAX)	/* see RFC 4342, 8.5 */
-		return 0;
-	if (unlikely(loss_event_rate == 0))	/* map 1/0 into 100% */
-		return 1000000;
-	return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
-}
-EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
+EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
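The removed tfrc_invert_loss_event_rate() keeps p in parts-per-million, so 10^6 corresponds to 100%; scaled_div(1, rate) in the kernel is (1 * 10^6) / rate, and TFRC_SMALLEST_P works out to 100 ppm (0.01%) from the lookup-table constants. A userspace model of the same mapping:

#include <stdint.h>

#define TFRC_SMALLEST_P	100	/* p_min = 0.01% in ppm, assumed from the table constants */

static uint32_t invert_loss_event_rate(uint32_t rate)
{
	if (rate == UINT32_MAX)		/* "no loss seen", RFC 4342, 8.5 */
		return 0;
	if (rate == 0)			/* map 1/0 into 100% */
		return 1000000;
	uint32_t p = 1000000 / rate;	/* truncates towards 0 for large rates */
	return p < TFRC_SMALLEST_P ? TFRC_SMALLEST_P : p;
}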