about | summary | refs | log | tree | commit | diff | stats
path: root/net/dccp
diff options
context:
space:
mode:
Diffstat (limited to 'net/dccp')
-rw-r--r-- net/dccp/Kconfig 1
-rw-r--r-- net/dccp/ackvec.c 163
-rw-r--r-- net/dccp/ackvec.h 62
-rw-r--r-- net/dccp/ccid.c 8
-rw-r--r-- net/dccp/ccid.h 37
-rw-r--r-- net/dccp/ccids/Kconfig 30
-rw-r--r-- net/dccp/ccids/ccid2.c 228
-rw-r--r-- net/dccp/ccids/ccid2.h 21
-rw-r--r-- net/dccp/ccids/ccid3.c 710
-rw-r--r-- net/dccp/ccids/ccid3.h 41
-rw-r--r-- net/dccp/ccids/lib/Makefile 2
-rw-r--r-- net/dccp/ccids/lib/loss_interval.c 352
-rw-r--r-- net/dccp/ccids/lib/loss_interval.h 64
-rw-r--r-- net/dccp/ccids/lib/packet_history.c 599
-rw-r--r-- net/dccp/ccids/lib/packet_history.h 220
-rw-r--r-- net/dccp/ccids/lib/tfrc.c 63
-rw-r--r-- net/dccp/ccids/lib/tfrc.h 29
-rw-r--r-- net/dccp/dccp.h 35
-rw-r--r-- net/dccp/feat.c 29
-rw-r--r-- net/dccp/feat.h 26
-rw-r--r-- net/dccp/input.c 155
-rw-r--r-- net/dccp/ipv4.c 12
-rw-r--r-- net/dccp/ipv6.c 10
-rw-r--r-- net/dccp/minisocks.c 33
-rw-r--r-- net/dccp/options.c 139
-rw-r--r-- net/dccp/output.c 55
-rw-r--r-- net/dccp/proto.c 194
-rw-r--r-- net/dccp/sysctl.c 36
-rw-r--r-- net/dccp/timer.c 5
29 files changed, 1777 insertions, 1582 deletions
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 0549e4719b13..7aa2a7acc7ec 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -1,6 +1,7 @@
1menuconfig IP_DCCP 1menuconfig IP_DCCP
2 tristate "The DCCP Protocol (EXPERIMENTAL)" 2 tristate "The DCCP Protocol (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select IP_DCCP_CCID2
4 ---help--- 5 ---help---
5 Datagram Congestion Control Protocol (RFC 4340) 6 Datagram Congestion Control Protocol (RFC 4340)
6 7
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 83378f379f72..6de4bd195d28 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -30,7 +30,7 @@ static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
30 kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 30 kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
31 31
32 if (avr != NULL) 32 if (avr != NULL)
33 INIT_LIST_HEAD(&avr->dccpavr_node); 33 INIT_LIST_HEAD(&avr->avr_node);
34 34
35 return avr; 35 return avr;
36} 36}
@@ -40,7 +40,7 @@ static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
40 if (unlikely(avr == NULL)) 40 if (unlikely(avr == NULL))
41 return; 41 return;
42 /* Check if deleting a linked record */ 42 /* Check if deleting a linked record */
43 WARN_ON(!list_empty(&avr->dccpavr_node)); 43 WARN_ON(!list_empty(&avr->avr_node));
44 kmem_cache_free(dccp_ackvec_record_slab, avr); 44 kmem_cache_free(dccp_ackvec_record_slab, avr);
45} 45}
46 46
@@ -52,16 +52,15 @@ static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
52 * just add the AVR at the head of the list. 52 * just add the AVR at the head of the list.
53 * -sorbo. 53 * -sorbo.
54 */ 54 */
55 if (!list_empty(&av->dccpav_records)) { 55 if (!list_empty(&av->av_records)) {
56 const struct dccp_ackvec_record *head = 56 const struct dccp_ackvec_record *head =
57 list_entry(av->dccpav_records.next, 57 list_entry(av->av_records.next,
58 struct dccp_ackvec_record, 58 struct dccp_ackvec_record,
59 dccpavr_node); 59 avr_node);
60 BUG_ON(before48(avr->dccpavr_ack_seqno, 60 BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
61 head->dccpavr_ack_seqno));
62 } 61 }
63 62
64 list_add(&avr->dccpavr_node, &av->dccpav_records); 63 list_add(&avr->avr_node, &av->av_records);
65} 64}
66 65
67int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 66int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
@@ -69,9 +68,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
69 struct dccp_sock *dp = dccp_sk(sk); 68 struct dccp_sock *dp = dccp_sk(sk);
70 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; 69 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
71 /* Figure out how many options do we need to represent the ackvec */ 70 /* Figure out how many options do we need to represent the ackvec */
72 const u16 nr_opts = DIV_ROUND_UP(av->dccpav_vec_len, 71 const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
73 DCCP_MAX_ACKVEC_OPT_LEN); 72 u16 len = av->av_vec_len + 2 * nr_opts, i;
74 u16 len = av->dccpav_vec_len + 2 * nr_opts, i;
75 u32 elapsed_time; 73 u32 elapsed_time;
76 const unsigned char *tail, *from; 74 const unsigned char *tail, *from;
77 unsigned char *to; 75 unsigned char *to;
@@ -81,7 +79,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
81 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) 79 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
82 return -1; 80 return -1;
83 81
84 delta = ktime_us_delta(ktime_get_real(), av->dccpav_time); 82 delta = ktime_us_delta(ktime_get_real(), av->av_time);
85 elapsed_time = delta / 10; 83 elapsed_time = delta / 10;
86 84
87 if (elapsed_time != 0 && 85 if (elapsed_time != 0 &&
@@ -95,9 +93,9 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
95 DCCP_SKB_CB(skb)->dccpd_opt_len += len; 93 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
96 94
97 to = skb_push(skb, len); 95 to = skb_push(skb, len);
98 len = av->dccpav_vec_len; 96 len = av->av_vec_len;
99 from = av->dccpav_buf + av->dccpav_buf_head; 97 from = av->av_buf + av->av_buf_head;
100 tail = av->dccpav_buf + DCCP_MAX_ACKVEC_LEN; 98 tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
101 99
102 for (i = 0; i < nr_opts; ++i) { 100 for (i = 0; i < nr_opts; ++i) {
103 int copylen = len; 101 int copylen = len;
@@ -116,7 +114,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
116 to += tailsize; 114 to += tailsize;
117 len -= tailsize; 115 len -= tailsize;
118 copylen -= tailsize; 116 copylen -= tailsize;
119 from = av->dccpav_buf; 117 from = av->av_buf;
120 } 118 }
121 119
122 memcpy(to, from, copylen); 120 memcpy(to, from, copylen);
@@ -134,19 +132,19 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
134 * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will 132 * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
135 * equal buf_nonce. 133 * equal buf_nonce.
136 */ 134 */
137 avr->dccpavr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; 135 avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
138 avr->dccpavr_ack_ptr = av->dccpav_buf_head; 136 avr->avr_ack_ptr = av->av_buf_head;
139 avr->dccpavr_ack_ackno = av->dccpav_buf_ackno; 137 avr->avr_ack_ackno = av->av_buf_ackno;
140 avr->dccpavr_ack_nonce = av->dccpav_buf_nonce; 138 avr->avr_ack_nonce = av->av_buf_nonce;
141 avr->dccpavr_sent_len = av->dccpav_vec_len; 139 avr->avr_sent_len = av->av_vec_len;
142 140
143 dccp_ackvec_insert_avr(av, avr); 141 dccp_ackvec_insert_avr(av, avr);
144 142
145 dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " 143 dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
146 "ack_ackno=%llu\n", 144 "ack_ackno=%llu\n",
147 dccp_role(sk), avr->dccpavr_sent_len, 145 dccp_role(sk), avr->avr_sent_len,
148 (unsigned long long)avr->dccpavr_ack_seqno, 146 (unsigned long long)avr->avr_ack_seqno,
149 (unsigned long long)avr->dccpavr_ack_ackno); 147 (unsigned long long)avr->avr_ack_ackno);
150 return 0; 148 return 0;
151} 149}
152 150
@@ -155,12 +153,12 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
155 struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); 153 struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
156 154
157 if (av != NULL) { 155 if (av != NULL) {
158 av->dccpav_buf_head = DCCP_MAX_ACKVEC_LEN - 1; 156 av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1;
159 av->dccpav_buf_ackno = UINT48_MAX + 1; 157 av->av_buf_ackno = UINT48_MAX + 1;
160 av->dccpav_buf_nonce = av->dccpav_buf_nonce = 0; 158 av->av_buf_nonce = 0;
161 av->dccpav_time = ktime_set(0, 0); 159 av->av_time = ktime_set(0, 0);
162 av->dccpav_vec_len = 0; 160 av->av_vec_len = 0;
163 INIT_LIST_HEAD(&av->dccpav_records); 161 INIT_LIST_HEAD(&av->av_records);
164 } 162 }
165 163
166 return av; 164 return av;
@@ -171,12 +169,11 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
171 if (unlikely(av == NULL)) 169 if (unlikely(av == NULL))
172 return; 170 return;
173 171
174 if (!list_empty(&av->dccpav_records)) { 172 if (!list_empty(&av->av_records)) {
175 struct dccp_ackvec_record *avr, *next; 173 struct dccp_ackvec_record *avr, *next;
176 174
177 list_for_each_entry_safe(avr, next, &av->dccpav_records, 175 list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
178 dccpavr_node) { 176 list_del_init(&avr->avr_node);
179 list_del_init(&avr->dccpavr_node);
180 dccp_ackvec_record_delete(avr); 177 dccp_ackvec_record_delete(avr);
181 } 178 }
182 } 179 }
@@ -187,13 +184,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
187static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 184static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
188 const u32 index) 185 const u32 index)
189{ 186{
190 return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; 187 return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
191} 188}
192 189
193static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 190static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
194 const u32 index) 191 const u32 index)
195{ 192{
196 return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; 193 return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
197} 194}
198 195
199/* 196/*
@@ -208,29 +205,29 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
208 unsigned int gap; 205 unsigned int gap;
209 long new_head; 206 long new_head;
210 207
211 if (av->dccpav_vec_len + packets > DCCP_MAX_ACKVEC_LEN) 208 if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
212 return -ENOBUFS; 209 return -ENOBUFS;
213 210
214 gap = packets - 1; 211 gap = packets - 1;
215 new_head = av->dccpav_buf_head - packets; 212 new_head = av->av_buf_head - packets;
216 213
217 if (new_head < 0) { 214 if (new_head < 0) {
218 if (gap > 0) { 215 if (gap > 0) {
219 memset(av->dccpav_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, 216 memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
220 gap + new_head + 1); 217 gap + new_head + 1);
221 gap = -new_head; 218 gap = -new_head;
222 } 219 }
223 new_head += DCCP_MAX_ACKVEC_LEN; 220 new_head += DCCP_MAX_ACKVEC_LEN;
224 } 221 }
225 222
226 av->dccpav_buf_head = new_head; 223 av->av_buf_head = new_head;
227 224
228 if (gap > 0) 225 if (gap > 0)
229 memset(av->dccpav_buf + av->dccpav_buf_head + 1, 226 memset(av->av_buf + av->av_buf_head + 1,
230 DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); 227 DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
231 228
232 av->dccpav_buf[av->dccpav_buf_head] = state; 229 av->av_buf[av->av_buf_head] = state;
233 av->dccpav_vec_len += packets; 230 av->av_vec_len += packets;
234 return 0; 231 return 0;
235} 232}
236 233
@@ -243,7 +240,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
243 /* 240 /*
244 * Check at the right places if the buffer is full, if it is, tell the 241 * Check at the right places if the buffer is full, if it is, tell the
245 * caller to start dropping packets till the HC-Sender acks our ACK 242 * caller to start dropping packets till the HC-Sender acks our ACK
246 * vectors, when we will free up space in dccpav_buf. 243 * vectors, when we will free up space in av_buf.
247 * 244 *
248 * We may well decide to do buffer compression, etc, but for now lets 245 * We may well decide to do buffer compression, etc, but for now lets
249 * just drop. 246 * just drop.
@@ -263,22 +260,20 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
263 */ 260 */
264 261
265 /* See if this is the first ackno being inserted */ 262 /* See if this is the first ackno being inserted */
266 if (av->dccpav_vec_len == 0) { 263 if (av->av_vec_len == 0) {
267 av->dccpav_buf[av->dccpav_buf_head] = state; 264 av->av_buf[av->av_buf_head] = state;
268 av->dccpav_vec_len = 1; 265 av->av_vec_len = 1;
269 } else if (after48(ackno, av->dccpav_buf_ackno)) { 266 } else if (after48(ackno, av->av_buf_ackno)) {
270 const u64 delta = dccp_delta_seqno(av->dccpav_buf_ackno, 267 const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
271 ackno);
272 268
273 /* 269 /*
274 * Look if the state of this packet is the same as the 270 * Look if the state of this packet is the same as the
275 * previous ackno and if so if we can bump the head len. 271 * previous ackno and if so if we can bump the head len.
276 */ 272 */
277 if (delta == 1 && 273 if (delta == 1 &&
278 dccp_ackvec_state(av, av->dccpav_buf_head) == state && 274 dccp_ackvec_state(av, av->av_buf_head) == state &&
279 (dccp_ackvec_len(av, av->dccpav_buf_head) < 275 dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
280 DCCP_ACKVEC_LEN_MASK)) 276 av->av_buf[av->av_buf_head]++;
281 av->dccpav_buf[av->dccpav_buf_head]++;
282 else if (dccp_ackvec_set_buf_head_state(av, delta, state)) 277 else if (dccp_ackvec_set_buf_head_state(av, delta, state))
283 return -ENOBUFS; 278 return -ENOBUFS;
284 } else { 279 } else {
@@ -290,14 +285,14 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
290 * the byte corresponding to S. (Indexing structures 285 * the byte corresponding to S. (Indexing structures
291 * could reduce the complexity of this scan.) 286 * could reduce the complexity of this scan.)
292 */ 287 */
293 u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); 288 u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
294 u32 index = av->dccpav_buf_head; 289 u32 index = av->av_buf_head;
295 290
296 while (1) { 291 while (1) {
297 const u8 len = dccp_ackvec_len(av, index); 292 const u8 len = dccp_ackvec_len(av, index);
298 const u8 state = dccp_ackvec_state(av, index); 293 const u8 state = dccp_ackvec_state(av, index);
299 /* 294 /*
300 * valid packets not yet in dccpav_buf have a reserved 295 * valid packets not yet in av_buf have a reserved
301 * entry, with a len equal to 0. 296 * entry, with a len equal to 0.
302 */ 297 */
303 if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED && 298 if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
@@ -305,7 +300,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
305 reserved seat! */ 300 reserved seat! */
306 dccp_pr_debug("Found %llu reserved seat!\n", 301 dccp_pr_debug("Found %llu reserved seat!\n",
307 (unsigned long long)ackno); 302 (unsigned long long)ackno);
308 av->dccpav_buf[index] = state; 303 av->av_buf[index] = state;
309 goto out; 304 goto out;
310 } 305 }
311 /* len == 0 means one packet */ 306 /* len == 0 means one packet */
@@ -318,8 +313,8 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
318 } 313 }
319 } 314 }
320 315
321 av->dccpav_buf_ackno = ackno; 316 av->av_buf_ackno = ackno;
322 av->dccpav_time = ktime_get_real(); 317 av->av_time = ktime_get_real();
323out: 318out:
324 return 0; 319 return 0;
325 320
@@ -349,9 +344,9 @@ void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
349 344
350void dccp_ackvec_print(const struct dccp_ackvec *av) 345void dccp_ackvec_print(const struct dccp_ackvec *av)
351{ 346{
352 dccp_ackvector_print(av->dccpav_buf_ackno, 347 dccp_ackvector_print(av->av_buf_ackno,
353 av->dccpav_buf + av->dccpav_buf_head, 348 av->av_buf + av->av_buf_head,
354 av->dccpav_vec_len); 349 av->av_vec_len);
355} 350}
356#endif 351#endif
357 352
@@ -361,17 +356,15 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
361 struct dccp_ackvec_record *next; 356 struct dccp_ackvec_record *next;
362 357
363 /* sort out vector length */ 358 /* sort out vector length */
364 if (av->dccpav_buf_head <= avr->dccpavr_ack_ptr) 359 if (av->av_buf_head <= avr->avr_ack_ptr)
365 av->dccpav_vec_len = avr->dccpavr_ack_ptr - av->dccpav_buf_head; 360 av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
366 else 361 else
367 av->dccpav_vec_len = DCCP_MAX_ACKVEC_LEN - 1 362 av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
368 - av->dccpav_buf_head 363 av->av_buf_head + avr->avr_ack_ptr;
369 + avr->dccpavr_ack_ptr;
370 364
371 /* free records */ 365 /* free records */
372 list_for_each_entry_safe_from(avr, next, &av->dccpav_records, 366 list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
373 dccpavr_node) { 367 list_del_init(&avr->avr_node);
374 list_del_init(&avr->dccpavr_node);
375 dccp_ackvec_record_delete(avr); 368 dccp_ackvec_record_delete(avr);
376 } 369 }
377} 370}
@@ -386,16 +379,16 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
386 * windows. We will be receiving ACKs for stuff we sent a while back 379 * windows. We will be receiving ACKs for stuff we sent a while back
387 * -sorbo. 380 * -sorbo.
388 */ 381 */
389 list_for_each_entry_reverse(avr, &av->dccpav_records, dccpavr_node) { 382 list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
390 if (ackno == avr->dccpavr_ack_seqno) { 383 if (ackno == avr->avr_ack_seqno) {
391 dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " 384 dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
392 "ack_ackno=%llu, ACKED!\n", 385 "ack_ackno=%llu, ACKED!\n",
393 dccp_role(sk), 1, 386 dccp_role(sk), 1,
394 (unsigned long long)avr->dccpavr_ack_seqno, 387 (unsigned long long)avr->avr_ack_seqno,
395 (unsigned long long)avr->dccpavr_ack_ackno); 388 (unsigned long long)avr->avr_ack_ackno);
396 dccp_ackvec_throw_record(av, avr); 389 dccp_ackvec_throw_record(av, avr);
397 break; 390 break;
398 } else if (avr->dccpavr_ack_seqno > ackno) 391 } else if (avr->avr_ack_seqno > ackno)
399 break; /* old news */ 392 break; /* old news */
400 } 393 }
401} 394}
@@ -409,7 +402,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
409 struct dccp_ackvec_record *avr; 402 struct dccp_ackvec_record *avr;
410 403
411 /* Check if we actually sent an ACK vector */ 404 /* Check if we actually sent an ACK vector */
412 if (list_empty(&av->dccpav_records)) 405 if (list_empty(&av->av_records))
413 return; 406 return;
414 407
415 i = len; 408 i = len;
@@ -418,8 +411,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
418 * I think it might be more efficient to work backwards. See comment on 411 * I think it might be more efficient to work backwards. See comment on
419 * rcv_ackno. -sorbo. 412 * rcv_ackno. -sorbo.
420 */ 413 */
421 avr = list_entry(av->dccpav_records.next, struct dccp_ackvec_record, 414 avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
422 dccpavr_node);
423 while (i--) { 415 while (i--) {
424 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 416 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
425 u64 ackno_end_rl; 417 u64 ackno_end_rl;
@@ -430,15 +422,14 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
430 * If our AVR sequence number is greater than the ack, go 422 * If our AVR sequence number is greater than the ack, go
431 * forward in the AVR list until it is not so. 423 * forward in the AVR list until it is not so.
432 */ 424 */
433 list_for_each_entry_from(avr, &av->dccpav_records, 425 list_for_each_entry_from(avr, &av->av_records, avr_node) {
434 dccpavr_node) { 426 if (!after48(avr->avr_ack_seqno, *ackno))
435 if (!after48(avr->dccpavr_ack_seqno, *ackno))
436 goto found; 427 goto found;
437 } 428 }
438 /* End of the dccpav_records list, not found, exit */ 429 /* End of the av_records list, not found, exit */
439 break; 430 break;
440found: 431found:
441 if (between48(avr->dccpavr_ack_seqno, ackno_end_rl, *ackno)) { 432 if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
442 const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; 433 const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
443 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { 434 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
444 dccp_pr_debug("%s ACK vector 0, len=%d, " 435 dccp_pr_debug("%s ACK vector 0, len=%d, "
@@ -446,9 +437,9 @@ found:
446 "ACKED!\n", 437 "ACKED!\n",
447 dccp_role(sk), len, 438 dccp_role(sk), len,
448 (unsigned long long) 439 (unsigned long long)
449 avr->dccpavr_ack_seqno, 440 avr->avr_ack_seqno,
450 (unsigned long long) 441 (unsigned long long)
451 avr->dccpavr_ack_ackno); 442 avr->avr_ack_ackno);
452 dccp_ackvec_throw_record(av, avr); 443 dccp_ackvec_throw_record(av, avr);
453 break; 444 break;
454 } 445 }
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 9ef0737043ee..bcb64fb4acef 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -32,54 +32,54 @@
32 * 32 *
33 * This data structure is the one defined in RFC 4340, Appendix A. 33 * This data structure is the one defined in RFC 4340, Appendix A.
34 * 34 *
35 * @dccpav_buf_head - circular buffer head 35 * @av_buf_head - circular buffer head
36 * @dccpav_buf_tail - circular buffer tail 36 * @av_buf_tail - circular buffer tail
37 * @dccpav_buf_ackno - ack # of the most recent packet acknowledgeable in the 37 * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
38 * buffer (i.e. %dccpav_buf_head) 38 * buffer (i.e. %av_buf_head)
39 * @dccpav_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked 39 * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
40 * by the buffer with State 0 40 * by the buffer with State 0
41 * 41 *
42 * Additionally, the HC-Receiver must keep some information about the 42 * Additionally, the HC-Receiver must keep some information about the
43 * Ack Vectors it has recently sent. For each packet sent carrying an 43 * Ack Vectors it has recently sent. For each packet sent carrying an
44 * Ack Vector, it remembers four variables: 44 * Ack Vector, it remembers four variables:
45 * 45 *
46 * @dccpav_records - list of dccp_ackvec_record 46 * @av_records - list of dccp_ackvec_record
47 * @dccpav_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. 47 * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
48 * 48 *
49 * @dccpav_time - the time in usecs 49 * @av_time - the time in usecs
50 * @dccpav_buf - circular buffer of acknowledgeable packets 50 * @av_buf - circular buffer of acknowledgeable packets
51 */ 51 */
52struct dccp_ackvec { 52struct dccp_ackvec {
53 u64 dccpav_buf_ackno; 53 u64 av_buf_ackno;
54 struct list_head dccpav_records; 54 struct list_head av_records;
55 ktime_t dccpav_time; 55 ktime_t av_time;
56 u16 dccpav_buf_head; 56 u16 av_buf_head;
57 u16 dccpav_vec_len; 57 u16 av_vec_len;
58 u8 dccpav_buf_nonce; 58 u8 av_buf_nonce;
59 u8 dccpav_ack_nonce; 59 u8 av_ack_nonce;
60 u8 dccpav_buf[DCCP_MAX_ACKVEC_LEN]; 60 u8 av_buf[DCCP_MAX_ACKVEC_LEN];
61}; 61};
62 62
63/** struct dccp_ackvec_record - ack vector record 63/** struct dccp_ackvec_record - ack vector record
64 * 64 *
65 * ACK vector record as defined in Appendix A of spec. 65 * ACK vector record as defined in Appendix A of spec.
66 * 66 *
67 * The list is sorted by dccpavr_ack_seqno 67 * The list is sorted by avr_ack_seqno
68 * 68 *
69 * @dccpavr_node - node in dccpav_records 69 * @avr_node - node in av_records
70 * @dccpavr_ack_seqno - sequence number of the packet this record was sent on 70 * @avr_ack_seqno - sequence number of the packet this record was sent on
71 * @dccpavr_ack_ackno - sequence number being acknowledged 71 * @avr_ack_ackno - sequence number being acknowledged
72 * @dccpavr_ack_ptr - pointer into dccpav_buf where this record starts 72 * @avr_ack_ptr - pointer into av_buf where this record starts
73 * @dccpavr_ack_nonce - dccpav_ack_nonce at the time this record was sent 73 * @avr_ack_nonce - av_ack_nonce at the time this record was sent
74 * @dccpavr_sent_len - lenght of the record in dccpav_buf 74 * @avr_sent_len - lenght of the record in av_buf
75 */ 75 */
76struct dccp_ackvec_record { 76struct dccp_ackvec_record {
77 struct list_head dccpavr_node; 77 struct list_head avr_node;
78 u64 dccpavr_ack_seqno; 78 u64 avr_ack_seqno;
79 u64 dccpavr_ack_ackno; 79 u64 avr_ack_ackno;
80 u16 dccpavr_ack_ptr; 80 u16 avr_ack_ptr;
81 u16 dccpavr_sent_len; 81 u16 avr_sent_len;
82 u8 dccpavr_ack_nonce; 82 u8 avr_ack_nonce;
83}; 83};
84 84
85struct sock; 85struct sock;
@@ -105,7 +105,7 @@ extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
105 105
106static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 106static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
107{ 107{
108 return av->dccpav_vec_len; 108 return av->av_vec_len;
109} 109}
110#else /* CONFIG_IP_DCCP_ACKVEC */ 110#else /* CONFIG_IP_DCCP_ACKVEC */
111static inline int dccp_ackvec_init(void) 111static inline int dccp_ackvec_init(void)
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index c45088b5e6fb..4809753d12ae 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -92,15 +92,15 @@ int ccid_register(struct ccid_operations *ccid_ops)
92 92
93 ccid_ops->ccid_hc_rx_slab = 93 ccid_ops->ccid_hc_rx_slab =
94 ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, 94 ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
95 "%s_hc_rx_sock", 95 "ccid%u_hc_rx_sock",
96 ccid_ops->ccid_name); 96 ccid_ops->ccid_id);
97 if (ccid_ops->ccid_hc_rx_slab == NULL) 97 if (ccid_ops->ccid_hc_rx_slab == NULL)
98 goto out; 98 goto out;
99 99
100 ccid_ops->ccid_hc_tx_slab = 100 ccid_ops->ccid_hc_tx_slab =
101 ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, 101 ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
102 "%s_hc_tx_sock", 102 "ccid%u_hc_tx_sock",
103 ccid_ops->ccid_name); 103 ccid_ops->ccid_id);
104 if (ccid_ops->ccid_hc_tx_slab == NULL) 104 if (ccid_ops->ccid_hc_tx_slab == NULL)
105 goto out_free_rx_slab; 105 goto out_free_rx_slab;
106 106
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c65cb2453e43..fdeae7b57319 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -23,14 +23,37 @@
23 23
24struct tcp_info; 24struct tcp_info;
25 25
26/**
27 * struct ccid_operations - Interface to Congestion-Control Infrastructure
28 *
29 * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
30 * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
31 * @ccid_name: alphabetical identifier string for @ccid_id
32 * @ccid_owner: module which implements/owns this CCID
33 * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
34 * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
35 *
36 * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
37 * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
38 * @ccid_hc_rx_packet_recv: implements the HC-receiver side
39 * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
40 * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
41 * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
42 * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
43 * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
44 * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
45 * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
46 */
26struct ccid_operations { 47struct ccid_operations {
27 unsigned char ccid_id; 48 unsigned char ccid_id;
28 const char *ccid_name; 49 __u32 ccid_ccmps;
29 struct module *ccid_owner; 50 const char *ccid_name;
30 struct kmem_cache *ccid_hc_rx_slab; 51 struct module *ccid_owner;
31 __u32 ccid_hc_rx_obj_size; 52 struct kmem_cache *ccid_hc_rx_slab,
32 struct kmem_cache *ccid_hc_tx_slab; 53 *ccid_hc_tx_slab;
33 __u32 ccid_hc_tx_obj_size; 54 __u32 ccid_hc_rx_obj_size,
55 ccid_hc_tx_obj_size;
56 /* Interface Routines */
34 int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); 57 int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
35 int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); 58 int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
36 void (*ccid_hc_rx_exit)(struct sock *sk); 59 void (*ccid_hc_rx_exit)(struct sock *sk);
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 80f469887691..12275943eab8 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,9 +1,8 @@
1menu "DCCP CCIDs Configuration (EXPERIMENTAL)" 1menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
2 depends on IP_DCCP && EXPERIMENTAL 2 depends on EXPERIMENTAL
3 3
4config IP_DCCP_CCID2 4config IP_DCCP_CCID2
5 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" 5 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
6 depends on IP_DCCP
7 def_tristate IP_DCCP 6 def_tristate IP_DCCP
8 select IP_DCCP_ACKVEC 7 select IP_DCCP_ACKVEC
9 ---help--- 8 ---help---
@@ -20,18 +19,9 @@ config IP_DCCP_CCID2
20 to the user. For example, a hypothetical application that 19 to the user. For example, a hypothetical application that
21 transferred files over DCCP, using application-level retransmissions 20 transferred files over DCCP, using application-level retransmissions
22 for lost packets, would prefer CCID 2 to CCID 3. On-line games may 21 for lost packets, would prefer CCID 2 to CCID 3. On-line games may
23 also prefer CCID 2. 22 also prefer CCID 2. See RFC 4341 for further details.
24 23
25 CCID 2 is further described in RFC 4341, 24 CCID2 is the default CCID used by DCCP.
26 http://www.ietf.org/rfc/rfc4341.txt
27
28 This text was extracted from RFC 4340 (sec. 10.1),
29 http://www.ietf.org/rfc/rfc4340.txt
30
31 To compile this CCID as a module, choose M here: the module will be
32 called dccp_ccid2.
33
34 If in doubt, say M.
35 25
36config IP_DCCP_CCID2_DEBUG 26config IP_DCCP_CCID2_DEBUG
37 bool "CCID2 debugging messages" 27 bool "CCID2 debugging messages"
@@ -47,8 +37,8 @@ config IP_DCCP_CCID2_DEBUG
47 37
48config IP_DCCP_CCID3 38config IP_DCCP_CCID3
49 tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" 39 tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
50 depends on IP_DCCP
51 def_tristate IP_DCCP 40 def_tristate IP_DCCP
41 select IP_DCCP_TFRC_LIB
52 ---help--- 42 ---help---
53 CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based 43 CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
54 rate-controlled congestion control mechanism. TFRC is designed to 44 rate-controlled congestion control mechanism. TFRC is designed to
@@ -74,10 +64,6 @@ config IP_DCCP_CCID3
74 64
75 If in doubt, say M. 65 If in doubt, say M.
76 66
77config IP_DCCP_TFRC_LIB
78 depends on IP_DCCP_CCID3
79 def_tristate IP_DCCP_CCID3
80
81config IP_DCCP_CCID3_DEBUG 67config IP_DCCP_CCID3_DEBUG
82 bool "CCID3 debugging messages" 68 bool "CCID3 debugging messages"
83 depends on IP_DCCP_CCID3 69 depends on IP_DCCP_CCID3
@@ -121,5 +107,13 @@ config IP_DCCP_CCID3_RTO
121 is serious network congestion: experimenting with larger values should 107 is serious network congestion: experimenting with larger values should
122 therefore not be performed on WANs. 108 therefore not be performed on WANs.
123 109
110config IP_DCCP_TFRC_LIB
111 tristate
112 default n
113
114config IP_DCCP_TFRC_DEBUG
115 bool
116 depends on IP_DCCP_TFRC_LIB
117 default y if IP_DCCP_CCID3_DEBUG
124 118
125endmenu 119endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index d694656b8800..b5b52ebb2693 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -24,9 +24,6 @@
24 24
25/* 25/*
26 * This implementation should follow RFC 4341 26 * This implementation should follow RFC 4341
27 *
28 * BUGS:
29 * - sequence number wrapping
30 */ 27 */
31 28
32#include "../ccid.h" 29#include "../ccid.h"
@@ -129,50 +126,35 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
129{ 126{
130 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 127 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
131 128
132 ccid2_pr_debug("pipe=%d cwnd=%d\n", hctx->ccid2hctx_pipe, 129 if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
133 hctx->ccid2hctx_cwnd); 130 return 0;
134
135 if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) {
136 /* OK we can send... make sure previous packet was sent off */
137 if (!hctx->ccid2hctx_sendwait) {
138 hctx->ccid2hctx_sendwait = 1;
139 return 0;
140 }
141 }
142 131
143 return 1; /* XXX CCID should dequeue when ready instead of polling */ 132 return 1; /* XXX CCID should dequeue when ready instead of polling */
144} 133}
145 134
146static void ccid2_change_l_ack_ratio(struct sock *sk, int val) 135static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
147{ 136{
148 struct dccp_sock *dp = dccp_sk(sk); 137 struct dccp_sock *dp = dccp_sk(sk);
138 u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
139
149 /* 140 /*
150 * XXX I don't really agree with val != 2. If cwnd is 1, ack ratio 141 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
151 * should be 1... it shouldn't be allowed to become 2. 142 * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
152 * -sorbo. 143 * acceptable since this causes starvation/deadlock whenever cwnd < 2.
144 * The same problem arises when Ack Ratio is 0 (i.e. Ack Ratio disabled).
153 */ 145 */
154 if (val != 2) { 146 if (val == 0 || val > max_ratio) {
155 const struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 147 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
156 int max = hctx->ccid2hctx_cwnd / 2; 148 val = max_ratio;
157
158 /* round up */
159 if (hctx->ccid2hctx_cwnd & 1)
160 max++;
161
162 if (val > max)
163 val = max;
164 } 149 }
150 if (val > 0xFFFF) /* RFC 4340, 11.3 */
151 val = 0xFFFF;
165 152
166 ccid2_pr_debug("changing local ack ratio to %d\n", val); 153 if (val == dp->dccps_l_ack_ratio)
167 WARN_ON(val <= 0); 154 return;
168 dp->dccps_l_ack_ratio = val;
169}
170 155
171static void ccid2_change_cwnd(struct ccid2_hc_tx_sock *hctx, u32 val) 156 ccid2_pr_debug("changing local ack ratio to %u\n", val);
172{ 157 dp->dccps_l_ack_ratio = val;
173 /* XXX do we need to change ack ratio? */
174 hctx->ccid2hctx_cwnd = val? : 1;
175 ccid2_pr_debug("changed cwnd to %u\n", hctx->ccid2hctx_cwnd);
176} 158}
177 159
178static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) 160static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
@@ -181,11 +163,6 @@ static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
181 hctx->ccid2hctx_srtt = val; 163 hctx->ccid2hctx_srtt = val;
182} 164}
183 165
184static void ccid2_change_pipe(struct ccid2_hc_tx_sock *hctx, long val)
185{
186 hctx->ccid2hctx_pipe = val;
187}
188
189static void ccid2_start_rto_timer(struct sock *sk); 166static void ccid2_start_rto_timer(struct sock *sk);
190 167
191static void ccid2_hc_tx_rto_expire(unsigned long data) 168static void ccid2_hc_tx_rto_expire(unsigned long data)
@@ -215,21 +192,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
215 ccid2_start_rto_timer(sk); 192 ccid2_start_rto_timer(sk);
216 193
217 /* adjust pipe, cwnd etc */ 194 /* adjust pipe, cwnd etc */
218 ccid2_change_pipe(hctx, 0); 195 hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
219 hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd >> 1;
220 if (hctx->ccid2hctx_ssthresh < 2) 196 if (hctx->ccid2hctx_ssthresh < 2)
221 hctx->ccid2hctx_ssthresh = 2; 197 hctx->ccid2hctx_ssthresh = 2;
222 ccid2_change_cwnd(hctx, 1); 198 hctx->ccid2hctx_cwnd = 1;
199 hctx->ccid2hctx_pipe = 0;
223 200
224 /* clear state about stuff we sent */ 201 /* clear state about stuff we sent */
225 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; 202 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
226 hctx->ccid2hctx_ssacks = 0; 203 hctx->ccid2hctx_packets_acked = 0;
227 hctx->ccid2hctx_acks = 0;
228 hctx->ccid2hctx_sent = 0;
229 204
230 /* clear ack ratio state. */ 205 /* clear ack ratio state. */
231 hctx->ccid2hctx_arsent = 0;
232 hctx->ccid2hctx_ackloss = 0;
233 hctx->ccid2hctx_rpseq = 0; 206 hctx->ccid2hctx_rpseq = 0;
234 hctx->ccid2hctx_rpdupack = -1; 207 hctx->ccid2hctx_rpdupack = -1;
235 ccid2_change_l_ack_ratio(sk, 1); 208 ccid2_change_l_ack_ratio(sk, 1);
@@ -255,23 +228,10 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
255 struct dccp_sock *dp = dccp_sk(sk); 228 struct dccp_sock *dp = dccp_sk(sk);
256 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 229 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
257 struct ccid2_seq *next; 230 struct ccid2_seq *next;
258 u64 seq;
259
260 ccid2_hc_tx_check_sanity(hctx);
261 231
262 BUG_ON(!hctx->ccid2hctx_sendwait); 232 hctx->ccid2hctx_pipe++;
263 hctx->ccid2hctx_sendwait = 0;
264 ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe + 1);
265 BUG_ON(hctx->ccid2hctx_pipe < 0);
266 233
267 /* There is an issue. What if another packet is sent between 234 hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss;
268 * packet_send() and packet_sent(). Then the sequence number would be
269 * wrong.
270 * -sorbo.
271 */
272 seq = dp->dccps_gss;
273
274 hctx->ccid2hctx_seqh->ccid2s_seq = seq;
275 hctx->ccid2hctx_seqh->ccid2s_acked = 0; 235 hctx->ccid2hctx_seqh->ccid2s_acked = 0;
276 hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; 236 hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
277 237
@@ -291,8 +251,26 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
291 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, 251 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
292 hctx->ccid2hctx_pipe); 252 hctx->ccid2hctx_pipe);
293 253
294 hctx->ccid2hctx_sent++; 254 /*
295 255 * FIXME: The code below is broken and the variables have been removed
256 * from the socket struct. The `ackloss' variable was always set to 0,
257 * and with arsent there are several problems:
258 * (i) it doesn't just count the number of Acks, but all sent packets;
259 * (ii) it is expressed in # of packets, not # of windows, so the
260 * comparison below uses the wrong formula: Appendix A of RFC 4341
261 * comes up with the number K = cwnd / (R^2 - R) of consecutive windows
262 * of data with no lost or marked Ack packets. If arsent were the # of
263 * consecutive Acks received without loss, then Ack Ratio needs to be
264 * decreased by 1 when
265 * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
266 * where cwnd / R is the number of Acks received per window of data
267 * (cf. RFC 4341, App. A). The problems are that
268 * - arsent counts other packets as well;
269 * - the comparison uses a formula different from RFC 4341;
270 * - computing a cubic/quadratic equation each time is too complicated.
271 * Hence a different algorithm is needed.
272 */
273#if 0
296 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ 274 /* Ack Ratio. Need to maintain a concept of how many windows we sent */
297 hctx->ccid2hctx_arsent++; 275 hctx->ccid2hctx_arsent++;
298 /* We had an ack loss in this window... */ 276 /* We had an ack loss in this window... */
@@ -320,14 +298,13 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
320 hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ 298 hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
321 } 299 }
322 } 300 }
301#endif
323 302
324 /* setup RTO timer */ 303 /* setup RTO timer */
325 if (!timer_pending(&hctx->ccid2hctx_rtotimer)) 304 if (!timer_pending(&hctx->ccid2hctx_rtotimer))
326 ccid2_start_rto_timer(sk); 305 ccid2_start_rto_timer(sk);
327 306
328#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 307#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
329 ccid2_pr_debug("pipe=%d\n", hctx->ccid2hctx_pipe);
330 ccid2_pr_debug("Sent: seq=%llu\n", (unsigned long long)seq);
331 do { 308 do {
332 struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; 309 struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
333 310
@@ -419,31 +396,15 @@ static inline void ccid2_new_ack(struct sock *sk,
419{ 396{
420 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 397 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
421 398
422 /* slow start */
423 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { 399 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
424 hctx->ccid2hctx_acks = 0; 400 if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
425 401 hctx->ccid2hctx_cwnd += 1;
426 /* We can increase cwnd at most maxincr [ack_ratio/2] */ 402 *maxincr -= 1;
427 if (*maxincr) { 403 hctx->ccid2hctx_packets_acked = 0;
428 /* increase every 2 acks */
429 hctx->ccid2hctx_ssacks++;
430 if (hctx->ccid2hctx_ssacks == 2) {
431 ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd+1);
432 hctx->ccid2hctx_ssacks = 0;
433 *maxincr = *maxincr - 1;
434 }
435 } else {
436 /* increased cwnd enough for this single ack */
437 hctx->ccid2hctx_ssacks = 0;
438 }
439 } else {
440 hctx->ccid2hctx_ssacks = 0;
441 hctx->ccid2hctx_acks++;
442
443 if (hctx->ccid2hctx_acks >= hctx->ccid2hctx_cwnd) {
444 ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd + 1);
445 hctx->ccid2hctx_acks = 0;
446 } 404 }
405 } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
406 hctx->ccid2hctx_cwnd += 1;
407 hctx->ccid2hctx_packets_acked = 0;
447 } 408 }
448 409
449 /* update RTO */ 410 /* update RTO */
@@ -502,7 +463,6 @@ static inline void ccid2_new_ack(struct sock *sk,
502 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", 463 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
503 hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, 464 hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
504 hctx->ccid2hctx_rto, HZ, r); 465 hctx->ccid2hctx_rto, HZ, r);
505 hctx->ccid2hctx_sent = 0;
506 } 466 }
507 467
508 /* we got a new ack, so re-start RTO timer */ 468 /* we got a new ack, so re-start RTO timer */
@@ -514,16 +474,19 @@ static void ccid2_hc_tx_dec_pipe(struct sock *sk)
514{ 474{
515 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 475 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
516 476
517 ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe-1); 477 if (hctx->ccid2hctx_pipe == 0)
518 BUG_ON(hctx->ccid2hctx_pipe < 0); 478 DCCP_BUG("pipe == 0");
479 else
480 hctx->ccid2hctx_pipe--;
519 481
520 if (hctx->ccid2hctx_pipe == 0) 482 if (hctx->ccid2hctx_pipe == 0)
521 ccid2_hc_tx_kill_rto_timer(sk); 483 ccid2_hc_tx_kill_rto_timer(sk);
522} 484}
523 485
524static void ccid2_congestion_event(struct ccid2_hc_tx_sock *hctx, 486static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
525 struct ccid2_seq *seqp)
526{ 487{
488 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
489
527 if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { 490 if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
528 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 491 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
529 return; 492 return;
@@ -531,10 +494,12 @@ static void ccid2_congestion_event(struct ccid2_hc_tx_sock *hctx,
531 494
532 hctx->ccid2hctx_last_cong = jiffies; 495 hctx->ccid2hctx_last_cong = jiffies;
533 496
534 ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd >> 1); 497 hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U;
535 hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd; 498 hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
536 if (hctx->ccid2hctx_ssthresh < 2) 499
537 hctx->ccid2hctx_ssthresh = 2; 500 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
501 if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
502 ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
538} 503}
539 504
540static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 505static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -570,12 +535,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
570 hctx->ccid2hctx_rpdupack++; 535 hctx->ccid2hctx_rpdupack++;
571 536
572 /* check if we got enough dupacks */ 537 /* check if we got enough dupacks */
573 if (hctx->ccid2hctx_rpdupack >= 538 if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
574 hctx->ccid2hctx_numdupack) {
575 hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ 539 hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
576 hctx->ccid2hctx_rpseq = 0; 540 hctx->ccid2hctx_rpseq = 0;
577 541
578 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio << 1); 542 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
579 } 543 }
580 } 544 }
581 } 545 }
@@ -606,12 +570,13 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
606 } 570 }
607 } 571 }
608 572
609 /* If in slow-start, cwnd can increase at most Ack Ratio / 2 packets for 573 /*
610 * this single ack. I round up. 574 * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
611 * -sorbo. 575 * packets per acknowledgement. Rounding up avoids that cwnd is not
576 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
612 */ 577 */
613 maxincr = dp->dccps_l_ack_ratio >> 1; 578 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
614 maxincr++; 579 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
615 580
616 /* go through all ack vectors */ 581 /* go through all ack vectors */
617 while ((offset = ccid2_ackvector(sk, skb, offset, 582 while ((offset = ccid2_ackvector(sk, skb, offset,
@@ -619,9 +584,8 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
619 /* go through this ack vector */ 584 /* go through this ack vector */
620 while (veclen--) { 585 while (veclen--) {
621 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 586 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
622 u64 ackno_end_rl; 587 u64 ackno_end_rl = SUB48(ackno, rl);
623 588
624 dccp_set_seqno(&ackno_end_rl, ackno - rl);
625 ccid2_pr_debug("ackvec start:%llu end:%llu\n", 589 ccid2_pr_debug("ackvec start:%llu end:%llu\n",
626 (unsigned long long)ackno, 590 (unsigned long long)ackno,
627 (unsigned long long)ackno_end_rl); 591 (unsigned long long)ackno_end_rl);
@@ -651,7 +615,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
651 !seqp->ccid2s_acked) { 615 !seqp->ccid2s_acked) {
652 if (state == 616 if (state ==
653 DCCP_ACKVEC_STATE_ECN_MARKED) { 617 DCCP_ACKVEC_STATE_ECN_MARKED) {
654 ccid2_congestion_event(hctx, 618 ccid2_congestion_event(sk,
655 seqp); 619 seqp);
656 } else 620 } else
657 ccid2_new_ack(sk, seqp, 621 ccid2_new_ack(sk, seqp,
@@ -666,13 +630,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
666 done = 1; 630 done = 1;
667 break; 631 break;
668 } 632 }
669 seqp = seqp->ccid2s_next; 633 seqp = seqp->ccid2s_prev;
670 } 634 }
671 if (done) 635 if (done)
672 break; 636 break;
673 637
674 638 ackno = SUB48(ackno_end_rl, 1);
675 dccp_set_seqno(&ackno, ackno_end_rl - 1);
676 vector++; 639 vector++;
677 } 640 }
678 if (done) 641 if (done)
@@ -694,7 +657,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
694 while (1) { 657 while (1) {
695 if (seqp->ccid2s_acked) { 658 if (seqp->ccid2s_acked) {
696 done++; 659 done++;
697 if (done == hctx->ccid2hctx_numdupack) 660 if (done == NUMDUPACK)
698 break; 661 break;
699 } 662 }
700 if (seqp == hctx->ccid2hctx_seqt) 663 if (seqp == hctx->ccid2hctx_seqt)
@@ -705,7 +668,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
705 /* If there are at least 3 acknowledgements, anything unacknowledged 668 /* If there are at least 3 acknowledgements, anything unacknowledged
706 * below the last sequence number is considered lost 669 * below the last sequence number is considered lost
707 */ 670 */
708 if (done == hctx->ccid2hctx_numdupack) { 671 if (done == NUMDUPACK) {
709 struct ccid2_seq *last_acked = seqp; 672 struct ccid2_seq *last_acked = seqp;
710 673
711 /* check for lost packets */ 674 /* check for lost packets */
@@ -717,7 +680,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
717 * order to detect multiple congestion events in 680 * order to detect multiple congestion events in
718 * one ack vector. 681 * one ack vector.
719 */ 682 */
720 ccid2_congestion_event(hctx, seqp); 683 ccid2_congestion_event(sk, seqp);
721 ccid2_hc_tx_dec_pipe(sk); 684 ccid2_hc_tx_dec_pipe(sk);
722 } 685 }
723 if (seqp == hctx->ccid2hctx_seqt) 686 if (seqp == hctx->ccid2hctx_seqt)
@@ -742,14 +705,23 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
742static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) 705static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
743{ 706{
744 struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid); 707 struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid);
708 struct dccp_sock *dp = dccp_sk(sk);
709 u32 max_ratio;
710
711 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
712 hctx->ccid2hctx_ssthresh = ~0U;
745 713
746 ccid2_change_cwnd(hctx, 1); 714 /*
747 /* Initialize ssthresh to infinity. This means that we will exit the 715 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
748 * initial slow-start after the first packet loss. This is what we 716 * packets for new connections, following the rules from [RFC3390]".
749 * want. 717 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
750 */ 718 */
751 hctx->ccid2hctx_ssthresh = ~0; 719 hctx->ccid2hctx_cwnd = min(4U, max(2U, 4380U / dp->dccps_mss_cache));
752 hctx->ccid2hctx_numdupack = 3; 720
721 /* Make sure that Ack Ratio is enabled and within bounds. */
722 max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
723 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
724 dp->dccps_l_ack_ratio = max_ratio;
753 725
754 /* XXX init ~ to window size... */ 726 /* XXX init ~ to window size... */
755 if (ccid2_hc_tx_alloc_seq(hctx)) 727 if (ccid2_hc_tx_alloc_seq(hctx))
@@ -760,10 +732,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
760 hctx->ccid2hctx_rttvar = -1; 732 hctx->ccid2hctx_rttvar = -1;
761 hctx->ccid2hctx_rpdupack = -1; 733 hctx->ccid2hctx_rpdupack = -1;
762 hctx->ccid2hctx_last_cong = jiffies; 734 hctx->ccid2hctx_last_cong = jiffies;
763 735 setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
764 hctx->ccid2hctx_rtotimer.function = &ccid2_hc_tx_rto_expire; 736 (unsigned long)sk);
765 hctx->ccid2hctx_rtotimer.data = (unsigned long)sk;
766 init_timer(&hctx->ccid2hctx_rtotimer);
767 737
768 ccid2_hc_tx_check_sanity(hctx); 738 ccid2_hc_tx_check_sanity(hctx);
769 return 0; 739 return 0;
@@ -800,7 +770,7 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
800 770
801static struct ccid_operations ccid2 = { 771static struct ccid_operations ccid2 = {
802 .ccid_id = DCCPC_CCID2, 772 .ccid_id = DCCPC_CCID2,
803 .ccid_name = "ccid2", 773 .ccid_name = "TCP-like",
804 .ccid_owner = THIS_MODULE, 774 .ccid_owner = THIS_MODULE,
805 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 775 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
806 .ccid_hc_tx_init = ccid2_hc_tx_init, 776 .ccid_hc_tx_init = ccid2_hc_tx_init,
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index d9daa534c9be..2c94ca029010 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -24,6 +24,8 @@
24#include <linux/timer.h> 24#include <linux/timer.h>
25#include <linux/types.h> 25#include <linux/types.h>
26#include "../ccid.h" 26#include "../ccid.h"
27/* NUMDUPACK parameter from RFC 4341, p. 6 */
28#define NUMDUPACK 3
27 29
28struct sock; 30struct sock;
29 31
@@ -40,22 +42,17 @@ struct ccid2_seq {
40 42
41/** struct ccid2_hc_tx_sock - CCID2 TX half connection 43/** struct ccid2_hc_tx_sock - CCID2 TX half connection
42 * 44 *
43 * @ccid2hctx_ssacks - ACKs recv in slow start 45 * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
44 * @ccid2hctx_acks - ACKS recv in AI phase 46 * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
45 * @ccid2hctx_sent - packets sent in this window
46 * @ccid2hctx_lastrtt - time RTT was last measured 47 * @ccid2hctx_lastrtt - time RTT was last measured
47 * @ccid2hctx_arsent - packets sent [ack ratio]
48 * @ccid2hctx_ackloss - ack was lost in this win
49 * @ccid2hctx_rpseq - last consecutive seqno 48 * @ccid2hctx_rpseq - last consecutive seqno
50 * @ccid2hctx_rpdupack - dupacks since rpseq 49 * @ccid2hctx_rpdupack - dupacks since rpseq
51*/ 50*/
52struct ccid2_hc_tx_sock { 51struct ccid2_hc_tx_sock {
53 u32 ccid2hctx_cwnd; 52 u32 ccid2hctx_cwnd;
54 int ccid2hctx_ssacks; 53 u32 ccid2hctx_ssthresh;
55 int ccid2hctx_acks; 54 u32 ccid2hctx_pipe;
56 unsigned int ccid2hctx_ssthresh; 55 u32 ccid2hctx_packets_acked;
57 int ccid2hctx_pipe;
58 int ccid2hctx_numdupack;
59 struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; 56 struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
60 int ccid2hctx_seqbufc; 57 int ccid2hctx_seqbufc;
61 struct ccid2_seq *ccid2hctx_seqh; 58 struct ccid2_seq *ccid2hctx_seqh;
@@ -63,14 +60,10 @@ struct ccid2_hc_tx_sock {
63 long ccid2hctx_rto; 60 long ccid2hctx_rto;
64 long ccid2hctx_srtt; 61 long ccid2hctx_srtt;
65 long ccid2hctx_rttvar; 62 long ccid2hctx_rttvar;
66 int ccid2hctx_sent;
67 unsigned long ccid2hctx_lastrtt; 63 unsigned long ccid2hctx_lastrtt;
68 struct timer_list ccid2hctx_rtotimer; 64 struct timer_list ccid2hctx_rtotimer;
69 unsigned long ccid2hctx_arsent;
70 int ccid2hctx_ackloss;
71 u64 ccid2hctx_rpseq; 65 u64 ccid2hctx_rpseq;
72 int ccid2hctx_rpdupack; 66 int ccid2hctx_rpdupack;
73 int ccid2hctx_sendwait;
74 unsigned long ccid2hctx_last_cong; 67 unsigned long ccid2hctx_last_cong;
75 u64 ccid2hctx_high_ack; 68 u64 ccid2hctx_high_ack;
76}; 69};
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 19b33586333d..e76f460af0ea 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * net/dccp/ccids/ccid3.c 2 * net/dccp/ccids/ccid3.c
3 * 3 *
4 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
4 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. 5 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
5 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> 6 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
6 * 7 *
@@ -33,11 +34,7 @@
33 * along with this program; if not, write to the Free Software 34 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 */ 36 */
36#include "../ccid.h"
37#include "../dccp.h" 37#include "../dccp.h"
38#include "lib/packet_history.h"
39#include "lib/loss_interval.h"
40#include "lib/tfrc.h"
41#include "ccid3.h" 38#include "ccid3.h"
42 39
43#include <asm/unaligned.h> 40#include <asm/unaligned.h>
@@ -49,9 +46,6 @@ static int ccid3_debug;
49#define ccid3_pr_debug(format, a...) 46#define ccid3_pr_debug(format, a...)
50#endif 47#endif
51 48
52static struct dccp_tx_hist *ccid3_tx_hist;
53static struct dccp_rx_hist *ccid3_rx_hist;
54
55/* 49/*
56 * Transmitter Half-Connection Routines 50 * Transmitter Half-Connection Routines
57 */ 51 */
@@ -83,24 +77,27 @@ static void ccid3_hc_tx_set_state(struct sock *sk,
83} 77}
84 78
85/* 79/*
86 * Compute the initial sending rate X_init according to RFC 3390: 80 * Compute the initial sending rate X_init in the manner of RFC 3390:
87 * w_init = min(4 * MSS, max(2 * MSS, 4380 bytes)) 81 *
88 * X_init = w_init / RTT 82 * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
83 *
84 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
85 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
89 * For consistency with other parts of the code, X_init is scaled by 2^6. 86 * For consistency with other parts of the code, X_init is scaled by 2^6.
90 */ 87 */
91static inline u64 rfc3390_initial_rate(struct sock *sk) 88static inline u64 rfc3390_initial_rate(struct sock *sk)
92{ 89{
93 const struct dccp_sock *dp = dccp_sk(sk); 90 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
94 const __u32 w_init = min(4 * dp->dccps_mss_cache, 91 const __u32 w_init = min_t(__u32, 4 * hctx->ccid3hctx_s,
95 max(2 * dp->dccps_mss_cache, 4380U)); 92 max_t(__u32, 2 * hctx->ccid3hctx_s, 4380));
96 93
97 return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->ccid3hctx_rtt); 94 return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
98} 95}
99 96
100/* 97/*
101 * Recalculate t_ipi and delta (should be called whenever X changes) 98 * Recalculate t_ipi and delta (should be called whenever X changes)
102 */ 99 */
103static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) 100static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
104{ 101{
105 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ 102 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
106 hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, 103 hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
@@ -116,6 +113,13 @@ static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
116 113
117} 114}
118 115
116static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
117{
118 u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
119
120 return delta / hctx->ccid3hctx_rtt;
121}
122
119/** 123/**
120 * ccid3_hc_tx_update_x - Update allowed sending rate X 124 * ccid3_hc_tx_update_x - Update allowed sending rate X
121 * @stamp: most recent time if available - can be left NULL. 125 * @stamp: most recent time if available - can be left NULL.
@@ -127,19 +131,19 @@ static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
127 * 131 *
128 */ 132 */
129static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) 133static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
130
131{ 134{
132 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 135 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
133 __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; 136 __u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
134 const __u64 old_x = hctx->ccid3hctx_x; 137 const __u64 old_x = hctx->ccid3hctx_x;
135 ktime_t now = stamp? *stamp : ktime_get_real(); 138 ktime_t now = stamp ? *stamp : ktime_get_real();
136 139
137 /* 140 /*
138 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate 141 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
139 * when idling [RFC 4342, 5.1]. See also draft-ietf-dccp-rfc3448bis. 142 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
143 * a sender is idle if it has not sent anything over a 2-RTT-period.
140 * For consistency with X and X_recv, min_rate is also scaled by 2^6. 144 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
141 */ 145 */
142 if (unlikely(hctx->ccid3hctx_idle)) { 146 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
143 min_rate = rfc3390_initial_rate(sk); 147 min_rate = rfc3390_initial_rate(sk);
144 min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); 148 min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
145 } 149 }
@@ -181,7 +185,7 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
181{ 185{
182 const u16 old_s = hctx->ccid3hctx_s; 186 const u16 old_s = hctx->ccid3hctx_s;
183 187
184 hctx->ccid3hctx_s = old_s == 0 ? len : (9 * old_s + len) / 10; 188 hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
185 189
186 if (hctx->ccid3hctx_s != old_s) 190 if (hctx->ccid3hctx_s != old_s)
187 ccid3_update_send_interval(hctx); 191 ccid3_update_send_interval(hctx);
@@ -225,29 +229,27 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
225 ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, 229 ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
226 ccid3_tx_state_name(hctx->ccid3hctx_state)); 230 ccid3_tx_state_name(hctx->ccid3hctx_state));
227 231
228 hctx->ccid3hctx_idle = 1; 232 if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
233 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
234 else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
235 goto out;
229 236
230 switch (hctx->ccid3hctx_state) { 237 /*
231 case TFRC_SSTATE_NO_FBACK: 238 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
232 /* RFC 3448, 4.4: Halve send rate directly */ 239 */
240 if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */
241 hctx->ccid3hctx_p == 0) {
242
243 /* halve send rate directly */
233 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, 244 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
234 (((__u64)hctx->ccid3hctx_s) << 6) / 245 (((__u64)hctx->ccid3hctx_s) << 6) /
235 TFRC_T_MBI); 246 TFRC_T_MBI);
236
237 ccid3_pr_debug("%s(%p, state=%s), updated tx rate to %u "
238 "bytes/s\n", dccp_role(sk), sk,
239 ccid3_tx_state_name(hctx->ccid3hctx_state),
240 (unsigned)(hctx->ccid3hctx_x >> 6));
241 /* The value of R is still undefined and so we can not recompute
242 * the timeout value. Keep initial value as per [RFC 4342, 5]. */
243 t_nfb = TFRC_INITIAL_TIMEOUT;
244 ccid3_update_send_interval(hctx); 247 ccid3_update_send_interval(hctx);
245 break; 248 } else {
246 case TFRC_SSTATE_FBACK:
247 /* 249 /*
248 * Modify the cached value of X_recv [RFC 3448, 4.4] 250 * Modify the cached value of X_recv
249 * 251 *
250 * If (p == 0 || X_calc > 2 * X_recv) 252 * If (X_calc > 2 * X_recv)
251 * X_recv = max(X_recv / 2, s / (2 * t_mbi)); 253 * X_recv = max(X_recv / 2, s / (2 * t_mbi));
252 * Else 254 * Else
253 * X_recv = X_calc / 4; 255 * X_recv = X_calc / 4;
@@ -256,32 +258,28 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
256 */ 258 */
257 BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); 259 BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
258 260
259 if (hctx->ccid3hctx_p == 0 || 261 if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
260 (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))) {
261
262 hctx->ccid3hctx_x_recv = 262 hctx->ccid3hctx_x_recv =
263 max(hctx->ccid3hctx_x_recv / 2, 263 max(hctx->ccid3hctx_x_recv / 2,
264 (((__u64)hctx->ccid3hctx_s) << 6) / 264 (((__u64)hctx->ccid3hctx_s) << 6) /
265 (2 * TFRC_T_MBI)); 265 (2 * TFRC_T_MBI));
266 } else { 266 else {
267 hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; 267 hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
268 hctx->ccid3hctx_x_recv <<= 4; 268 hctx->ccid3hctx_x_recv <<= 4;
269 } 269 }
270 /* Now recalculate X [RFC 3448, 4.3, step (4)] */
271 ccid3_hc_tx_update_x(sk, NULL); 270 ccid3_hc_tx_update_x(sk, NULL);
272 /*
273 * Schedule no feedback timer to expire in
274 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
275 * See comments in packet_recv() regarding the value of t_RTO.
276 */
277 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
278 break;
279 case TFRC_SSTATE_NO_SENT:
280 DCCP_BUG("%s(%p) - Illegal state NO_SENT", dccp_role(sk), sk);
281 /* fall through */
282 case TFRC_SSTATE_TERM:
283 goto out;
284 } 271 }
272 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
273 (unsigned long long)hctx->ccid3hctx_x);
274
275 /*
276 * Set new timeout for the nofeedback timer.
277 * See comments in packet_recv() regarding the value of t_RTO.
278 */
279 if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */
280 t_nfb = TFRC_INITIAL_TIMEOUT;
281 else
282 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
285 283
286restart_timer: 284restart_timer:
287 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 285 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
@@ -336,8 +334,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
336 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 334 hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
337 hctx->ccid3hctx_t_ld = now; 335 hctx->ccid3hctx_t_ld = now;
338 } else { 336 } else {
339 /* Sender does not have RTT sample: X = MSS/second */ 337 /* Sender does not have RTT sample: X_pps = 1 pkt/sec */
340 hctx->ccid3hctx_x = dp->dccps_mss_cache; 338 hctx->ccid3hctx_x = hctx->ccid3hctx_s;
341 hctx->ccid3hctx_x <<= 6; 339 hctx->ccid3hctx_x <<= 6;
342 } 340 }
343 ccid3_update_send_interval(hctx); 341 ccid3_update_send_interval(hctx);
@@ -369,7 +367,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
369 /* prepare to send now (add options etc.) */ 367 /* prepare to send now (add options etc.) */
370 dp->dccps_hc_tx_insert_options = 1; 368 dp->dccps_hc_tx_insert_options = 1;
371 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; 369 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
372 hctx->ccid3hctx_idle = 0;
373 370
374 /* set the nominal send time for the next following packet */ 371 /* set the nominal send time for the next following packet */
375 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, 372 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
@@ -381,28 +378,17 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
381 unsigned int len) 378 unsigned int len)
382{ 379{
383 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 380 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
384 struct dccp_tx_hist_entry *packet;
385 381
386 ccid3_hc_tx_update_s(hctx, len); 382 ccid3_hc_tx_update_s(hctx, len);
387 383
388 packet = dccp_tx_hist_entry_new(ccid3_tx_hist, GFP_ATOMIC); 384 if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
389 if (unlikely(packet == NULL)) {
390 DCCP_CRIT("packet history - out of memory!"); 385 DCCP_CRIT("packet history - out of memory!");
391 return;
392 }
393 dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, packet);
394
395 packet->dccphtx_tstamp = ktime_get_real();
396 packet->dccphtx_seqno = dccp_sk(sk)->dccps_gss;
397 packet->dccphtx_rtt = hctx->ccid3hctx_rtt;
398 packet->dccphtx_sent = 1;
399} 386}
400 387
401static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 388static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
402{ 389{
403 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 390 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
404 struct ccid3_options_received *opt_recv; 391 struct ccid3_options_received *opt_recv;
405 struct dccp_tx_hist_entry *packet;
406 ktime_t now; 392 ktime_t now;
407 unsigned long t_nfb; 393 unsigned long t_nfb;
408 u32 pinv, r_sample; 394 u32 pinv, r_sample;
@@ -411,131 +397,112 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
411 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 397 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
412 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 398 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
413 return; 399 return;
400 /* ... and only in the established state */
401 if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
402 hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
403 return;
414 404
415 opt_recv = &hctx->ccid3hctx_options_received; 405 opt_recv = &hctx->ccid3hctx_options_received;
406 now = ktime_get_real();
416 407
417 switch (hctx->ccid3hctx_state) { 408 /* Estimate RTT from history if ACK number is valid */
418 case TFRC_SSTATE_NO_FBACK: 409 r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
419 case TFRC_SSTATE_FBACK: 410 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
420 /* get packet from history to look up t_recvdata */ 411 if (r_sample == 0) {
421 packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, 412 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
422 DCCP_SKB_CB(skb)->dccpd_ack_seq); 413 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
423 if (unlikely(packet == NULL)) { 414 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
424 DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist " 415 return;
425 "in history!\n", dccp_role(sk), sk, 416 }
426 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
427 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
428 return;
429 }
430
431 /* Update receive rate in units of 64 * bytes/second */
432 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
433 hctx->ccid3hctx_x_recv <<= 6;
434 417
435 /* Update loss event rate */ 418 /* Update receive rate in units of 64 * bytes/second */
436 pinv = opt_recv->ccid3or_loss_event_rate; 419 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
437 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ 420 hctx->ccid3hctx_x_recv <<= 6;
438 hctx->ccid3hctx_p = 0;
439 else /* can not exceed 100% */
440 hctx->ccid3hctx_p = 1000000 / pinv;
441 421
442 now = ktime_get_real(); 422 /* Update loss event rate (which is scaled by 1e6) */
443 /* 423 pinv = opt_recv->ccid3or_loss_event_rate;
444 * Calculate new round trip sample as per [RFC 3448, 4.3] by 424 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
445 * R_sample = (now - t_recvdata) - t_elapsed 425 hctx->ccid3hctx_p = 0;
446 */ 426 else /* can not exceed 100% */
447 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet->dccphtx_tstamp)); 427 hctx->ccid3hctx_p = scaled_div(1, pinv);
428 /*
429 * Validate new RTT sample and update moving average
430 */
431 r_sample = dccp_sample_rtt(sk, r_sample);
432 hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
433 /*
434 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
435 */
436 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
437 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
448 438
449 /* 439 if (hctx->ccid3hctx_t_rto == 0) {
450 * Update RTT estimate by
451 * If (No feedback recv)
452 * R = R_sample;
453 * Else
454 * R = q * R + (1 - q) * R_sample;
455 *
456 * q is a constant, RFC 3448 recomments 0.9
457 */
458 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
459 /* 440 /*
460 * Larger Initial Windows [RFC 4342, sec. 5] 441 * Initial feedback packet: Larger Initial Windows (4.2)
461 */ 442 */
462 hctx->ccid3hctx_rtt = r_sample;
463 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 443 hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
464 hctx->ccid3hctx_t_ld = now; 444 hctx->ccid3hctx_t_ld = now;
465 445
466 ccid3_update_send_interval(hctx); 446 ccid3_update_send_interval(hctx);
467 447
468 ccid3_pr_debug("%s(%p), s=%u, MSS=%u, " 448 goto done_computing_x;
469 "R_sample=%uus, X=%u\n", dccp_role(sk), 449 } else if (hctx->ccid3hctx_p == 0) {
470 sk, hctx->ccid3hctx_s, 450 /*
471 dccp_sk(sk)->dccps_mss_cache, r_sample, 451 * First feedback after nofeedback timer expiry (4.3)
472 (unsigned)(hctx->ccid3hctx_x >> 6)); 452 */
473 453 goto done_computing_x;
474 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
475 } else {
476 hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt +
477 r_sample) / 10;
478
479 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
480 if (hctx->ccid3hctx_p > 0)
481 hctx->ccid3hctx_x_calc =
482 tfrc_calc_x(hctx->ccid3hctx_s,
483 hctx->ccid3hctx_rtt,
484 hctx->ccid3hctx_p);
485 ccid3_hc_tx_update_x(sk, &now);
486
487 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
488 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
489 dccp_role(sk),
490 sk, hctx->ccid3hctx_rtt, r_sample,
491 hctx->ccid3hctx_s, hctx->ccid3hctx_p,
492 hctx->ccid3hctx_x_calc,
493 (unsigned)(hctx->ccid3hctx_x_recv >> 6),
494 (unsigned)(hctx->ccid3hctx_x >> 6));
495 } 454 }
455 }
496 456
497 /* unschedule no feedback timer */ 457 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
498 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 458 if (hctx->ccid3hctx_p > 0)
459 hctx->ccid3hctx_x_calc =
460 tfrc_calc_x(hctx->ccid3hctx_s,
461 hctx->ccid3hctx_rtt,
462 hctx->ccid3hctx_p);
463 ccid3_hc_tx_update_x(sk, &now);
464
465done_computing_x:
466 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
467 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
468 dccp_role(sk),
469 sk, hctx->ccid3hctx_rtt, r_sample,
470 hctx->ccid3hctx_s, hctx->ccid3hctx_p,
471 hctx->ccid3hctx_x_calc,
472 (unsigned)(hctx->ccid3hctx_x_recv >> 6),
473 (unsigned)(hctx->ccid3hctx_x >> 6));
499 474
500 /* remove all packets older than the one acked from history */ 475 /* unschedule no feedback timer */
501 dccp_tx_hist_purge_older(ccid3_tx_hist, 476 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
502 &hctx->ccid3hctx_hist, packet);
503 /*
504 * As we have calculated new ipi, delta, t_nom it is possible
505 * that we now can send a packet, so wake up dccp_wait_for_ccid
506 */
507 sk->sk_write_space(sk);
508 477
509 /* 478 /*
510 * Update timeout interval for the nofeedback timer. 479 * As we have calculated new ipi, delta, t_nom it is possible
511 * We use a configuration option to increase the lower bound. 480 * that we now can send a packet, so wake up dccp_wait_for_ccid
512 * This can help avoid triggering the nofeedback timer too 481 */
513 * often ('spinning') on LANs with small RTTs. 482 sk->sk_write_space(sk);
514 */
515 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
516 CONFIG_IP_DCCP_CCID3_RTO *
517 (USEC_PER_SEC/1000));
518 /*
519 * Schedule no feedback timer to expire in
520 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
521 */
522 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
523 483
524 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 484 /*
525 "expire in %lu jiffies (%luus)\n", 485 * Update timeout interval for the nofeedback timer.
526 dccp_role(sk), 486 * We use a configuration option to increase the lower bound.
527 sk, usecs_to_jiffies(t_nfb), t_nfb); 487 * This can help avoid triggering the nofeedback timer too
488 * often ('spinning') on LANs with small RTTs.
489 */
490 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
491 (CONFIG_IP_DCCP_CCID3_RTO *
492 (USEC_PER_SEC / 1000)));
493 /*
494 * Schedule no feedback timer to expire in
495 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
496 */
497 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
528 498
529 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 499 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
530 jiffies + usecs_to_jiffies(t_nfb)); 500 "expire in %lu jiffies (%luus)\n",
501 dccp_role(sk),
502 sk, usecs_to_jiffies(t_nfb), t_nfb);
531 503
532 /* set idle flag */ 504 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
533 hctx->ccid3hctx_idle = 1; 505 jiffies + usecs_to_jiffies(t_nfb));
534 break;
535 case TFRC_SSTATE_NO_SENT: /* fall through */
536 case TFRC_SSTATE_TERM: /* ignore feedback when closing */
537 break;
538 }
539} 506}
540 507
541static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 508static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
@@ -605,12 +572,9 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
605 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 572 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
606 573
607 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; 574 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
608 INIT_LIST_HEAD(&hctx->ccid3hctx_hist); 575 hctx->ccid3hctx_hist = NULL;
609 576 setup_timer(&hctx->ccid3hctx_no_feedback_timer,
610 hctx->ccid3hctx_no_feedback_timer.function = 577 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
611 ccid3_hc_tx_no_feedback_timer;
612 hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
613 init_timer(&hctx->ccid3hctx_no_feedback_timer);
614 578
615 return 0; 579 return 0;
616} 580}
@@ -622,8 +586,7 @@ static void ccid3_hc_tx_exit(struct sock *sk)
622 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); 586 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
623 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 587 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
624 588
625 /* Empty packet history */ 589 tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
626 dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
627} 590}
628 591
629static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 592static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
@@ -670,6 +633,15 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
670/* 633/*
671 * Receiver Half-Connection Routines 634 * Receiver Half-Connection Routines
672 */ 635 */
636
637/* CCID3 feedback types */
638enum ccid3_fback_type {
639 CCID3_FBACK_NONE = 0,
640 CCID3_FBACK_INITIAL,
641 CCID3_FBACK_PERIODIC,
642 CCID3_FBACK_PARAM_CHANGE
643};
644
673#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 645#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
674static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) 646static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
675{ 647{
@@ -696,67 +668,58 @@ static void ccid3_hc_rx_set_state(struct sock *sk,
696 hcrx->ccid3hcrx_state = state; 668 hcrx->ccid3hcrx_state = state;
697} 669}
698 670
699static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len) 671static void ccid3_hc_rx_send_feedback(struct sock *sk,
700{ 672 const struct sk_buff *skb,
701 if (unlikely(len == 0)) /* don't update on empty packets (e.g. ACKs) */ 673 enum ccid3_fback_type fbtype)
702 ccid3_pr_debug("Packet payload length is 0 - not updating\n");
703 else
704 hcrx->ccid3hcrx_s = hcrx->ccid3hcrx_s == 0 ? len :
705 (9 * hcrx->ccid3hcrx_s + len) / 10;
706}
707
708static void ccid3_hc_rx_send_feedback(struct sock *sk)
709{ 674{
710 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 675 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
711 struct dccp_sock *dp = dccp_sk(sk); 676 struct dccp_sock *dp = dccp_sk(sk);
712 struct dccp_rx_hist_entry *packet;
713 ktime_t now; 677 ktime_t now;
714 suseconds_t delta; 678 s64 delta = 0;
715 679
716 ccid3_pr_debug("%s(%p) - entry \n", dccp_role(sk), sk); 680 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
681 return;
717 682
718 now = ktime_get_real(); 683 now = ktime_get_real();
719 684
720 switch (hcrx->ccid3hcrx_state) { 685 switch (fbtype) {
721 case TFRC_RSTATE_NO_DATA: 686 case CCID3_FBACK_INITIAL:
722 hcrx->ccid3hcrx_x_recv = 0; 687 hcrx->ccid3hcrx_x_recv = 0;
688 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */
723 break; 689 break;
724 case TFRC_RSTATE_DATA: 690 case CCID3_FBACK_PARAM_CHANGE:
725 delta = ktime_us_delta(now, 691 /*
726 hcrx->ccid3hcrx_tstamp_last_feedback); 692 * When parameters change (new loss or p > p_prev), we do not
727 DCCP_BUG_ON(delta < 0); 693 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
728 hcrx->ccid3hcrx_x_recv = 694 * need to reuse the previous value of X_recv. However, when
729 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 695 * X_recv was 0 (due to early loss), this would kill X down to
696 * s/t_mbi (i.e. one packet in 64 seconds).
697 * To avoid such drastic reduction, we approximate X_recv as
698 * the number of bytes since last feedback.
699 * This is a safe fallback, since X is bounded above by X_calc.
700 */
701 if (hcrx->ccid3hcrx_x_recv > 0)
702 break;
703 /* fall through */
704 case CCID3_FBACK_PERIODIC:
705 delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
706 if (delta <= 0)
707 DCCP_BUG("delta (%ld) <= 0", (long)delta);
708 else
709 hcrx->ccid3hcrx_x_recv =
710 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
730 break; 711 break;
731 case TFRC_RSTATE_TERM: 712 default:
732 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
733 return; 713 return;
734 } 714 }
735 715
736 packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist); 716 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
737 if (unlikely(packet == NULL)) { 717 hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
738 DCCP_WARN("%s(%p), no data packet in history!\n",
739 dccp_role(sk), sk);
740 return;
741 }
742 718
743 hcrx->ccid3hcrx_tstamp_last_feedback = now; 719 hcrx->ccid3hcrx_tstamp_last_feedback = now;
744 hcrx->ccid3hcrx_ccval_last_counter = packet->dccphrx_ccval; 720 hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
745 hcrx->ccid3hcrx_bytes_recv = 0; 721 hcrx->ccid3hcrx_bytes_recv = 0;
746 722
747 /* Elapsed time information [RFC 4340, 13.2] in units of 10 * usecs */
748 delta = ktime_us_delta(now, packet->dccphrx_tstamp);
749 DCCP_BUG_ON(delta < 0);
750 hcrx->ccid3hcrx_elapsed_time = delta / 10;
751
752 if (hcrx->ccid3hcrx_p == 0)
753 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */
754 else if (hcrx->ccid3hcrx_p > 1000000) {
755 DCCP_WARN("p (%u) > 100%%\n", hcrx->ccid3hcrx_p);
756 hcrx->ccid3hcrx_pinv = 1; /* use 100% in this case */
757 } else
758 hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
759
760 dp->dccps_hc_rx_insert_options = 1; 723 dp->dccps_hc_rx_insert_options = 1;
761 dccp_send_ack(sk); 724 dccp_send_ack(sk);
762} 725}
@@ -770,7 +733,6 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
770 return 0; 733 return 0;
771 734
772 hcrx = ccid3_hc_rx_sk(sk); 735 hcrx = ccid3_hc_rx_sk(sk);
773 DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter;
774 736
775 if (dccp_packet_without_ack(skb)) 737 if (dccp_packet_without_ack(skb))
776 return 0; 738 return 0;
@@ -778,11 +740,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
778 x_recv = htonl(hcrx->ccid3hcrx_x_recv); 740 x_recv = htonl(hcrx->ccid3hcrx_x_recv);
779 pinv = htonl(hcrx->ccid3hcrx_pinv); 741 pinv = htonl(hcrx->ccid3hcrx_pinv);
780 742
781 if ((hcrx->ccid3hcrx_elapsed_time != 0 && 743 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
782 dccp_insert_option_elapsed_time(sk, skb,
783 hcrx->ccid3hcrx_elapsed_time)) ||
784 dccp_insert_option_timestamp(sk, skb) ||
785 dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
786 &pinv, sizeof(pinv)) || 744 &pinv, sizeof(pinv)) ||
787 dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, 745 dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
788 &x_recv, sizeof(x_recv))) 746 &x_recv, sizeof(x_recv)))
@@ -791,180 +749,139 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
791 return 0; 749 return 0;
792} 750}
793 751
794static int ccid3_hc_rx_detect_loss(struct sock *sk, 752/** ccid3_first_li - Implements [RFC 3448, 6.3.1]
795 struct dccp_rx_hist_entry *packet) 753 *
754 * Determine the length of the first loss interval via inverse lookup.
755 * Assume that X_recv can be computed by the throughput equation
756 * s
757 * X_recv = --------
758 * R * fval
759 * Find some p such that f(p) = fval; return 1/p (scaled).
760 */
761static u32 ccid3_first_li(struct sock *sk)
796{ 762{
797 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 763 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
798 struct dccp_rx_hist_entry *rx_hist = 764 u32 x_recv, p, delta;
799 dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); 765 u64 fval;
800 u64 seqno = packet->dccphrx_seqno;
801 u64 tmp_seqno;
802 int loss = 0;
803 u8 ccval;
804
805
806 tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
807 766
808 if (!rx_hist || 767 if (hcrx->ccid3hcrx_rtt == 0) {
809 follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { 768 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
810 hcrx->ccid3hcrx_seqno_nonloss = seqno; 769 hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
811 hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
812 goto detect_out;
813 } 770 }
814 771
815 772 delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
816 while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno) 773 x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
817 > TFRC_RECV_NUM_LATE_LOSS) { 774 if (x_recv == 0) { /* would also trigger divide-by-zero */
818 loss = 1; 775 DCCP_WARN("X_recv==0\n");
819 dccp_li_update_li(sk, 776 if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
820 &hcrx->ccid3hcrx_li_hist, 777 DCCP_BUG("stored value of X_recv is zero");
821 &hcrx->ccid3hcrx_hist, 778 return ~0U;
822 hcrx->ccid3hcrx_tstamp_last_feedback,
823 hcrx->ccid3hcrx_s,
824 hcrx->ccid3hcrx_bytes_recv,
825 hcrx->ccid3hcrx_x_recv,
826 hcrx->ccid3hcrx_seqno_nonloss,
827 hcrx->ccid3hcrx_ccval_nonloss);
828 tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
829 dccp_inc_seqno(&tmp_seqno);
830 hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
831 dccp_inc_seqno(&tmp_seqno);
832 while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist,
833 tmp_seqno, &ccval)) {
834 hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
835 hcrx->ccid3hcrx_ccval_nonloss = ccval;
836 dccp_inc_seqno(&tmp_seqno);
837 } 779 }
838 } 780 }
839 781
840 /* FIXME - this code could be simplified with above while */ 782 fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
841 /* but works at moment */ 783 fval = scaled_div32(fval, x_recv);
842 if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { 784 p = tfrc_calc_x_reverse_lookup(fval);
843 hcrx->ccid3hcrx_seqno_nonloss = seqno;
844 hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
845 }
846 785
847detect_out: 786 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
848 dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, 787 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
849 &hcrx->ccid3hcrx_li_hist, packet, 788
850 hcrx->ccid3hcrx_seqno_nonloss); 789 return p == 0 ? ~0U : scaled_div(1, p);
851 return loss;
852} 790}
853 791
854static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 792static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
855{ 793{
856 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 794 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
857 const struct dccp_options_received *opt_recv; 795 enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
858 struct dccp_rx_hist_entry *packet; 796 const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
859 u32 p_prev, r_sample, rtt_prev; 797 const bool is_data_packet = dccp_data_packet(skb);
860 int loss, payload_size; 798
861 ktime_t now; 799 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
862 800 if (is_data_packet) {
863 opt_recv = &dccp_sk(sk)->dccps_options_received; 801 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
864 802 do_feedback = CCID3_FBACK_INITIAL;
865 switch (DCCP_SKB_CB(skb)->dccpd_type) { 803 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
866 case DCCP_PKT_ACK: 804 hcrx->ccid3hcrx_s = payload;
867 if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) 805 /*
868 return; 806 * Not necessary to update ccid3hcrx_bytes_recv here,
869 case DCCP_PKT_DATAACK: 807 * since X_recv = 0 for the first feedback packet (cf.
870 if (opt_recv->dccpor_timestamp_echo == 0) 808 * RFC 3448, 6.3) -- gerrit
871 break; 809 */
872 r_sample = dccp_timestamp() - opt_recv->dccpor_timestamp_echo; 810 }
873 rtt_prev = hcrx->ccid3hcrx_rtt; 811 goto update_records;
874 r_sample = dccp_sample_rtt(sk, 10 * r_sample); 812 }
875 813
876 if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) 814 if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
877 hcrx->ccid3hcrx_rtt = r_sample; 815 return; /* done receiving */
878 else
879 hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
880 r_sample / 10;
881 816
882 if (rtt_prev != hcrx->ccid3hcrx_rtt) 817 if (is_data_packet) {
883 ccid3_pr_debug("%s(%p), New RTT=%uus, elapsed time=%u\n", 818 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
884 dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, 819 /*
885 opt_recv->dccpor_elapsed_time); 820 * Update moving-average of s and the sum of received payload bytes
886 break; 821 */
887 case DCCP_PKT_DATA: 822 hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
888 break; 823 hcrx->ccid3hcrx_bytes_recv += payload;
889 default: /* We're not interested in other packet types, move along */
890 return;
891 } 824 }
892 825
893 packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, 826 /*
894 skb, GFP_ATOMIC); 827 * Handle pending losses and otherwise check for new loss
895 if (unlikely(packet == NULL)) { 828 */
896 DCCP_WARN("%s(%p), Not enough mem to add rx packet " 829 if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) &&
897 "to history, consider it lost!\n", dccp_role(sk), sk); 830 tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist,
898 return; 831 &hcrx->ccid3hcrx_li_hist,
832 skb, ndp, ccid3_first_li, sk) ) {
833 do_feedback = CCID3_FBACK_PARAM_CHANGE;
834 goto done_receiving;
899 } 835 }
900 836
901 loss = ccid3_hc_rx_detect_loss(sk, packet); 837 if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp))
838 goto update_records;
902 839
903 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) 840 /*
904 return; 841 * Handle data packets: RTT sampling and monitoring p
905 842 */
906 payload_size = skb->len - dccp_hdr(skb)->dccph_doff * 4; 843 if (unlikely(!is_data_packet))
907 ccid3_hc_rx_update_s(hcrx, payload_size); 844 goto update_records;
908 845
909 switch (hcrx->ccid3hcrx_state) { 846 if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
910 case TFRC_RSTATE_NO_DATA: 847 const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
911 ccid3_pr_debug("%s(%p, state=%s), skb=%p, sending initial " 848 /*
912 "feedback\n", dccp_role(sk), sk, 849 * Empty loss history: no loss so far, hence p stays 0.
913 dccp_state_name(sk->sk_state), skb); 850 * Sample RTT values, since an RTT estimate is required for the
914 ccid3_hc_rx_send_feedback(sk); 851 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
915 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); 852 */
916 return; 853 if (sample != 0)
917 case TFRC_RSTATE_DATA: 854 hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
918 hcrx->ccid3hcrx_bytes_recv += payload_size;
919 if (loss)
920 break;
921 855
922 now = ktime_get_real(); 856 } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
923 if ((ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_ack) - 857 /*
924 (s64)hcrx->ccid3hcrx_rtt) >= 0) { 858 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
925 hcrx->ccid3hcrx_tstamp_last_ack = now; 859 * has decreased (resp. p has increased), send feedback now.
926 ccid3_hc_rx_send_feedback(sk); 860 */
927 } 861 do_feedback = CCID3_FBACK_PARAM_CHANGE;
928 return;
929 case TFRC_RSTATE_TERM:
930 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
931 return;
932 } 862 }
933 863
934 /* Dealing with packet loss */ 864 /*
935 ccid3_pr_debug("%s(%p, state=%s), data loss! Reacting...\n", 865 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
936 dccp_role(sk), sk, dccp_state_name(sk->sk_state)); 866 */
937 867 if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
938 p_prev = hcrx->ccid3hcrx_p; 868 do_feedback = CCID3_FBACK_PERIODIC;
939
940 /* Calculate loss event rate */
941 if (!list_empty(&hcrx->ccid3hcrx_li_hist)) {
942 u32 i_mean = dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
943 869
944 /* Scaling up by 1000000 as fixed decimal */ 870update_records:
945 if (i_mean != 0) 871 tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
946 hcrx->ccid3hcrx_p = 1000000 / i_mean;
947 } else
948 DCCP_BUG("empty loss history");
949 872
950 if (hcrx->ccid3hcrx_p > p_prev) { 873done_receiving:
951 ccid3_hc_rx_send_feedback(sk); 874 if (do_feedback)
952 return; 875 ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
953 }
954} 876}
955 877
956static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 878static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
957{ 879{
958 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 880 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
959 881
960 ccid3_pr_debug("entry\n");
961
962 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; 882 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
963 INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); 883 tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
964 INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); 884 return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
965 hcrx->ccid3hcrx_tstamp_last_feedback =
966 hcrx->ccid3hcrx_tstamp_last_ack = ktime_get_real();
967 return 0;
968} 885}
969 886
970static void ccid3_hc_rx_exit(struct sock *sk) 887static void ccid3_hc_rx_exit(struct sock *sk)
@@ -973,11 +890,8 @@ static void ccid3_hc_rx_exit(struct sock *sk)
973 890
974 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); 891 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
975 892
976 /* Empty packet history */ 893 tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
977 dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); 894 tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
978
979 /* Empty loss interval history */
980 dccp_li_hist_purge(&hcrx->ccid3hcrx_li_hist);
981} 895}
982 896
983static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 897static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
@@ -998,6 +912,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
998 u32 __user *optval, int __user *optlen) 912 u32 __user *optval, int __user *optlen)
999{ 913{
1000 const struct ccid3_hc_rx_sock *hcrx; 914 const struct ccid3_hc_rx_sock *hcrx;
915 struct tfrc_rx_info rx_info;
1001 const void *val; 916 const void *val;
1002 917
1003 /* Listen socks doesn't have a private CCID block */ 918 /* Listen socks doesn't have a private CCID block */
@@ -1007,10 +922,14 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
1007 hcrx = ccid3_hc_rx_sk(sk); 922 hcrx = ccid3_hc_rx_sk(sk);
1008 switch (optname) { 923 switch (optname) {
1009 case DCCP_SOCKOPT_CCID_RX_INFO: 924 case DCCP_SOCKOPT_CCID_RX_INFO:
1010 if (len < sizeof(hcrx->ccid3hcrx_tfrc)) 925 if (len < sizeof(rx_info))
1011 return -EINVAL; 926 return -EINVAL;
1012 len = sizeof(hcrx->ccid3hcrx_tfrc); 927 rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
1013 val = &hcrx->ccid3hcrx_tfrc; 928 rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt;
929 rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
930 scaled_div(1, hcrx->ccid3hcrx_pinv);
931 len = sizeof(rx_info);
932 val = &rx_info;
1014 break; 933 break;
1015 default: 934 default:
1016 return -ENOPROTOOPT; 935 return -ENOPROTOOPT;
@@ -1024,7 +943,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
1024 943
1025static struct ccid_operations ccid3 = { 944static struct ccid_operations ccid3 = {
1026 .ccid_id = DCCPC_CCID3, 945 .ccid_id = DCCPC_CCID3,
1027 .ccid_name = "ccid3", 946 .ccid_name = "TCP-Friendly Rate Control",
1028 .ccid_owner = THIS_MODULE, 947 .ccid_owner = THIS_MODULE,
1029 .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), 948 .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
1030 .ccid_hc_tx_init = ccid3_hc_tx_init, 949 .ccid_hc_tx_init = ccid3_hc_tx_init,
@@ -1051,44 +970,13 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
1051 970
1052static __init int ccid3_module_init(void) 971static __init int ccid3_module_init(void)
1053{ 972{
1054 int rc = -ENOBUFS; 973 return ccid_register(&ccid3);
1055
1056 ccid3_rx_hist = dccp_rx_hist_new("ccid3");
1057 if (ccid3_rx_hist == NULL)
1058 goto out;
1059
1060 ccid3_tx_hist = dccp_tx_hist_new("ccid3");
1061 if (ccid3_tx_hist == NULL)
1062 goto out_free_rx;
1063
1064 rc = ccid_register(&ccid3);
1065 if (rc != 0)
1066 goto out_free_tx;
1067out:
1068 return rc;
1069
1070out_free_tx:
1071 dccp_tx_hist_delete(ccid3_tx_hist);
1072 ccid3_tx_hist = NULL;
1073out_free_rx:
1074 dccp_rx_hist_delete(ccid3_rx_hist);
1075 ccid3_rx_hist = NULL;
1076 goto out;
1077} 974}
1078module_init(ccid3_module_init); 975module_init(ccid3_module_init);
1079 976
1080static __exit void ccid3_module_exit(void) 977static __exit void ccid3_module_exit(void)
1081{ 978{
1082 ccid_unregister(&ccid3); 979 ccid_unregister(&ccid3);
1083
1084 if (ccid3_tx_hist != NULL) {
1085 dccp_tx_hist_delete(ccid3_tx_hist);
1086 ccid3_tx_hist = NULL;
1087 }
1088 if (ccid3_rx_hist != NULL) {
1089 dccp_rx_hist_delete(ccid3_rx_hist);
1090 ccid3_rx_hist = NULL;
1091 }
1092} 980}
1093module_exit(ccid3_module_exit); 981module_exit(ccid3_module_exit);
1094 982
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 0cdc982cfe47..49ca32bd7e79 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -1,7 +1,8 @@
1/* 1/*
2 * net/dccp/ccids/ccid3.h 2 * net/dccp/ccids/ccid3.h
3 * 3 *
4 * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. 4 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
5 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
5 * 6 *
6 * An implementation of the DCCP protocol 7 * An implementation of the DCCP protocol
7 * 8 *
@@ -40,6 +41,7 @@
40#include <linux/list.h> 41#include <linux/list.h>
41#include <linux/types.h> 42#include <linux/types.h>
42#include <linux/tfrc.h> 43#include <linux/tfrc.h>
44#include "lib/tfrc.h"
43#include "../ccid.h" 45#include "../ccid.h"
44 46
45/* Two seconds as per RFC 3448 4.2 */ 47/* Two seconds as per RFC 3448 4.2 */
@@ -88,7 +90,6 @@ enum ccid3_hc_tx_states {
88 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet 90 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
89 * with last_win_count value sent 91 * with last_win_count value sent
90 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer 92 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
91 * @ccid3hctx_idle - Flag indicating that sender is idling
92 * @ccid3hctx_t_ld - Time last doubled during slow start 93 * @ccid3hctx_t_ld - Time last doubled during slow start
93 * @ccid3hctx_t_nom - Nominal send time of next packet 94 * @ccid3hctx_t_nom - Nominal send time of next packet
94 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs 95 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
@@ -107,13 +108,12 @@ struct ccid3_hc_tx_sock {
107 u16 ccid3hctx_s; 108 u16 ccid3hctx_s;
108 enum ccid3_hc_tx_states ccid3hctx_state:8; 109 enum ccid3_hc_tx_states ccid3hctx_state:8;
109 u8 ccid3hctx_last_win_count; 110 u8 ccid3hctx_last_win_count;
110 u8 ccid3hctx_idle;
111 ktime_t ccid3hctx_t_last_win_count; 111 ktime_t ccid3hctx_t_last_win_count;
112 struct timer_list ccid3hctx_no_feedback_timer; 112 struct timer_list ccid3hctx_no_feedback_timer;
113 ktime_t ccid3hctx_t_ld; 113 ktime_t ccid3hctx_t_ld;
114 ktime_t ccid3hctx_t_nom; 114 ktime_t ccid3hctx_t_nom;
115 u32 ccid3hctx_delta; 115 u32 ccid3hctx_delta;
116 struct list_head ccid3hctx_hist; 116 struct tfrc_tx_hist_entry *ccid3hctx_hist;
117 struct ccid3_options_received ccid3hctx_options_received; 117 struct ccid3_options_received ccid3hctx_options_received;
118}; 118};
119 119
@@ -135,37 +135,30 @@ enum ccid3_hc_rx_states {
135 * 135 *
136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) 136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) 137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
138 * @ccid3hcrx_p - current loss event rate (RFC 3448 5.4) 138 * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4)
139 * @ccid3hcrx_seqno_nonloss - Last received non-loss sequence number 139 * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1)
140 * @ccid3hcrx_ccval_nonloss - Last received non-loss Window CCVal 140 * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states
141 * @ccid3hcrx_ccval_last_counter - Tracks window counter (RFC 4342, 8.1)
142 * @ccid3hcrx_state - receiver state, one of %ccid3_hc_rx_states
143 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes 141 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
142 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
143 * @ccid3hcrx_rtt - Receiver estimate of RTT
144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent 144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent 145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
146 * @ccid3hcrx_hist - Packet history 146 * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
147 * @ccid3hcrx_li_hist - Loss Interval History 147 * @ccid3hcrx_li_hist - Loss Interval database
148 * @ccid3hcrx_s - Received packet size in bytes 148 * @ccid3hcrx_s - Received packet size in bytes
149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) 149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
150 * @ccid3hcrx_elapsed_time - Time since packet reception
151 */ 150 */
152struct ccid3_hc_rx_sock { 151struct ccid3_hc_rx_sock {
153 struct tfrc_rx_info ccid3hcrx_tfrc; 152 u8 ccid3hcrx_last_counter:4;
154#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv
155#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt
156#define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p
157 u64 ccid3hcrx_seqno_nonloss:48,
158 ccid3hcrx_ccval_nonloss:4,
159 ccid3hcrx_ccval_last_counter:4;
160 enum ccid3_hc_rx_states ccid3hcrx_state:8; 153 enum ccid3_hc_rx_states ccid3hcrx_state:8;
161 u32 ccid3hcrx_bytes_recv; 154 u32 ccid3hcrx_bytes_recv;
155 u32 ccid3hcrx_x_recv;
156 u32 ccid3hcrx_rtt;
162 ktime_t ccid3hcrx_tstamp_last_feedback; 157 ktime_t ccid3hcrx_tstamp_last_feedback;
163 ktime_t ccid3hcrx_tstamp_last_ack; 158 struct tfrc_rx_hist ccid3hcrx_hist;
164 struct list_head ccid3hcrx_hist; 159 struct tfrc_loss_hist ccid3hcrx_li_hist;
165 struct list_head ccid3hcrx_li_hist;
166 u16 ccid3hcrx_s; 160 u16 ccid3hcrx_s;
167 u32 ccid3hcrx_pinv; 161#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
168 u32 ccid3hcrx_elapsed_time;
169}; 162};
170 163
171static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) 164static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
index 5f940a6cbaca..68c93e3d89dc 100644
--- a/net/dccp/ccids/lib/Makefile
+++ b/net/dccp/ccids/lib/Makefile
@@ -1,3 +1,3 @@
1obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o 1obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
2 2
3dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o 3dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index d26b88dbbb45..849e181e698f 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * net/dccp/ccids/lib/loss_interval.c 2 * net/dccp/ccids/lib/loss_interval.c
3 * 3 *
4 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
4 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. 5 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
5 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> 6 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> 7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
@@ -10,285 +11,176 @@
10 * the Free Software Foundation; either version 2 of the License, or 11 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version. 12 * (at your option) any later version.
12 */ 13 */
13
14#include <linux/module.h>
15#include <net/sock.h> 14#include <net/sock.h>
16#include "../../dccp.h"
17#include "loss_interval.h"
18#include "packet_history.h"
19#include "tfrc.h" 15#include "tfrc.h"
20 16
21#define DCCP_LI_HIST_IVAL_F_LENGTH 8 17static struct kmem_cache *tfrc_lh_slab __read_mostly;
22 18/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
23struct dccp_li_hist_entry { 19static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
24 struct list_head dccplih_node;
25 u64 dccplih_seqno:48,
26 dccplih_win_count:4;
27 u32 dccplih_interval;
28};
29 20
30static struct kmem_cache *dccp_li_cachep __read_mostly; 21/* implements LIFO semantics on the array */
31 22static inline u8 LIH_INDEX(const u8 ctr)
32static inline struct dccp_li_hist_entry *dccp_li_hist_entry_new(const gfp_t prio)
33{ 23{
34 return kmem_cache_alloc(dccp_li_cachep, prio); 24 return (LIH_SIZE - 1 - (ctr % LIH_SIZE));
35} 25}
36 26
37static inline void dccp_li_hist_entry_delete(struct dccp_li_hist_entry *entry) 27/* the `counter' index always points at the next entry to be populated */
28static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
38{ 29{
39 if (entry != NULL) 30 return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL;
40 kmem_cache_free(dccp_li_cachep, entry);
41} 31}
42 32
43void dccp_li_hist_purge(struct list_head *list) 33/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
34static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
44{ 35{
45 struct dccp_li_hist_entry *entry, *next; 36 BUG_ON(i >= lh->counter);
46 37 return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
47 list_for_each_entry_safe(entry, next, list, dccplih_node) {
48 list_del_init(&entry->dccplih_node);
49 kmem_cache_free(dccp_li_cachep, entry);
50 }
51} 38}
52 39
53EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
54
55/* Weights used to calculate loss event rate */
56/* 40/*
57 * These are integers as per section 8 of RFC3448. We can then divide by 4 * 41 * On-demand allocation and de-allocation of entries
58 * when we use it.
59 */ 42 */
60static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = { 43static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
61 4, 4, 4, 4, 3, 2, 1, 1,
62};
63
64u32 dccp_li_hist_calc_i_mean(struct list_head *list)
65{ 44{
66 struct dccp_li_hist_entry *li_entry, *li_next; 45 if (lh->ring[LIH_INDEX(lh->counter)] == NULL)
67 int i = 0; 46 lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab,
68 u32 i_tot; 47 GFP_ATOMIC);
69 u32 i_tot0 = 0; 48 return lh->ring[LIH_INDEX(lh->counter)];
70 u32 i_tot1 = 0;
71 u32 w_tot = 0;
72
73 list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
74 if (li_entry->dccplih_interval != ~0U) {
75 i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
76 w_tot += dccp_li_hist_w[i];
77 if (i != 0)
78 i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
79 }
80
81
82 if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
83 break;
84 }
85
86 if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
87 return 0;
88
89 i_tot = max(i_tot0, i_tot1);
90
91 if (!w_tot) {
92 DCCP_WARN("w_tot = 0\n");
93 return 1;
94 }
95
96 return i_tot / w_tot;
97} 49}
98 50
99EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); 51void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
100
101static int dccp_li_hist_interval_new(struct list_head *list,
102 const u64 seq_loss, const u8 win_loss)
103{ 52{
104 struct dccp_li_hist_entry *entry; 53 if (!tfrc_lh_is_initialised(lh))
105 int i; 54 return;
106 55
107 for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) { 56 for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
108 entry = dccp_li_hist_entry_new(GFP_ATOMIC); 57 if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
109 if (entry == NULL) { 58 kmem_cache_free(tfrc_lh_slab,
110 dccp_li_hist_purge(list); 59 lh->ring[LIH_INDEX(lh->counter)]);
111 DCCP_BUG("loss interval list entry is NULL"); 60 lh->ring[LIH_INDEX(lh->counter)] = NULL;
112 return 0;
113 } 61 }
114 entry->dccplih_interval = ~0;
115 list_add(&entry->dccplih_node, list);
116 }
117
118 entry->dccplih_seqno = seq_loss;
119 entry->dccplih_win_count = win_loss;
120 return 1;
121} 62}
63EXPORT_SYMBOL_GPL(tfrc_lh_cleanup);
122 64
123/* calculate first loss interval 65static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
124 *
125 * returns estimated loss interval in usecs */
126static u32 dccp_li_calc_first_li(struct sock *sk,
127 struct list_head *hist_list,
128 ktime_t last_feedback,
129 u16 s, u32 bytes_recv,
130 u32 previous_x_recv)
131{ 66{
132 struct dccp_rx_hist_entry *entry, *next, *tail = NULL; 67 u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
133 u32 x_recv, p; 68 int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
134 suseconds_t rtt, delta;
135 ktime_t tstamp = ktime_set(0, 0);
136 int interval = 0;
137 int win_count = 0;
138 int step = 0;
139 u64 fval;
140 69
141 list_for_each_entry_safe(entry, next, hist_list, dccphrx_node) { 70 for (i=0; i <= k; i++) {
142 if (dccp_rx_hist_entry_data_packet(entry)) { 71 i_i = tfrc_lh_get_interval(lh, i);
143 tail = entry;
144 72
145 switch (step) { 73 if (i < k) {
146 case 0: 74 i_tot0 += i_i * tfrc_lh_weights[i];
147 tstamp = entry->dccphrx_tstamp; 75 w_tot += tfrc_lh_weights[i];
148 win_count = entry->dccphrx_ccval;
149 step = 1;
150 break;
151 case 1:
152 interval = win_count - entry->dccphrx_ccval;
153 if (interval < 0)
154 interval += TFRC_WIN_COUNT_LIMIT;
155 if (interval > 4)
156 goto found;
157 break;
158 }
159 } 76 }
77 if (i > 0)
78 i_tot1 += i_i * tfrc_lh_weights[i-1];
160 } 79 }
161 80
162 if (unlikely(step == 0)) { 81 BUG_ON(w_tot == 0);
163 DCCP_WARN("%s(%p), packet history has no data packets!\n", 82 lh->i_mean = max(i_tot0, i_tot1) / w_tot;
164 dccp_role(sk), sk); 83}
165 return ~0;
166 }
167
168 if (unlikely(interval == 0)) {
169 DCCP_WARN("%s(%p), Could not find a win_count interval > 0. "
170 "Defaulting to 1\n", dccp_role(sk), sk);
171 interval = 1;
172 }
173found:
174 if (!tail) {
175 DCCP_CRIT("tail is null\n");
176 return ~0;
177 }
178
179 delta = ktime_us_delta(tstamp, tail->dccphrx_tstamp);
180 DCCP_BUG_ON(delta < 0);
181 84
182 rtt = delta * 4 / interval; 85/**
183 dccp_pr_debug("%s(%p), approximated RTT to %dus\n", 86 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
184 dccp_role(sk), sk, (int)rtt); 87 * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
88 */
89u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
90{
91 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
92 u32 old_i_mean = lh->i_mean;
93 s64 length;
185 94
186 /* 95 if (cur == NULL) /* not initialised */
187 * Determine the length of the first loss interval via inverse lookup. 96 return 0;
188 * Assume that X_recv can be computed by the throughput equation
189 * s
190 * X_recv = --------
191 * R * fval
192 * Find some p such that f(p) = fval; return 1/p [RFC 3448, 6.3.1].
193 */
194 if (rtt == 0) { /* would result in divide-by-zero */
195 DCCP_WARN("RTT==0\n");
196 return ~0;
197 }
198 97
199 delta = ktime_us_delta(ktime_get_real(), last_feedback); 98 length = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq);
200 DCCP_BUG_ON(delta <= 0);
201 99
202 x_recv = scaled_div32(bytes_recv, delta); 100 if (length - cur->li_length <= 0) /* duplicate or reordered */
203 if (x_recv == 0) { /* would also trigger divide-by-zero */ 101 return 0;
204 DCCP_WARN("X_recv==0\n");
205 if (previous_x_recv == 0) {
206 DCCP_BUG("stored value of X_recv is zero");
207 return ~0;
208 }
209 x_recv = previous_x_recv;
210 }
211 102
212 fval = scaled_div(s, rtt); 103 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
213 fval = scaled_div32(fval, x_recv); 104 /*
214 p = tfrc_calc_x_reverse_lookup(fval); 105 * Implements RFC 4342, 10.2:
106 * If a packet S (skb) exists whose seqno comes `after' the one
107 * starting the current loss interval (cur) and if the modulo-16
108 * distance from C(cur) to C(S) is greater than 4, consider all
109 * subsequent packets as belonging to a new loss interval. This
110 * test is necessary since CCVal may wrap between intervals.
111 */
112 cur->li_is_closed = 1;
113
114 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
115 return 0;
215 116
216 dccp_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 117 cur->li_length = length;
217 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 118 tfrc_lh_calc_i_mean(lh);
218 119
219 if (p == 0) 120 return (lh->i_mean < old_i_mean);
220 return ~0;
221 else
222 return 1000000 / p;
223} 121}
122EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
224 123
225void dccp_li_update_li(struct sock *sk, 124/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
226 struct list_head *li_hist_list, 125static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
227 struct list_head *hist_list, 126 struct tfrc_rx_hist_entry *new_loss)
228 ktime_t last_feedback, u16 s, u32 bytes_recv,
229 u32 previous_x_recv, u64 seq_loss, u8 win_loss)
230{ 127{
231 struct dccp_li_hist_entry *head; 128 return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 &&
232 u64 seq_temp; 129 (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
233 130}
234 if (list_empty(li_hist_list)) {
235 if (!dccp_li_hist_interval_new(li_hist_list, seq_loss,
236 win_loss))
237 return;
238
239 head = list_entry(li_hist_list->next, struct dccp_li_hist_entry,
240 dccplih_node);
241 head->dccplih_interval = dccp_li_calc_first_li(sk, hist_list,
242 last_feedback,
243 s, bytes_recv,
244 previous_x_recv);
245 } else {
246 struct dccp_li_hist_entry *entry;
247 struct list_head *tail;
248 131
249 head = list_entry(li_hist_list->next, struct dccp_li_hist_entry, 132/** tfrc_lh_interval_add - Insert new record into the Loss Interval database
250 dccplih_node); 133 * @lh: Loss Interval database
251 /* FIXME win count check removed as was wrong */ 134 * @rh: Receive history containing a fresh loss event
252 /* should make this check with receive history */ 135 * @calc_first_li: Caller-dependent routine to compute length of first interval
253 /* and compare there as per section 10.2 of RFC4342 */ 136 * @sk: Used by @calc_first_li in caller-specific way (subtyping)
137 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
138 */
139int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
140 u32 (*calc_first_li)(struct sock *), struct sock *sk)
141{
142 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
254 143
255 /* new loss event detected */ 144 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
256 /* calculate last interval length */ 145 return 0;
257 seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss);
258 entry = dccp_li_hist_entry_new(GFP_ATOMIC);
259 146
260 if (entry == NULL) { 147 new = tfrc_lh_demand_next(lh);
261 DCCP_BUG("out of memory - can not allocate entry"); 148 if (unlikely(new == NULL)) {
262 return; 149 DCCP_CRIT("Cannot allocate/add loss record.");
263 } 150 return 0;
151 }
264 152
265 list_add(&entry->dccplih_node, li_hist_list); 153 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
154 new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
155 new->li_is_closed = 0;
266 156
267 tail = li_hist_list->prev; 157 if (++lh->counter == 1)
268 list_del(tail); 158 lh->i_mean = new->li_length = (*calc_first_li)(sk);
269 kmem_cache_free(dccp_li_cachep, tail); 159 else {
160 cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
161 new->li_length = dccp_delta_seqno(new->li_seqno,
162 tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno);
163 if (lh->counter > (2*LIH_SIZE))
164 lh->counter -= LIH_SIZE;
270 165
271 /* Create the newest interval */ 166 tfrc_lh_calc_i_mean(lh);
272 entry->dccplih_seqno = seq_loss;
273 entry->dccplih_interval = seq_temp;
274 entry->dccplih_win_count = win_loss;
275 } 167 }
168 return 1;
276} 169}
170EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
277 171
278EXPORT_SYMBOL_GPL(dccp_li_update_li); 172int __init tfrc_li_init(void)
279
280static __init int dccp_li_init(void)
281{ 173{
282 dccp_li_cachep = kmem_cache_create("dccp_li_hist", 174 tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
283 sizeof(struct dccp_li_hist_entry), 175 sizeof(struct tfrc_loss_interval), 0,
284 0, SLAB_HWCACHE_ALIGN, NULL); 176 SLAB_HWCACHE_ALIGN, NULL);
285 return dccp_li_cachep == NULL ? -ENOBUFS : 0; 177 return tfrc_lh_slab == NULL ? -ENOBUFS : 0;
286} 178}
287 179
288static __exit void dccp_li_exit(void) 180void tfrc_li_exit(void)
289{ 181{
290 kmem_cache_destroy(dccp_li_cachep); 182 if (tfrc_lh_slab != NULL) {
183 kmem_cache_destroy(tfrc_lh_slab);
184 tfrc_lh_slab = NULL;
185 }
291} 186}
292
293module_init(dccp_li_init);
294module_exit(dccp_li_exit);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 27bee92dae13..246018a3b269 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -3,6 +3,7 @@
3/* 3/*
4 * net/dccp/ccids/lib/loss_interval.h 4 * net/dccp/ccids/lib/loss_interval.h
5 * 5 *
6 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
6 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. 7 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
7 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> 8 * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> 9 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
@@ -12,18 +13,63 @@
12 * Software Foundation; either version 2 of the License, or (at your option) 13 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version. 14 * any later version.
14 */ 15 */
15
16#include <linux/ktime.h> 16#include <linux/ktime.h>
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/slab.h>
19
20/*
21 * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
22 * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
23 */
24#define NINTERVAL 8
25#define LIH_SIZE (NINTERVAL + 1)
26
27/**
28 * tfrc_loss_interval - Loss history record for TFRC-based protocols
29 * @li_seqno: Highest received seqno before the start of loss
30 * @li_ccval: The CCVal belonging to @li_seqno
31 * @li_is_closed: Whether @li_seqno is older than 1 RTT
32 * @li_length: Loss interval sequence length
33 */
34struct tfrc_loss_interval {
35 u64 li_seqno:48,
36 li_ccval:4,
37 li_is_closed:1;
38 u32 li_length;
39};
40
41/**
42 * tfrc_loss_hist - Loss record database
43 * @ring: Circular queue managed in LIFO manner
44 * @counter: Current count of entries (can be more than %LIH_SIZE)
45 * @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
46 */
47struct tfrc_loss_hist {
48 struct tfrc_loss_interval *ring[LIH_SIZE];
49 u8 counter;
50 u32 i_mean;
51};
52
53static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
54{
55 memset(lh, 0, sizeof(struct tfrc_loss_hist));
56}
57
58static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
59{
60 return lh->counter > 0;
61}
62
63static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
64{
65 return min(lh->counter, (u8)LIH_SIZE);
66}
18 67
19extern void dccp_li_hist_purge(struct list_head *list); 68struct tfrc_rx_hist;
20 69
21extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); 70extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
71 u32 (*first_li)(struct sock *), struct sock *);
72extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
22 74
23extern void dccp_li_update_li(struct sock *sk,
24 struct list_head *li_hist_list,
25 struct list_head *hist_list,
26 ktime_t last_feedback, u16 s,
27 u32 bytes_recv, u32 previous_x_recv,
28 u64 seq_loss, u8 win_loss);
29#endif /* _DCCP_LI_HIST_ */ 75#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 34c4f6047724..20af1a693427 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * net/dccp/packet_history.c 2 * net/dccp/packet_history.c
3 * 3 *
4 * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. 4 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
5 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
5 * 6 *
6 * An implementation of the DCCP protocol 7 * An implementation of the DCCP protocol
7 * 8 *
@@ -34,267 +35,465 @@
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 35 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 */ 36 */
36 37
37#include <linux/module.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/slab.h>
39#include "packet_history.h" 40#include "packet_history.h"
41#include "../../dccp.h"
42
43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
40 54
41/* 55/*
42 * Transmitter History Routines 56 * Transmitter History Routines
43 */ 57 */
44struct dccp_tx_hist *dccp_tx_hist_new(const char *name) 58static struct kmem_cache *tfrc_tx_hist_slab;
59
60int __init tfrc_tx_packet_history_init(void)
45{ 61{
46 struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); 62 tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
47 static const char dccp_tx_hist_mask[] = "tx_hist_%s"; 63 sizeof(struct tfrc_tx_hist_entry),
48 char *slab_name; 64 0, SLAB_HWCACHE_ALIGN, NULL);
49 65 return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
50 if (hist == NULL)
51 goto out;
52
53 slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
54 GFP_ATOMIC);
55 if (slab_name == NULL)
56 goto out_free_hist;
57
58 sprintf(slab_name, dccp_tx_hist_mask, name);
59 hist->dccptxh_slab = kmem_cache_create(slab_name,
60 sizeof(struct dccp_tx_hist_entry),
61 0, SLAB_HWCACHE_ALIGN,
62 NULL);
63 if (hist->dccptxh_slab == NULL)
64 goto out_free_slab_name;
65out:
66 return hist;
67out_free_slab_name:
68 kfree(slab_name);
69out_free_hist:
70 kfree(hist);
71 hist = NULL;
72 goto out;
73} 66}
74 67
75EXPORT_SYMBOL_GPL(dccp_tx_hist_new); 68void tfrc_tx_packet_history_exit(void)
76
77void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
78{ 69{
79 const char* name = kmem_cache_name(hist->dccptxh_slab); 70 if (tfrc_tx_hist_slab != NULL) {
80 71 kmem_cache_destroy(tfrc_tx_hist_slab);
81 kmem_cache_destroy(hist->dccptxh_slab); 72 tfrc_tx_hist_slab = NULL;
82 kfree(name); 73 }
83 kfree(hist);
84} 74}
85 75
86EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); 76static struct tfrc_tx_hist_entry *
87 77 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
88struct dccp_tx_hist_entry *
89 dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
90{ 78{
91 struct dccp_tx_hist_entry *packet = NULL, *entry; 79 while (head != NULL && head->seqno != seqno)
92 80 head = head->next;
93 list_for_each_entry(entry, list, dccphtx_node)
94 if (entry->dccphtx_seqno == seq) {
95 packet = entry;
96 break;
97 }
98 81
99 return packet; 82 return head;
100} 83}
101 84
102EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); 85int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
86{
87 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
88
89 if (entry == NULL)
90 return -ENOBUFS;
91 entry->seqno = seqno;
92 entry->stamp = ktime_get_real();
93 entry->next = *headp;
94 *headp = entry;
95 return 0;
96}
97EXPORT_SYMBOL_GPL(tfrc_tx_hist_add);
103 98
104void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) 99void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
105{ 100{
106 struct dccp_tx_hist_entry *entry, *next; 101 struct tfrc_tx_hist_entry *head = *headp;
102
103 while (head != NULL) {
104 struct tfrc_tx_hist_entry *next = head->next;
107 105
108 list_for_each_entry_safe(entry, next, list, dccphtx_node) { 106 kmem_cache_free(tfrc_tx_hist_slab, head);
109 list_del_init(&entry->dccphtx_node); 107 head = next;
110 dccp_tx_hist_entry_delete(hist, entry);
111 } 108 }
112}
113 109
114EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); 110 *headp = NULL;
111}
112EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
115 113
116void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, 114u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
117 struct list_head *list, 115 const ktime_t now)
118 struct dccp_tx_hist_entry *packet)
119{ 116{
120 struct dccp_tx_hist_entry *next; 117 u32 rtt = 0;
118 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
121 119
122 list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { 120 if (packet != NULL) {
123 list_del_init(&packet->dccphtx_node); 121 rtt = ktime_us_delta(now, packet->stamp);
124 dccp_tx_hist_entry_delete(hist, packet); 122 /*
123 * Garbage-collect older (irrelevant) entries:
124 */
125 tfrc_tx_hist_purge(&packet->next);
125 } 126 }
127
128 return rtt;
126} 129}
130EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
127 131
128EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
129 132
130/* 133/*
131 * Receiver History Routines 134 * Receiver History Routines
132 */ 135 */
133struct dccp_rx_hist *dccp_rx_hist_new(const char *name) 136static struct kmem_cache *tfrc_rx_hist_slab;
137
138int __init tfrc_rx_packet_history_init(void)
134{ 139{
135 struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); 140 tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
136 static const char dccp_rx_hist_mask[] = "rx_hist_%s"; 141 sizeof(struct tfrc_rx_hist_entry),
137 char *slab_name; 142 0, SLAB_HWCACHE_ALIGN, NULL);
138 143 return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
139 if (hist == NULL)
140 goto out;
141
142 slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
143 GFP_ATOMIC);
144 if (slab_name == NULL)
145 goto out_free_hist;
146
147 sprintf(slab_name, dccp_rx_hist_mask, name);
148 hist->dccprxh_slab = kmem_cache_create(slab_name,
149 sizeof(struct dccp_rx_hist_entry),
150 0, SLAB_HWCACHE_ALIGN,
151 NULL);
152 if (hist->dccprxh_slab == NULL)
153 goto out_free_slab_name;
154out:
155 return hist;
156out_free_slab_name:
157 kfree(slab_name);
158out_free_hist:
159 kfree(hist);
160 hist = NULL;
161 goto out;
162} 144}
163 145
164EXPORT_SYMBOL_GPL(dccp_rx_hist_new); 146void tfrc_rx_packet_history_exit(void)
147{
148 if (tfrc_rx_hist_slab != NULL) {
149 kmem_cache_destroy(tfrc_rx_hist_slab);
150 tfrc_rx_hist_slab = NULL;
151 }
152}
165 153
166void dccp_rx_hist_delete(struct dccp_rx_hist *hist) 154static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
155 const struct sk_buff *skb,
156 const u32 ndp)
167{ 157{
168 const char* name = kmem_cache_name(hist->dccprxh_slab); 158 const struct dccp_hdr *dh = dccp_hdr(skb);
169 159
170 kmem_cache_destroy(hist->dccprxh_slab); 160 entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
171 kfree(name); 161 entry->tfrchrx_ccval = dh->dccph_ccval;
172 kfree(hist); 162 entry->tfrchrx_type = dh->dccph_type;
163 entry->tfrchrx_ndp = ndp;
164 entry->tfrchrx_tstamp = ktime_get_real();
173} 165}
174 166
175EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); 167void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
168 const struct sk_buff *skb,
169 const u32 ndp)
170{
171 struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
172
173 tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
174}
175EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet);
176 176
177int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq, 177/* has the packet contained in skb been seen before? */
178 u8 *ccval) 178int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
179{ 179{
180 struct dccp_rx_hist_entry *packet = NULL, *entry; 180 const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq;
181 int i;
181 182
182 list_for_each_entry(entry, list, dccphrx_node) 183 if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0)
183 if (entry->dccphrx_seqno == seq) { 184 return 1;
184 packet = entry;
185 break;
186 }
187 185
188 if (packet) 186 for (i = 1; i <= h->loss_count; i++)
189 *ccval = packet->dccphrx_ccval; 187 if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq)
188 return 1;
190 189
191 return packet != NULL; 190 return 0;
192} 191}
192EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
193 193
194EXPORT_SYMBOL_GPL(dccp_rx_hist_find_entry); 194static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
195struct dccp_rx_hist_entry *
196 dccp_rx_hist_find_data_packet(const struct list_head *list)
197{ 195{
198 struct dccp_rx_hist_entry *entry, *packet = NULL; 196 const u8 idx_a = tfrc_rx_hist_index(h, a),
199 197 idx_b = tfrc_rx_hist_index(h, b);
200 list_for_each_entry(entry, list, dccphrx_node) 198 struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
201 if (entry->dccphrx_type == DCCP_PKT_DATA ||
202 entry->dccphrx_type == DCCP_PKT_DATAACK) {
203 packet = entry;
204 break;
205 }
206 199
207 return packet; 200 h->ring[idx_a] = h->ring[idx_b];
201 h->ring[idx_b] = tmp;
208} 202}
209 203
210EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); 204/*
205 * Private helper functions for loss detection.
206 *
207 * In the descriptions, `Si' refers to the sequence number of entry number i,
208 * whose NDP count is `Ni' (lower case is used for variables).
209 * Note: All __after_loss functions expect that a test against duplicates has
210 * been performed already: the seqno of the skb must not be less than the
211 * seqno of loss_prev; and it must not equal that of any valid hist_entry.
212 */
213static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
214{
215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
216 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
217 s2 = DCCP_SKB_CB(skb)->dccpd_seq;
218 int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp,
219 d12 = dccp_delta_seqno(s1, s2), d2;
220
221 if (d12 > 0) { /* S1 < S2 */
222 h->loss_count = 2;
223 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
224 return;
225 }
226
227 /* S0 < S2 < S1 */
228 d2 = dccp_delta_seqno(s0, s2);
211 229
212void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, 230 if (d2 == 1 || n2 >= d2) { /* S2 is direct successor of S0 */
213 struct list_head *rx_list, 231 int d21 = -d12;
214 struct list_head *li_list, 232
215 struct dccp_rx_hist_entry *packet, 233 if (d21 == 1 || n1 >= d21) {
216 u64 nonloss_seqno) 234 /* hole is filled: S0, S2, and S1 are consecutive */
235 h->loss_count = 0;
236 h->loss_start = tfrc_rx_hist_index(h, 1);
237 } else
238 /* gap between S2 and S1: just update loss_prev */
239 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
240
241 } else { /* hole between S0 and S2 */
242 /*
243 * Reorder history to insert S2 between S0 and S1
244 */
245 tfrc_rx_hist_swap(h, 0, 3);
246 h->loss_start = tfrc_rx_hist_index(h, 3);
247 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
248 h->loss_count = 2;
249 }
250}
251
252/* return 1 if a new loss event has been identified */
253static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
217{ 254{
218 struct dccp_rx_hist_entry *entry, *next; 255 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
219 u8 num_later = 0; 256 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
220 257 s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
221 list_add(&packet->dccphrx_node, rx_list); 258 s3 = DCCP_SKB_CB(skb)->dccpd_seq;
222 259 int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp,
223 num_later = TFRC_RECV_NUM_LATE_LOSS + 1; 260 d23 = dccp_delta_seqno(s2, s3), d13, d3, d31;
224 261
225 if (!list_empty(li_list)) { 262 if (d23 > 0) { /* S2 < S3 */
226 list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { 263 h->loss_count = 3;
227 if (num_later == 0) { 264 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
228 if (after48(nonloss_seqno, 265 return 1;
229 entry->dccphrx_seqno)) { 266 }
230 list_del_init(&entry->dccphrx_node); 267
231 dccp_rx_hist_entry_delete(hist, entry); 268 /* S3 < S2 */
232 } 269 d13 = dccp_delta_seqno(s1, s3);
233 } else if (dccp_rx_hist_entry_data_packet(entry)) 270
234 --num_later; 271 if (d13 > 0) {
235 }
236 } else {
237 int step = 0;
238 u8 win_count = 0; /* Not needed, but lets shut up gcc */
239 int tmp;
240 /* 272 /*
241 * We have no loss interval history so we need at least one 273 * The sequence number order is S1, S3, S2
242 * rtt:s of data packets to approximate rtt. 274 * Reorder history to insert entry between S1 and S2
243 */ 275 */
244 list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { 276 tfrc_rx_hist_swap(h, 2, 3);
245 if (num_later == 0) { 277 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
246 switch (step) { 278 h->loss_count = 3;
247 case 0: 279 return 1;
248 step = 1; 280 }
249 /* OK, find next data packet */ 281
250 num_later = 1; 282 /* S0 < S3 < S1 */
251 break; 283 d31 = -d13;
252 case 1: 284 d3 = dccp_delta_seqno(s0, s3);
253 step = 2; 285
254 /* OK, find next data packet */ 286 if (d3 == 1 || n3 >= d3) { /* S3 is a successor of S0 */
255 num_later = 1; 287
256 win_count = entry->dccphrx_ccval; 288 if (d31 == 1 || n1 >= d31) {
257 break; 289 /* hole between S0 and S1 filled by S3 */
258 case 2: 290 int d2 = dccp_delta_seqno(s1, s2),
259 tmp = win_count - entry->dccphrx_ccval; 291 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
260 if (tmp < 0) 292
261 tmp += TFRC_WIN_COUNT_LIMIT; 293 if (d2 == 1 || n2 >= d2) {
262 if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { 294 /* entire hole filled by S0, S3, S1, S2 */
263 /* 295 h->loss_start = tfrc_rx_hist_index(h, 2);
264 * We have found a packet older 296 h->loss_count = 0;
265 * than one rtt remove the rest 297 } else {
266 */ 298 /* gap remains between S1 and S2 */
267 step = 3; 299 h->loss_start = tfrc_rx_hist_index(h, 1);
268 } else /* OK, find next data packet */ 300 h->loss_count = 1;
269 num_later = 1; 301 }
270 break; 302
271 case 3: 303 } else /* gap exists between S3 and S1, loss_count stays at 2 */
272 list_del_init(&entry->dccphrx_node); 304 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
273 dccp_rx_hist_entry_delete(hist, entry); 305
274 break; 306 return 0;
275 } 307 }
276 } else if (dccp_rx_hist_entry_data_packet(entry)) 308
277 --num_later; 309 /*
310 * The remaining case: S3 is not a successor of S0.
311 * Sequence order is S0, S3, S1, S2; reorder to insert between S0 and S1
312 */
313 tfrc_rx_hist_swap(h, 0, 3);
314 h->loss_start = tfrc_rx_hist_index(h, 3);
315 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
316 h->loss_count = 3;
317
318 return 1;
319}
320
321/* return the signed modulo-2^48 sequence number distance from entry e1 to e2 */
322static s64 tfrc_rx_hist_delta_seqno(struct tfrc_rx_hist *h, u8 e1, u8 e2)
323{
324 DCCP_BUG_ON(e1 > h->loss_count || e2 > h->loss_count);
325
326 return dccp_delta_seqno(tfrc_rx_hist_entry(h, e1)->tfrchrx_seqno,
327 tfrc_rx_hist_entry(h, e2)->tfrchrx_seqno);
328}
329
330/* recycle RX history records to continue loss detection if necessary */
331static void __three_after_loss(struct tfrc_rx_hist *h)
332{
333 /*
334 * The distance between S0 and S1 is always greater than 1 and the NDP
335 * count of S1 is smaller than this distance. Otherwise there would
336 * have been no loss. Hence it is only necessary to see whether there
337 * are further missing data packets between S1/S2 and S2/S3.
338 */
339 int d2 = tfrc_rx_hist_delta_seqno(h, 1, 2),
340 d3 = tfrc_rx_hist_delta_seqno(h, 2, 3),
341 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
342 n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
343
344 if (d2 == 1 || n2 >= d2) { /* S2 is successor to S1 */
345
346 if (d3 == 1 || n3 >= d3) {
347 /* S3 is successor of S2: entire hole is filled */
348 h->loss_start = tfrc_rx_hist_index(h, 3);
349 h->loss_count = 0;
350 } else {
351 /* gap between S2 and S3 */
352 h->loss_start = tfrc_rx_hist_index(h, 2);
353 h->loss_count = 1;
278 } 354 }
355
356 } else { /* gap between S1 and S2 */
357 h->loss_start = tfrc_rx_hist_index(h, 1);
358 h->loss_count = 2;
279 } 359 }
280} 360}
281 361
282EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); 362/**
363 * tfrc_rx_handle_loss - Loss detection and further processing
364 * @h: The non-empty RX history object
365 * @lh: Loss Intervals database to update
366 * @skb: Currently received packet
367 * @ndp: The NDP count belonging to @skb
368 * @calc_first_li: Caller-dependent computation of first loss interval in @lh
369 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
370 * Chooses action according to pending loss, updates LI database when a new
371 * loss was detected, and does required post-processing. Returns 1 when caller
372 * should send feedback, 0 otherwise.
373 */
374int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
375 struct tfrc_loss_hist *lh,
376 struct sk_buff *skb, u32 ndp,
377 u32 (*calc_first_li)(struct sock *), struct sock *sk)
378{
379 int is_new_loss = 0;
283 380
284void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) 381 if (h->loss_count == 1) {
382 __one_after_loss(h, skb, ndp);
383 } else if (h->loss_count != 2) {
384 DCCP_BUG("invalid loss_count %d", h->loss_count);
385 } else if (__two_after_loss(h, skb, ndp)) {
386 /*
387 * Update Loss Interval database and recycle RX records
388 */
389 is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
390 __three_after_loss(h);
391 }
392 return is_new_loss;
393}
394EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
395
396int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
285{ 397{
286 struct dccp_rx_hist_entry *entry, *next; 398 int i;
399
400 for (i = 0; i <= TFRC_NDUPACK; i++) {
401 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
402 if (h->ring[i] == NULL)
403 goto out_free;
404 }
405
406 h->loss_count = h->loss_start = 0;
407 return 0;
287 408
288 list_for_each_entry_safe(entry, next, list, dccphrx_node) { 409out_free:
289 list_del_init(&entry->dccphrx_node); 410 while (i-- != 0) {
290 kmem_cache_free(hist->dccprxh_slab, entry); 411 kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
412 h->ring[i] = NULL;
291 } 413 }
414 return -ENOBUFS;
292} 415}
416EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
417
418void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
419{
420 int i;
293 421
294EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); 422 for (i = 0; i <= TFRC_NDUPACK; ++i)
423 if (h->ring[i] != NULL) {
424 kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
425 h->ring[i] = NULL;
426 }
427}
428EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
295 429
430/**
431 * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
432 */
433static inline struct tfrc_rx_hist_entry *
434 tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
435{
436 return h->ring[0];
437}
296 438
297MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, " 439/**
298 "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); 440 * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
299MODULE_DESCRIPTION("DCCP TFRC library"); 441 */
300MODULE_LICENSE("GPL"); 442static inline struct tfrc_rx_hist_entry *
443 tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
444{
445 return h->ring[h->rtt_sample_prev];
446}
447
448/**
449 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
450 * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
451 * to compute a sample with given data - calling function should check this.
452 */
453u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
454{
455 u32 sample = 0,
456 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
457 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
458
459 if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
460 if (h->rtt_sample_prev == 2) { /* previous candidate stored */
461 sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
462 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
463 if (sample)
464 sample = 4 / sample *
465 ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
466 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
467 else /*
468 * FIXME: This condition is in principle not
469 * possible but occurs when CCID is used for
470 * two-way data traffic. I have tried to trace
471 * it, but the cause does not seem to be here.
472 */
473 DCCP_BUG("please report to dccp@vger.kernel.org"
474 " => prev = %u, last = %u",
475 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
476 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
477 } else if (delta_v < 1) {
478 h->rtt_sample_prev = 1;
479 goto keep_ref_for_next_time;
480 }
481
482 } else if (delta_v == 4) /* optimal match */
483 sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
484 else { /* suboptimal match */
485 h->rtt_sample_prev = 2;
486 goto keep_ref_for_next_time;
487 }
488
489 if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
490 DCCP_WARN("RTT sample %u too large, using max\n", sample);
491 sample = DCCP_SANE_RTT_MAX;
492 }
493
494 h->rtt_sample_prev = 0; /* use current entry as next reference */
495keep_ref_for_next_time:
496
497 return sample;
498}
499EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 032bb61c6e39..c7eeda49cb20 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -1,10 +1,9 @@
1/* 1/*
2 * net/dccp/packet_history.h 2 * Packet RX/TX history data structures and routines for TFRC-based protocols.
3 * 3 *
4 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
4 * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. 5 * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
5 * 6 *
6 * An implementation of the DCCP protocol
7 *
8 * This code has been developed by the University of Waikato WAND 7 * This code has been developed by the University of Waikato WAND
9 * research group. For further information please see http://www.wand.net.nz/ 8 * research group. For further information please see http://www.wand.net.nz/
10 * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz 9 * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
@@ -37,165 +36,128 @@
37#ifndef _DCCP_PKT_HIST_ 36#ifndef _DCCP_PKT_HIST_
38#define _DCCP_PKT_HIST_ 37#define _DCCP_PKT_HIST_
39 38
40#include <linux/ktime.h>
41#include <linux/list.h> 39#include <linux/list.h>
42#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h"
43 42
44#include "../../dccp.h" 43struct tfrc_tx_hist_entry;
45 44
46/* Number of later packets received before one is considered lost */ 45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
47#define TFRC_RECV_NUM_LATE_LOSS 3 46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
48 49
49#define TFRC_WIN_COUNT_PER_RTT 4 50/* Subtraction a-b modulo-16, respects circular wrap-around */
50#define TFRC_WIN_COUNT_LIMIT 16 51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
51 52
52/* 53/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
53 * Transmitter History data structures and declarations 54#define TFRC_NDUPACK 3
55
56/**
57 * tfrc_rx_hist_entry - Store information about a single received packet
58 * @tfrchrx_seqno: DCCP packet sequence number
59 * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
60 * @tfrchrx_ndp: the NDP count (if any) of the packet
61 * @tfrchrx_tstamp: actual receive time of packet
54 */ 62 */
55struct dccp_tx_hist_entry { 63struct tfrc_rx_hist_entry {
56 struct list_head dccphtx_node; 64 u64 tfrchrx_seqno:48,
57 u64 dccphtx_seqno:48, 65 tfrchrx_ccval:4,
58 dccphtx_sent:1; 66 tfrchrx_type:4;
59 u32 dccphtx_rtt; 67 u32 tfrchrx_ndp; /* In fact it is from 8 to 24 bits */
60 ktime_t dccphtx_tstamp; 68 ktime_t tfrchrx_tstamp;
61}; 69};
62 70
63struct dccp_tx_hist { 71/**
64 struct kmem_cache *dccptxh_slab; 72 * tfrc_rx_hist - RX history structure for TFRC-based protocols
73 *
74 * @ring: Packet history for RTT sampling and loss detection
75 * @loss_count: Number of entries in circular history
76 * @loss_start: Movable index (for loss detection)
77 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
78 */
79struct tfrc_rx_hist {
80 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
81 u8 loss_count:2,
82 loss_start:2;
83#define rtt_sample_prev loss_start
65}; 84};
66 85
67extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); 86/**
68extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); 87 * tfrc_rx_hist_index - index to reach n-th entry after loss_start
69 88 */
70static inline struct dccp_tx_hist_entry * 89static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
71 dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
72 const gfp_t prio)
73{ 90{
74 struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, 91 return (h->loss_start + n) & TFRC_NDUPACK;
75 prio);
76
77 if (entry != NULL)
78 entry->dccphtx_sent = 0;
79
80 return entry;
81} 92}
82 93
83static inline struct dccp_tx_hist_entry * 94/**
84 dccp_tx_hist_head(struct list_head *list) 95 * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
96 */
97static inline struct tfrc_rx_hist_entry *
98 tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
85{ 99{
86 struct dccp_tx_hist_entry *head = NULL; 100 return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
87
88 if (!list_empty(list))
89 head = list_entry(list->next, struct dccp_tx_hist_entry,
90 dccphtx_node);
91 return head;
92} 101}
93 102
94extern struct dccp_tx_hist_entry * 103/**
95 dccp_tx_hist_find_entry(const struct list_head *list, 104 * tfrc_rx_hist_entry - return the n-th history entry after loss_start
96 const u64 seq); 105 */
97 106static inline struct tfrc_rx_hist_entry *
98static inline void dccp_tx_hist_add_entry(struct list_head *list, 107 tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
99 struct dccp_tx_hist_entry *entry)
100{ 108{
101 list_add(&entry->dccphtx_node, list); 109 return h->ring[tfrc_rx_hist_index(h, n)];
102} 110}
103 111
104static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, 112/**
105 struct dccp_tx_hist_entry *entry) 113 * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
114 */
115static inline struct tfrc_rx_hist_entry *
116 tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
106{ 117{
107 if (entry != NULL) 118 return h->ring[h->loss_start];
108 kmem_cache_free(hist->dccptxh_slab, entry);
109} 119}
110 120
111extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, 121/* initialise loss detection and disable RTT sampling */
112 struct list_head *list); 122static inline void tfrc_rx_hist_loss_indicated(struct tfrc_rx_hist *h)
113
114extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
115 struct list_head *list,
116 struct dccp_tx_hist_entry *next);
117
118/*
119 * Receiver History data structures and declarations
120 */
121struct dccp_rx_hist_entry {
122 struct list_head dccphrx_node;
123 u64 dccphrx_seqno:48,
124 dccphrx_ccval:4,
125 dccphrx_type:4;
126 u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
127 ktime_t dccphrx_tstamp;
128};
129
130struct dccp_rx_hist {
131 struct kmem_cache *dccprxh_slab;
132};
133
134extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
135extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
136
137static inline struct dccp_rx_hist_entry *
138 dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
139 const u32 ndp,
140 const struct sk_buff *skb,
141 const gfp_t prio)
142{ 123{
143 struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, 124 h->loss_count = 1;
144 prio);
145
146 if (entry != NULL) {
147 const struct dccp_hdr *dh = dccp_hdr(skb);
148
149 entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
150 entry->dccphrx_ccval = dh->dccph_ccval;
151 entry->dccphrx_type = dh->dccph_type;
152 entry->dccphrx_ndp = ndp;
153 entry->dccphrx_tstamp = ktime_get_real();
154 }
155
156 return entry;
157} 125}
158 126
159static inline struct dccp_rx_hist_entry * 127/* indicate whether previously a packet was detected missing */
160 dccp_rx_hist_head(struct list_head *list) 128static inline int tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
161{ 129{
162 struct dccp_rx_hist_entry *head = NULL; 130 return h->loss_count;
163
164 if (!list_empty(list))
165 head = list_entry(list->next, struct dccp_rx_hist_entry,
166 dccphrx_node);
167 return head;
168} 131}
169 132
170extern int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq, 133/* any data packets missing between last reception and skb ? */
171 u8 *ccval); 134static inline int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h,
172extern struct dccp_rx_hist_entry * 135 const struct sk_buff *skb,
173 dccp_rx_hist_find_data_packet(const struct list_head *list); 136 u32 ndp)
174
175extern void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
176 struct list_head *rx_list,
177 struct list_head *li_list,
178 struct dccp_rx_hist_entry *packet,
179 u64 nonloss_seqno);
180
181static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
182 struct dccp_rx_hist_entry *entry)
183{ 137{
184 if (entry != NULL) 138 int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)->tfrchrx_seqno,
185 kmem_cache_free(hist->dccprxh_slab, entry); 139 DCCP_SKB_CB(skb)->dccpd_seq);
186}
187 140
188extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, 141 if (delta > 1 && ndp < delta)
189 struct list_head *list); 142 tfrc_rx_hist_loss_indicated(h);
190 143
191static inline int 144 return tfrc_rx_hist_loss_pending(h);
192 dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
193{
194 return entry->dccphrx_type == DCCP_PKT_DATA ||
195 entry->dccphrx_type == DCCP_PKT_DATAACK;
196} 145}
197 146
198extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, 147extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
199 struct list_head *li_list, u8 *win_loss); 148 const struct sk_buff *skb, const u32 ndp);
149
150extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
151
152struct tfrc_loss_hist;
153extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
154 struct tfrc_loss_hist *lh,
155 struct sk_buff *skb, u32 ndp,
156 u32 (*first_li)(struct sock *sk),
157 struct sock *sk);
158extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
159 const struct sk_buff *skb);
160extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
161extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
200 162
201#endif /* _DCCP_PKT_HIST_ */ 163#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
new file mode 100644
index 000000000000..d1dfbb8de64c
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -0,0 +1,63 @@
1/*
2 * TFRC: main module holding the pieces of the TFRC library together
3 *
4 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
5 * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
6 */
7#include <linux/module.h>
8#include <linux/moduleparam.h>
9#include "tfrc.h"
10
11#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
12int tfrc_debug;
13module_param(tfrc_debug, bool, 0444);
14MODULE_PARM_DESC(tfrc_debug, "Enable debug messages");
15#endif
16
17extern int tfrc_tx_packet_history_init(void);
18extern void tfrc_tx_packet_history_exit(void);
19extern int tfrc_rx_packet_history_init(void);
20extern void tfrc_rx_packet_history_exit(void);
21
22extern int tfrc_li_init(void);
23extern void tfrc_li_exit(void);
24
25static int __init tfrc_module_init(void)
26{
27 int rc = tfrc_li_init();
28
29 if (rc)
30 goto out;
31
32 rc = tfrc_tx_packet_history_init();
33 if (rc)
34 goto out_free_loss_intervals;
35
36 rc = tfrc_rx_packet_history_init();
37 if (rc)
38 goto out_free_tx_history;
39 return 0;
40
41out_free_tx_history:
42 tfrc_tx_packet_history_exit();
43out_free_loss_intervals:
44 tfrc_li_exit();
45out:
46 return rc;
47}
48
49static void __exit tfrc_module_exit(void)
50{
51 tfrc_rx_packet_history_exit();
52 tfrc_tx_packet_history_exit();
53 tfrc_li_exit();
54}
55
56module_init(tfrc_module_init);
57module_exit(tfrc_module_exit);
58
59MODULE_AUTHOR("Gerrit Renker <gerrit@erg.abdn.ac.uk>, "
60 "Ian McDonald <ian.mcdonald@jandi.co.nz>, "
61 "Arnaldo Carvalho de Melo <acme@redhat.com>");
62MODULE_DESCRIPTION("DCCP TFRC library");
63MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index faf5f7e219e3..1fb1187bbf1c 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -3,10 +3,11 @@
3/* 3/*
4 * net/dccp/ccids/lib/tfrc.h 4 * net/dccp/ccids/lib/tfrc.h
5 * 5 *
6 * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. 6 * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
7 * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> 7 * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> 8 * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
9 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon 9 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
10 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
10 * 11 *
11 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
@@ -15,6 +16,17 @@
15 */ 16 */
16#include <linux/types.h> 17#include <linux/types.h>
17#include <asm/div64.h> 18#include <asm/div64.h>
19#include "../../dccp.h"
20/* internal includes that this module exports: */
21#include "loss_interval.h"
22#include "packet_history.h"
23
24#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
25extern int tfrc_debug;
26#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
27#else
28#define tfrc_pr_debug(format, a...)
29#endif
18 30
19/* integer-arithmetic divisions of type (a * 1000000)/b */ 31/* integer-arithmetic divisions of type (a * 1000000)/b */
20static inline u64 scaled_div(u64 a, u32 b) 32static inline u64 scaled_div(u64 a, u32 b)
@@ -37,6 +49,15 @@ static inline u32 scaled_div32(u64 a, u32 b)
37 return result; 49 return result;
38} 50}
39 51
52/**
53 * tfrc_ewma - Exponentially weighted moving average
54 * @weight: Weight to be used as damping factor, in units of 1/10
55 */
56static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
57{
58 return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
59}
60
40extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 61extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
41extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 62extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
42 63
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index ee97950d77d1..ebe59d98721a 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -72,11 +72,21 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
72/* RFC 1122, 4.2.3.1 initial RTO value */ 72/* RFC 1122, 4.2.3.1 initial RTO value */
73#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) 73#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
74 74
75#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ 75/*
76 * The maximum back-off value for retransmissions. This is needed for
77 * - retransmitting client-Requests (sec. 8.1.1),
78 * - retransmitting Close/CloseReq when closing (sec. 8.3),
79 * - feature-negotiation retransmission (sec. 6.6.3),
80 * - Acks in client-PARTOPEN state (sec. 8.1.5).
81 */
82#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
76 83
77/* bounds for sampled RTT values from packet exchanges (in usec) */ 84/*
85 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
86 */
78#define DCCP_SANE_RTT_MIN 100 87#define DCCP_SANE_RTT_MIN 100
79#define DCCP_SANE_RTT_MAX (4 * USEC_PER_SEC) 88#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
89#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
80 90
81/* Maximal interval between probes for local resources. */ 91/* Maximal interval between probes for local resources. */
82#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) 92#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
@@ -143,12 +153,6 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
143 return after48(seq1, seq2) ? seq1 : seq2; 153 return after48(seq1, seq2) ? seq1 : seq2;
144} 154}
145 155
146/* is seq1 next seqno after seq2 */
147static inline int follows48(const u64 seq1, const u64 seq2)
148{
149 return dccp_delta_seqno(seq2, seq1) == 1;
150}
151
152enum { 156enum {
153 DCCP_MIB_NUM = 0, 157 DCCP_MIB_NUM = 0,
154 DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ 158 DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
@@ -334,6 +338,7 @@ struct dccp_skb_cb {
334 338
335#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) 339#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
336 340
341/* RFC 4340, sec. 7.7 */
337static inline int dccp_non_data_packet(const struct sk_buff *skb) 342static inline int dccp_non_data_packet(const struct sk_buff *skb)
338{ 343{
339 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; 344 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
@@ -346,6 +351,17 @@ static inline int dccp_non_data_packet(const struct sk_buff *skb)
346 type == DCCP_PKT_SYNCACK; 351 type == DCCP_PKT_SYNCACK;
347} 352}
348 353
354/* RFC 4340, sec. 7.7 */
355static inline int dccp_data_packet(const struct sk_buff *skb)
356{
357 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
358
359 return type == DCCP_PKT_DATA ||
360 type == DCCP_PKT_DATAACK ||
361 type == DCCP_PKT_REQUEST ||
362 type == DCCP_PKT_RESPONSE;
363}
364
349static inline int dccp_packet_without_ack(const struct sk_buff *skb) 365static inline int dccp_packet_without_ack(const struct sk_buff *skb)
350{ 366{
351 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; 367 const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
@@ -406,6 +422,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
406} 422}
407 423
408extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); 424extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
425extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
409extern int dccp_insert_option_elapsed_time(struct sock *sk, 426extern int dccp_insert_option_elapsed_time(struct sock *sk,
410 struct sk_buff *skb, 427 struct sk_buff *skb,
411 u32 elapsed_time); 428 u32 elapsed_time);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 5ebdd86c1b99..4a4f6ce4498d 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -4,10 +4,16 @@
4 * An implementation of the DCCP protocol 4 * An implementation of the DCCP protocol
5 * Andrea Bittau <a.bittau@cs.ucl.ac.uk> 5 * Andrea Bittau <a.bittau@cs.ucl.ac.uk>
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * ASSUMPTIONS
8 * modify it under the terms of the GNU General Public License 8 * -----------
9 * as published by the Free Software Foundation; either version 9 * o All currently known SP features have 1-byte quantities. If in the future
10 * 2 of the License, or (at your option) any later version. 10 * extensions of RFCs 4340..42 define features with item lengths larger than
11 * one byte, a feature-specific extension of the code will be required.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
11 */ 17 */
12 18
13#include <linux/module.h> 19#include <linux/module.h>
@@ -24,11 +30,7 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
24 30
25 dccp_feat_debug(type, feature, *val); 31 dccp_feat_debug(type, feature, *val);
26 32
27 if (!dccp_feat_is_valid_type(type)) { 33 if (len > 3) {
28 DCCP_WARN("option type %d invalid in negotiation\n", type);
29 return 1;
30 }
31 if (!dccp_feat_is_valid_length(type, feature, len)) {
32 DCCP_WARN("invalid length %d\n", len); 34 DCCP_WARN("invalid length %d\n", len);
33 return 1; 35 return 1;
34 } 36 }
@@ -99,7 +101,6 @@ static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
99 return 0; 101 return 0;
100} 102}
101 103
102/* XXX taking only u8 vals */
103static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) 104static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
104{ 105{
105 dccp_feat_debug(type, feat, val); 106 dccp_feat_debug(type, feat, val);
@@ -144,7 +145,6 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
144 /* FIXME sanity check vals */ 145 /* FIXME sanity check vals */
145 146
146 /* Are values in any order? XXX Lame "algorithm" here */ 147 /* Are values in any order? XXX Lame "algorithm" here */
147 /* XXX assume values are 1 byte */
148 for (i = 0; i < slen; i++) { 148 for (i = 0; i < slen; i++) {
149 for (j = 0; j < rlen; j++) { 149 for (j = 0; j < rlen; j++) {
150 if (spref[i] == rpref[j]) { 150 if (spref[i] == rpref[j]) {
@@ -179,7 +179,6 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
179 } 179 }
180 180
181 /* need to put result and our preference list */ 181 /* need to put result and our preference list */
182 /* XXX assume 1 byte vals */
183 rlen = 1 + opt->dccpop_len; 182 rlen = 1 + opt->dccpop_len;
184 rpref = kmalloc(rlen, GFP_ATOMIC); 183 rpref = kmalloc(rlen, GFP_ATOMIC);
185 if (rpref == NULL) 184 if (rpref == NULL)
@@ -637,12 +636,12 @@ const char *dccp_feat_name(const u8 feat)
637 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", 636 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
638 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", 637 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
639 }; 638 };
639 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
640 return feature_names[DCCPF_RESERVED];
641
640 if (feat >= DCCPF_MIN_CCID_SPECIFIC) 642 if (feat >= DCCPF_MIN_CCID_SPECIFIC)
641 return "CCID-specific"; 643 return "CCID-specific";
642 644
643 if (dccp_feat_is_reserved(feat))
644 return feature_names[DCCPF_RESERVED];
645
646 return feature_names[feat]; 645 return feature_names[feat];
647} 646}
648 647
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 177f7dee4d10..e272222c7ace 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -14,32 +14,6 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include "dccp.h" 15#include "dccp.h"
16 16
17static inline int dccp_feat_is_valid_length(u8 type, u8 feature, u8 len)
18{
19 /* sec. 6.1: Confirm has at least length 3,
20 * sec. 6.2: Change has at least length 4 */
21 if (len < 3)
22 return 1;
23 if (len < 4 && (type == DCCPO_CHANGE_L || type == DCCPO_CHANGE_R))
24 return 1;
25 /* XXX: add per-feature length validation (sec. 6.6.8) */
26 return 0;
27}
28
29static inline int dccp_feat_is_reserved(const u8 feat)
30{
31 return (feat > DCCPF_DATA_CHECKSUM &&
32 feat < DCCPF_MIN_CCID_SPECIFIC) ||
33 feat == DCCPF_RESERVED;
34}
35
36/* feature negotiation knows only these four option types (RFC 4340, sec. 6) */
37static inline int dccp_feat_is_valid_type(const u8 optnum)
38{
39 return optnum >= DCCPO_CHANGE_L && optnum <= DCCPO_CONFIRM_R;
40
41}
42
43#ifdef CONFIG_IP_DCCP_DEBUG 17#ifdef CONFIG_IP_DCCP_DEBUG
44extern const char *dccp_feat_typename(const u8 type); 18extern const char *dccp_feat_typename(const u8 type);
45extern const char *dccp_feat_name(const u8 feat); 19extern const char *dccp_feat_name(const u8 feat);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 1ce101062824..08392ed86c25 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -22,26 +22,77 @@
22/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ 22/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
23int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; 23int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
24 24
25static void dccp_fin(struct sock *sk, struct sk_buff *skb) 25static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
26{ 26{
27 sk->sk_shutdown |= RCV_SHUTDOWN;
28 sock_set_flag(sk, SOCK_DONE);
29 __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); 27 __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
30 __skb_queue_tail(&sk->sk_receive_queue, skb); 28 __skb_queue_tail(&sk->sk_receive_queue, skb);
31 skb_set_owner_r(skb, sk); 29 skb_set_owner_r(skb, sk);
32 sk->sk_data_ready(sk, 0); 30 sk->sk_data_ready(sk, 0);
33} 31}
34 32
35static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) 33static void dccp_fin(struct sock *sk, struct sk_buff *skb)
36{ 34{
37 dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); 35 /*
38 dccp_fin(sk, skb); 36 * On receiving Close/CloseReq, both RD/WR shutdown are performed.
39 dccp_set_state(sk, DCCP_CLOSED); 37 * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
40 sk_wake_async(sk, 1, POLL_HUP); 38 * receiving the closing segment, but there is no guarantee that such
39 * data will be processed at all.
40 */
41 sk->sk_shutdown = SHUTDOWN_MASK;
42 sock_set_flag(sk, SOCK_DONE);
43 dccp_enqueue_skb(sk, skb);
44}
45
46static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
47{
48 int queued = 0;
49
50 switch (sk->sk_state) {
51 /*
52 * We ignore Close when received in one of the following states:
53 * - CLOSED (may be a late or duplicate packet)
54 * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
55 * - RESPOND (already handled by dccp_check_req)
56 */
57 case DCCP_CLOSING:
58 /*
59 * Simultaneous-close: receiving a Close after sending one. This
60 * can happen if both client and server perform active-close and
61 * will result in an endless ping-pong of crossing and retrans-
62 * mitted Close packets, which only terminates when one of the
63 * nodes times out (min. 64 seconds). Quicker convergence can be
64 * achieved when one of the nodes acts as tie-breaker.
65 * This is ok as both ends are done with data transfer and each
66 * end is just waiting for the other to acknowledge termination.
67 */
68 if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
69 break;
70 /* fall through */
71 case DCCP_REQUESTING:
72 case DCCP_ACTIVE_CLOSEREQ:
73 dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
74 dccp_done(sk);
75 break;
76 case DCCP_OPEN:
77 case DCCP_PARTOPEN:
78 /* Give waiting application a chance to read pending data */
79 queued = 1;
80 dccp_fin(sk, skb);
81 dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
82 /* fall through */
83 case DCCP_PASSIVE_CLOSE:
84 /*
85 * Retransmitted Close: we have already enqueued the first one.
86 */
87 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
88 }
89 return queued;
41} 90}
42 91
43static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) 92static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
44{ 93{
94 int queued = 0;
95
45 /* 96 /*
46 * Step 7: Check for unexpected packet types 97 * Step 7: Check for unexpected packet types
47 * If (S.is_server and P.type == CloseReq) 98 * If (S.is_server and P.type == CloseReq)
@@ -50,12 +101,26 @@ static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
50 */ 101 */
51 if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { 102 if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
52 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); 103 dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
53 return; 104 return queued;
54 } 105 }
55 106
56 if (sk->sk_state != DCCP_CLOSING) 107 /* Step 13: process relevant Client states < CLOSEREQ */
108 switch (sk->sk_state) {
109 case DCCP_REQUESTING:
110 dccp_send_close(sk, 0);
57 dccp_set_state(sk, DCCP_CLOSING); 111 dccp_set_state(sk, DCCP_CLOSING);
58 dccp_send_close(sk, 0); 112 break;
113 case DCCP_OPEN:
114 case DCCP_PARTOPEN:
115 /* Give waiting application a chance to read pending data */
116 queued = 1;
117 dccp_fin(sk, skb);
118 dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
119 /* fall through */
120 case DCCP_PASSIVE_CLOSEREQ:
121 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
122 }
123 return queued;
59} 124}
60 125
61static u8 dccp_reset_code_convert(const u8 code) 126static u8 dccp_reset_code_convert(const u8 code)
@@ -90,7 +155,7 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
90 dccp_fin(sk, skb); 155 dccp_fin(sk, skb);
91 156
92 if (err && !sock_flag(sk, SOCK_DEAD)) 157 if (err && !sock_flag(sk, SOCK_DEAD))
93 sk_wake_async(sk, 0, POLL_ERR); 158 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
94 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
95} 160}
96 161
@@ -103,6 +168,21 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
103 DCCP_SKB_CB(skb)->dccpd_ack_seq); 168 DCCP_SKB_CB(skb)->dccpd_ack_seq);
104} 169}
105 170
171static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
172{
173 const struct dccp_sock *dp = dccp_sk(sk);
174
175 /* Don't deliver to RX CCID when node has shut down read end. */
176 if (!(sk->sk_shutdown & RCV_SHUTDOWN))
177 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
178 /*
179 * Until the TX queue has been drained, we can not honour SHUT_WR, since
180 * we need received feedback as input to adjust congestion control.
181 */
182 if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
183 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
184}
185
106static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) 186static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
107{ 187{
108 const struct dccp_hdr *dh = dccp_hdr(skb); 188 const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -209,13 +289,11 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
209 case DCCP_PKT_DATAACK: 289 case DCCP_PKT_DATAACK:
210 case DCCP_PKT_DATA: 290 case DCCP_PKT_DATA:
211 /* 291 /*
212 * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED 292 * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
213 * option if it is. 293 * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
294 * - sk_receive_queue is full, use Code 2, "Receive Buffer"
214 */ 295 */
215 __skb_pull(skb, dh->dccph_doff * 4); 296 dccp_enqueue_skb(sk, skb);
216 __skb_queue_tail(&sk->sk_receive_queue, skb);
217 skb_set_owner_r(skb, sk);
218 sk->sk_data_ready(sk, 0);
219 return 0; 297 return 0;
220 case DCCP_PKT_ACK: 298 case DCCP_PKT_ACK:
221 goto discard; 299 goto discard;
@@ -231,11 +309,13 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
231 dccp_rcv_reset(sk, skb); 309 dccp_rcv_reset(sk, skb);
232 return 0; 310 return 0;
233 case DCCP_PKT_CLOSEREQ: 311 case DCCP_PKT_CLOSEREQ:
234 dccp_rcv_closereq(sk, skb); 312 if (dccp_rcv_closereq(sk, skb))
313 return 0;
235 goto discard; 314 goto discard;
236 case DCCP_PKT_CLOSE: 315 case DCCP_PKT_CLOSE:
237 dccp_rcv_close(sk, skb); 316 if (dccp_rcv_close(sk, skb))
238 return 0; 317 return 0;
318 goto discard;
239 case DCCP_PKT_REQUEST: 319 case DCCP_PKT_REQUEST:
240 /* Step 7 320 /* Step 7
241 * or (S.is_server and P.type == Response) 321 * or (S.is_server and P.type == Response)
@@ -289,7 +369,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
289 if (dccp_check_seqno(sk, skb)) 369 if (dccp_check_seqno(sk, skb))
290 goto discard; 370 goto discard;
291 371
292 if (dccp_parse_options(sk, skb)) 372 if (dccp_parse_options(sk, NULL, skb))
293 goto discard; 373 goto discard;
294 374
295 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 375 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
@@ -300,9 +380,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
300 DCCP_SKB_CB(skb)->dccpd_seq, 380 DCCP_SKB_CB(skb)->dccpd_seq,
301 DCCP_ACKVEC_STATE_RECEIVED)) 381 DCCP_ACKVEC_STATE_RECEIVED))
302 goto discard; 382 goto discard;
303 383 dccp_deliver_input_to_ccids(sk, skb);
304 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
305 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
306 384
307 return __dccp_rcv_established(sk, skb, dh, len); 385 return __dccp_rcv_established(sk, skb, dh, len);
308discard: 386discard:
@@ -349,7 +427,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
349 goto out_invalid_packet; 427 goto out_invalid_packet;
350 } 428 }
351 429
352 if (dccp_parse_options(sk, skb)) 430 if (dccp_parse_options(sk, NULL, skb))
353 goto out_invalid_packet; 431 goto out_invalid_packet;
354 432
355 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ 433 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
@@ -402,7 +480,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
402 480
403 if (!sock_flag(sk, SOCK_DEAD)) { 481 if (!sock_flag(sk, SOCK_DEAD)) {
404 sk->sk_state_change(sk); 482 sk->sk_state_change(sk);
405 sk_wake_async(sk, 0, POLL_OUT); 483 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
406 } 484 }
407 485
408 if (sk->sk_write_pending || icsk->icsk_ack.pingpong || 486 if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
@@ -531,7 +609,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
531 /* 609 /*
532 * Step 8: Process options and mark acknowledgeable 610 * Step 8: Process options and mark acknowledgeable
533 */ 611 */
534 if (dccp_parse_options(sk, skb)) 612 if (dccp_parse_options(sk, NULL, skb))
535 goto discard; 613 goto discard;
536 614
537 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 615 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
@@ -543,8 +621,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
543 DCCP_ACKVEC_STATE_RECEIVED)) 621 DCCP_ACKVEC_STATE_RECEIVED))
544 goto discard; 622 goto discard;
545 623
546 ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); 624 dccp_deliver_input_to_ccids(sk, skb);
547 ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
548 } 625 }
549 626
550 /* 627 /*
@@ -560,16 +637,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
560 return 0; 637 return 0;
561 /* 638 /*
562 * Step 7: Check for unexpected packet types 639 * Step 7: Check for unexpected packet types
563 * If (S.is_server and P.type == CloseReq) 640 * If (S.is_server and P.type == Response)
564 * or (S.is_server and P.type == Response)
565 * or (S.is_client and P.type == Request) 641 * or (S.is_client and P.type == Request)
566 * or (S.state == RESPOND and P.type == Data), 642 * or (S.state == RESPOND and P.type == Data),
567 * Send Sync packet acknowledging P.seqno 643 * Send Sync packet acknowledging P.seqno
568 * Drop packet and return 644 * Drop packet and return
569 */ 645 */
570 } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && 646 } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
571 (dh->dccph_type == DCCP_PKT_RESPONSE || 647 dh->dccph_type == DCCP_PKT_RESPONSE) ||
572 dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
573 (dp->dccps_role == DCCP_ROLE_CLIENT && 648 (dp->dccps_role == DCCP_ROLE_CLIENT &&
574 dh->dccph_type == DCCP_PKT_REQUEST) || 649 dh->dccph_type == DCCP_PKT_REQUEST) ||
575 (sk->sk_state == DCCP_RESPOND && 650 (sk->sk_state == DCCP_RESPOND &&
@@ -577,11 +652,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
577 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); 652 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
578 goto discard; 653 goto discard;
579 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { 654 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
580 dccp_rcv_closereq(sk, skb); 655 if (dccp_rcv_closereq(sk, skb))
656 return 0;
581 goto discard; 657 goto discard;
582 } else if (dh->dccph_type == DCCP_PKT_CLOSE) { 658 } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
583 dccp_rcv_close(sk, skb); 659 if (dccp_rcv_close(sk, skb))
584 return 0; 660 return 0;
661 goto discard;
585 } 662 }
586 663
587 switch (sk->sk_state) { 664 switch (sk->sk_state) {
@@ -611,7 +688,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
611 switch (old_state) { 688 switch (old_state) {
612 case DCCP_PARTOPEN: 689 case DCCP_PARTOPEN:
613 sk->sk_state_change(sk); 690 sk->sk_state_change(sk);
614 sk_wake_async(sk, 0, POLL_OUT); 691 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
615 break; 692 break;
616 } 693 }
617 } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { 694 } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index db17b83e8d3e..9e38b0d6195c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -408,7 +408,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
408 408
409 dccp_sync_mss(newsk, dst_mtu(dst)); 409 dccp_sync_mss(newsk, dst_mtu(dst));
410 410
411 __inet_hash(&dccp_hashinfo, newsk, 0); 411 __inet_hash_nolisten(&dccp_hashinfo, newsk);
412 __inet_inherit_port(&dccp_hashinfo, sk, newsk); 412 __inet_inherit_port(&dccp_hashinfo, sk, newsk);
413 413
414 return newsk; 414 return newsk;
@@ -469,7 +469,7 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
469 }; 469 };
470 470
471 security_skb_classify_flow(skb, &fl); 471 security_skb_classify_flow(skb, &fl);
472 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 472 if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) {
473 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 473 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
474 return NULL; 474 return NULL;
475 } 475 }
@@ -600,11 +600,12 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
600 if (req == NULL) 600 if (req == NULL)
601 goto drop; 601 goto drop;
602 602
603 if (dccp_parse_options(sk, skb))
604 goto drop_and_free;
605
606 dccp_reqsk_init(req, skb); 603 dccp_reqsk_init(req, skb);
607 604
605 dreq = dccp_rsk(req);
606 if (dccp_parse_options(sk, dreq, skb))
607 goto drop_and_free;
608
608 if (security_inet_conn_request(sk, skb, req)) 609 if (security_inet_conn_request(sk, skb, req))
609 goto drop_and_free; 610 goto drop_and_free;
610 611
@@ -621,7 +622,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
621 * In fact we defer setting S.GSR, S.SWL, S.SWH to 622 * In fact we defer setting S.GSR, S.SWL, S.SWH to
622 * dccp_create_openreq_child. 623 * dccp_create_openreq_child.
623 */ 624 */
624 dreq = dccp_rsk(req);
625 dreq->dreq_isr = dcb->dccpd_seq; 625 dreq->dreq_isr = dcb->dccpd_seq;
626 dreq->dreq_iss = dccp_v4_init_sequence(skb); 626 dreq->dreq_iss = dccp_v4_init_sequence(skb);
627 dreq->dreq_service = service; 627 dreq->dreq_service = service;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 87c98fb86fa8..f42b75ce7f5c 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -415,11 +415,12 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
415 if (req == NULL) 415 if (req == NULL)
416 goto drop; 416 goto drop;
417 417
418 if (dccp_parse_options(sk, skb))
419 goto drop_and_free;
420
421 dccp_reqsk_init(req, skb); 418 dccp_reqsk_init(req, skb);
422 419
420 dreq = dccp_rsk(req);
421 if (dccp_parse_options(sk, dreq, skb))
422 goto drop_and_free;
423
423 if (security_inet_conn_request(sk, skb, req)) 424 if (security_inet_conn_request(sk, skb, req))
424 goto drop_and_free; 425 goto drop_and_free;
425 426
@@ -449,7 +450,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
449 * In fact we defer setting S.GSR, S.SWL, S.SWH to 450 * In fact we defer setting S.GSR, S.SWL, S.SWH to
450 * dccp_create_openreq_child. 451 * dccp_create_openreq_child.
451 */ 452 */
452 dreq = dccp_rsk(req);
453 dreq->dreq_isr = dcb->dccpd_seq; 453 dreq->dreq_isr = dcb->dccpd_seq;
454 dreq->dreq_iss = dccp_v6_init_sequence(skb); 454 dreq->dreq_iss = dccp_v6_init_sequence(skb);
455 dreq->dreq_service = service; 455 dreq->dreq_service = service;
@@ -994,7 +994,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
994 if (final_p) 994 if (final_p)
995 ipv6_addr_copy(&fl.fl6_dst, final_p); 995 ipv6_addr_copy(&fl.fl6_dst, final_p);
996 996
997 err = __xfrm_lookup(&dst, &fl, sk, 1); 997 err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT);
998 if (err < 0) { 998 if (err < 0) {
999 if (err == -EREMOTE) 999 if (err == -EREMOTE)
1000 err = ip6_dst_blackhole(sk, &dst, &fl); 1000 err = ip6_dst_blackhole(sk, &dst, &fl);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 831b76e08d02..027d1814e1ab 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -117,11 +117,13 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
117 struct dccp_sock *newdp = dccp_sk(newsk); 117 struct dccp_sock *newdp = dccp_sk(newsk);
118 struct dccp_minisock *newdmsk = dccp_msk(newsk); 118 struct dccp_minisock *newdmsk = dccp_msk(newsk);
119 119
120 newdp->dccps_role = DCCP_ROLE_SERVER; 120 newdp->dccps_role = DCCP_ROLE_SERVER;
121 newdp->dccps_hc_rx_ackvec = NULL; 121 newdp->dccps_hc_rx_ackvec = NULL;
122 newdp->dccps_service_list = NULL; 122 newdp->dccps_service_list = NULL;
123 newdp->dccps_service = dreq->dreq_service; 123 newdp->dccps_service = dreq->dreq_service;
124 newicsk->icsk_rto = DCCP_TIMEOUT_INIT; 124 newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
125 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
126 newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
125 127
126 if (dccp_feat_clone(sk, newsk)) 128 if (dccp_feat_clone(sk, newsk))
127 goto out_free; 129 goto out_free;
@@ -200,10 +202,10 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
200 struct request_sock **prev) 202 struct request_sock **prev)
201{ 203{
202 struct sock *child = NULL; 204 struct sock *child = NULL;
205 struct dccp_request_sock *dreq = dccp_rsk(req);
203 206
204 /* Check for retransmitted REQUEST */ 207 /* Check for retransmitted REQUEST */
205 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { 208 if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
206 struct dccp_request_sock *dreq = dccp_rsk(req);
207 209
208 if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) { 210 if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) {
209 dccp_pr_debug("Retransmitted REQUEST\n"); 211 dccp_pr_debug("Retransmitted REQUEST\n");
@@ -227,22 +229,22 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
227 goto drop; 229 goto drop;
228 230
229 /* Invalid ACK */ 231 /* Invalid ACK */
230 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { 232 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) {
231 dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " 233 dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
232 "dreq_iss=%llu\n", 234 "dreq_iss=%llu\n",
233 (unsigned long long) 235 (unsigned long long)
234 DCCP_SKB_CB(skb)->dccpd_ack_seq, 236 DCCP_SKB_CB(skb)->dccpd_ack_seq,
235 (unsigned long long) 237 (unsigned long long) dreq->dreq_iss);
236 dccp_rsk(req)->dreq_iss);
237 goto drop; 238 goto drop;
238 } 239 }
239 240
241 if (dccp_parse_options(sk, dreq, skb))
242 goto drop;
243
240 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); 244 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
241 if (child == NULL) 245 if (child == NULL)
242 goto listen_overflow; 246 goto listen_overflow;
243 247
244 /* FIXME: deal with options */
245
246 inet_csk_reqsk_queue_unlink(sk, req, prev); 248 inet_csk_reqsk_queue_unlink(sk, req, prev);
247 inet_csk_reqsk_queue_removed(sk, req); 249 inet_csk_reqsk_queue_removed(sk, req);
248 inet_csk_reqsk_queue_add(sk, req, child); 250 inet_csk_reqsk_queue_add(sk, req, child);
@@ -303,9 +305,12 @@ EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
303 305
304void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) 306void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
305{ 307{
306 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; 308 struct dccp_request_sock *dreq = dccp_rsk(req);
307 inet_rsk(req)->acked = 0; 309
308 req->rcv_wnd = sysctl_dccp_feat_sequence_window; 310 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
311 inet_rsk(req)->acked = 0;
312 req->rcv_wnd = sysctl_dccp_feat_sequence_window;
313 dreq->dreq_timestamp_echo = 0;
309} 314}
310 315
311EXPORT_SYMBOL_GPL(dccp_reqsk_init); 316EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index d286cffe2c49..d2a84a2fecee 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -46,7 +46,13 @@ static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
46 return value; 46 return value;
47} 47}
48 48
49int dccp_parse_options(struct sock *sk, struct sk_buff *skb) 49/**
50 * dccp_parse_options - Parse DCCP options present in @skb
51 * @sk: client|server|listening dccp socket (when @dreq != NULL)
52 * @dreq: request socket to use during connection setup, or NULL
53 */
54int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
55 struct sk_buff *skb)
50{ 56{
51 struct dccp_sock *dp = dccp_sk(sk); 57 struct dccp_sock *dp = dccp_sk(sk);
52 const struct dccp_hdr *dh = dccp_hdr(skb); 58 const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -92,6 +98,20 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
92 goto out_invalid_option; 98 goto out_invalid_option;
93 } 99 }
94 100
101 /*
102 * CCID-Specific Options (from RFC 4340, sec. 10.3):
103 *
104 * Option numbers 128 through 191 are for options sent from the
105 * HC-Sender to the HC-Receiver; option numbers 192 through 255
106 * are for options sent from the HC-Receiver to the HC-Sender.
107 *
108 * CCID-specific options are ignored during connection setup, as
109 * negotiation may still be in progress (see RFC 4340, 10.3).
110 *
111 */
112 if (dreq != NULL && opt >= 128)
113 goto ignore_option;
114
95 switch (opt) { 115 switch (opt) {
96 case DCCPO_PADDING: 116 case DCCPO_PADDING:
97 break; 117 break;
@@ -112,6 +132,8 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
112 case DCCPO_CHANGE_L: 132 case DCCPO_CHANGE_L:
113 /* fall through */ 133 /* fall through */
114 case DCCPO_CHANGE_R: 134 case DCCPO_CHANGE_R:
135 if (pkt_type == DCCP_PKT_DATA)
136 break;
115 if (len < 2) 137 if (len < 2)
116 goto out_invalid_option; 138 goto out_invalid_option;
117 rc = dccp_feat_change_recv(sk, opt, *value, value + 1, 139 rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
@@ -128,7 +150,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
128 case DCCPO_CONFIRM_L: 150 case DCCPO_CONFIRM_L:
129 /* fall through */ 151 /* fall through */
130 case DCCPO_CONFIRM_R: 152 case DCCPO_CONFIRM_R:
131 if (len < 2) 153 if (pkt_type == DCCP_PKT_DATA)
154 break;
155 if (len < 2) /* FIXME this disallows empty confirm */
132 goto out_invalid_option; 156 goto out_invalid_option;
133 if (dccp_feat_confirm_recv(sk, opt, *value, 157 if (dccp_feat_confirm_recv(sk, opt, *value,
134 value + 1, len - 1)) 158 value + 1, len - 1))
@@ -136,7 +160,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
136 break; 160 break;
137 case DCCPO_ACK_VECTOR_0: 161 case DCCPO_ACK_VECTOR_0:
138 case DCCPO_ACK_VECTOR_1: 162 case DCCPO_ACK_VECTOR_1:
139 if (pkt_type == DCCP_PKT_DATA) 163 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
140 break; 164 break;
141 165
142 if (dccp_msk(sk)->dccpms_send_ack_vector && 166 if (dccp_msk(sk)->dccpms_send_ack_vector &&
@@ -146,15 +170,27 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
146 case DCCPO_TIMESTAMP: 170 case DCCPO_TIMESTAMP:
147 if (len != 4) 171 if (len != 4)
148 goto out_invalid_option; 172 goto out_invalid_option;
149 173 /*
174 * RFC 4340 13.1: "The precise time corresponding to
175 * Timestamp Value zero is not specified". We use
176 * zero to indicate absence of a meaningful timestamp.
177 */
150 opt_val = get_unaligned((__be32 *)value); 178 opt_val = get_unaligned((__be32 *)value);
151 opt_recv->dccpor_timestamp = ntohl(opt_val); 179 if (unlikely(opt_val == 0)) {
152 180 DCCP_WARN("Timestamp with zero value\n");
153 dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; 181 break;
154 dp->dccps_timestamp_time = ktime_get_real(); 182 }
155 183
184 if (dreq != NULL) {
185 dreq->dreq_timestamp_echo = ntohl(opt_val);
186 dreq->dreq_timestamp_time = dccp_timestamp();
187 } else {
188 opt_recv->dccpor_timestamp =
189 dp->dccps_timestamp_echo = ntohl(opt_val);
190 dp->dccps_timestamp_time = dccp_timestamp();
191 }
156 dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", 192 dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
157 dccp_role(sk), opt_recv->dccpor_timestamp, 193 dccp_role(sk), ntohl(opt_val),
158 (unsigned long long) 194 (unsigned long long)
159 DCCP_SKB_CB(skb)->dccpd_ack_seq); 195 DCCP_SKB_CB(skb)->dccpd_ack_seq);
160 break; 196 break;
@@ -194,18 +230,17 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
194 opt_recv->dccpor_elapsed_time = elapsed_time; 230 opt_recv->dccpor_elapsed_time = elapsed_time;
195 break; 231 break;
196 case DCCPO_ELAPSED_TIME: 232 case DCCPO_ELAPSED_TIME:
197 if (len != 2 && len != 4) 233 if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
198 goto out_invalid_option; 234 break;
199
200 if (pkt_type == DCCP_PKT_DATA)
201 continue;
202 235
203 if (len == 2) { 236 if (len == 2) {
204 __be16 opt_val2 = get_unaligned((__be16 *)value); 237 __be16 opt_val2 = get_unaligned((__be16 *)value);
205 elapsed_time = ntohs(opt_val2); 238 elapsed_time = ntohs(opt_val2);
206 } else { 239 } else if (len == 4) {
207 opt_val = get_unaligned((__be32 *)value); 240 opt_val = get_unaligned((__be32 *)value);
208 elapsed_time = ntohl(opt_val); 241 elapsed_time = ntohl(opt_val);
242 } else {
243 goto out_invalid_option;
209 } 244 }
210 245
211 if (elapsed_time > opt_recv->dccpor_elapsed_time) 246 if (elapsed_time > opt_recv->dccpor_elapsed_time)
@@ -214,15 +249,6 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
214 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 249 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
215 dccp_role(sk), elapsed_time); 250 dccp_role(sk), elapsed_time);
216 break; 251 break;
217 /*
218 * From RFC 4340, sec. 10.3:
219 *
220 * Option numbers 128 through 191 are for
221 * options sent from the HC-Sender to the
222 * HC-Receiver; option numbers 192 through 255
223 * are for options sent from the HC-Receiver to
224 * the HC-Sender.
225 */
226 case 128 ... 191: { 252 case 128 ... 191: {
227 const u16 idx = value - options; 253 const u16 idx = value - options;
228 254
@@ -246,7 +272,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
246 "implemented, ignoring", sk, opt, len); 272 "implemented, ignoring", sk, opt, len);
247 break; 273 break;
248 } 274 }
249 275ignore_option:
250 if (opt != DCCPO_MANDATORY) 276 if (opt != DCCPO_MANDATORY)
251 mandatory = 0; 277 mandatory = 0;
252 } 278 }
@@ -382,16 +408,24 @@ int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
382 408
383EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); 409EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
384 410
385static int dccp_insert_option_timestamp_echo(struct sock *sk, 411static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
412 struct dccp_request_sock *dreq,
386 struct sk_buff *skb) 413 struct sk_buff *skb)
387{ 414{
388 struct dccp_sock *dp = dccp_sk(sk);
389 __be32 tstamp_echo; 415 __be32 tstamp_echo;
390 int len, elapsed_time_len;
391 unsigned char *to; 416 unsigned char *to;
392 const suseconds_t delta = ktime_us_delta(ktime_get_real(), 417 u32 elapsed_time, elapsed_time_len, len;
393 dp->dccps_timestamp_time); 418
394 u32 elapsed_time = delta / 10; 419 if (dreq != NULL) {
420 elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
421 tstamp_echo = htonl(dreq->dreq_timestamp_echo);
422 dreq->dreq_timestamp_echo = 0;
423 } else {
424 elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
425 tstamp_echo = htonl(dp->dccps_timestamp_echo);
426 dp->dccps_timestamp_echo = 0;
427 }
428
395 elapsed_time_len = dccp_elapsed_time_len(elapsed_time); 429 elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
396 len = 6 + elapsed_time_len; 430 len = 6 + elapsed_time_len;
397 431
@@ -404,7 +438,6 @@ static int dccp_insert_option_timestamp_echo(struct sock *sk,
404 *to++ = DCCPO_TIMESTAMP_ECHO; 438 *to++ = DCCPO_TIMESTAMP_ECHO;
405 *to++ = len; 439 *to++ = len;
406 440
407 tstamp_echo = htonl(dp->dccps_timestamp_echo);
408 memcpy(to, &tstamp_echo, 4); 441 memcpy(to, &tstamp_echo, 4);
409 to += 4; 442 to += 4;
410 443
@@ -416,8 +449,6 @@ static int dccp_insert_option_timestamp_echo(struct sock *sk,
416 memcpy(to, &var32, 4); 449 memcpy(to, &var32, 4);
417 } 450 }
418 451
419 dp->dccps_timestamp_echo = 0;
420 dp->dccps_timestamp_time = ktime_set(0, 0);
421 return 0; 452 return 0;
422} 453}
423 454
@@ -510,6 +541,18 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
510 return 0; 541 return 0;
511} 542}
512 543
544/* The length of all options needs to be a multiple of 4 (5.8) */
545static void dccp_insert_option_padding(struct sk_buff *skb)
546{
547 int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
548
549 if (padding != 0) {
550 padding = 4 - padding;
551 memset(skb_push(skb, padding), 0, padding);
552 DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
553 }
554}
555
513int dccp_insert_options(struct sock *sk, struct sk_buff *skb) 556int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
514{ 557{
515 struct dccp_sock *dp = dccp_sk(sk); 558 struct dccp_sock *dp = dccp_sk(sk);
@@ -526,10 +569,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
526 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && 569 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
527 dccp_insert_option_ackvec(sk, skb)) 570 dccp_insert_option_ackvec(sk, skb))
528 return -1; 571 return -1;
529
530 if (dp->dccps_timestamp_echo != 0 &&
531 dccp_insert_option_timestamp_echo(sk, skb))
532 return -1;
533 } 572 }
534 573
535 if (dp->dccps_hc_rx_insert_options) { 574 if (dp->dccps_hc_rx_insert_options) {
@@ -553,18 +592,22 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
553 dccp_insert_option_timestamp(sk, skb)) 592 dccp_insert_option_timestamp(sk, skb))
554 return -1; 593 return -1;
555 594
556 /* XXX: insert other options when appropriate */ 595 if (dp->dccps_timestamp_echo != 0 &&
596 dccp_insert_option_timestamp_echo(dp, NULL, skb))
597 return -1;
598
599 dccp_insert_option_padding(skb);
600 return 0;
601}
557 602
558 if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { 603int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
559 /* The length of all options has to be a multiple of 4 */ 604{
560 int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; 605 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
561 606
562 if (padding != 0) { 607 if (dreq->dreq_timestamp_echo != 0 &&
563 padding = 4 - padding; 608 dccp_insert_option_timestamp_echo(NULL, dreq, skb))
564 memset(skb_push(skb, padding), 0, padding); 609 return -1;
565 DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
566 }
567 }
568 610
611 dccp_insert_option_padding(skb);
569 return 0; 612 return 0;
570} 613}
diff --git a/net/dccp/output.c b/net/dccp/output.c
index f49544618f20..3b763db3d863 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -133,15 +133,31 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
133 return -ENOBUFS; 133 return -ENOBUFS;
134} 134}
135 135
136/**
137 * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
138 * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
139 * since the RX CCID is restricted to feedback packets (Acks), which are small
140 * in comparison with the data traffic. A value of 0 means "no current CCMPS".
141 */
142static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
143{
144 const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
145
146 if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
147 return 0;
148 return tx_ccid->ccid_ops->ccid_ccmps;
149}
150
136unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) 151unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
137{ 152{
138 struct inet_connection_sock *icsk = inet_csk(sk); 153 struct inet_connection_sock *icsk = inet_csk(sk);
139 struct dccp_sock *dp = dccp_sk(sk); 154 struct dccp_sock *dp = dccp_sk(sk);
140 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - 155 u32 ccmps = dccp_determine_ccmps(dp);
141 sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); 156 int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
142 157
143 /* Now subtract optional transport overhead */ 158 /* Account for header lengths and IPv4/v6 option overhead */
144 mss_now -= icsk->icsk_ext_hdr_len; 159 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
160 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
145 161
146 /* 162 /*
147 * FIXME: this should come from the CCID infrastructure, where, say, 163 * FIXME: this should come from the CCID infrastructure, where, say,
@@ -151,13 +167,13 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
151 * make it a multiple of 4 167 * make it a multiple of 4
152 */ 168 */
153 169
154 mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 170 cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
155 171
156 /* And store cached results */ 172 /* And store cached results */
157 icsk->icsk_pmtu_cookie = pmtu; 173 icsk->icsk_pmtu_cookie = pmtu;
158 dp->dccps_mss_cache = mss_now; 174 dp->dccps_mss_cache = cur_mps;
159 175
160 return mss_now; 176 return cur_mps;
161} 177}
162 178
163EXPORT_SYMBOL_GPL(dccp_sync_mss); 179EXPORT_SYMBOL_GPL(dccp_sync_mss);
@@ -170,7 +186,7 @@ void dccp_write_space(struct sock *sk)
170 wake_up_interruptible(sk->sk_sleep); 186 wake_up_interruptible(sk->sk_sleep);
171 /* Should agree with poll, otherwise some programs break */ 187 /* Should agree with poll, otherwise some programs break */
172 if (sock_writeable(sk)) 188 if (sock_writeable(sk))
173 sk_wake_async(sk, 2, POLL_OUT); 189 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
174 190
175 read_unlock(&sk->sk_callback_lock); 191 read_unlock(&sk->sk_callback_lock);
176} 192}
@@ -303,7 +319,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
303 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; 319 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
304 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; 320 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
305 321
306 if (dccp_insert_options(sk, skb)) { 322 if (dccp_insert_options_rsk(dreq, skb)) {
307 kfree_skb(skb); 323 kfree_skb(skb);
308 return NULL; 324 return NULL;
309 } 325 }
@@ -391,7 +407,7 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
391 * FIXME: what if rebuild_header fails? 407 * FIXME: what if rebuild_header fails?
392 * Should we be doing a rebuild_header here? 408 * Should we be doing a rebuild_header here?
393 */ 409 */
394 int err = inet_sk_rebuild_header(sk); 410 int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
395 411
396 if (err != 0) 412 if (err != 0)
397 return err; 413 return err;
@@ -567,14 +583,27 @@ void dccp_send_close(struct sock *sk, const int active)
567 583
568 /* Reserve space for headers and prepare control bits. */ 584 /* Reserve space for headers and prepare control bits. */
569 skb_reserve(skb, sk->sk_prot->max_header); 585 skb_reserve(skb, sk->sk_prot->max_header);
570 DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? 586 if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
571 DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; 587 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
588 else
589 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
572 590
573 if (active) { 591 if (active) {
574 dccp_write_xmit(sk, 1); 592 dccp_write_xmit(sk, 1);
575 dccp_skb_entail(sk, skb); 593 dccp_skb_entail(sk, skb);
576 dccp_transmit_skb(sk, skb_clone(skb, prio)); 594 dccp_transmit_skb(sk, skb_clone(skb, prio));
577 /* FIXME do we need a retransmit timer here? */ 595 /*
596 * Retransmission timer for active-close: RFC 4340, 8.3 requires
597 * retransmitting the Close/CloseReq until the CLOSING/CLOSEREQ
598 * state can be left. The initial timeout is 2 RTTs.
599 * Since RTT measurement is done by the CCIDs, there is no easy
600 * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
601 * is too low (200ms); we use a high value to avoid unnecessary
602 * retransmissions when the link RTT is > 0.2 seconds.
603 * FIXME: Let main module sample RTTs and use that instead.
604 */
605 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
606 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
578 } else 607 } else
579 dccp_transmit_skb(sk, skb); 608 dccp_transmit_skb(sk, skb);
580} 609}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7a3bea9c28c1..0bed4a6095b7 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -60,8 +60,7 @@ void dccp_set_state(struct sock *sk, const int state)
60{ 60{
61 const int oldstate = sk->sk_state; 61 const int oldstate = sk->sk_state;
62 62
63 dccp_pr_debug("%s(%p) %-10.10s -> %s\n", 63 dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk,
64 dccp_role(sk), sk,
65 dccp_state_name(oldstate), dccp_state_name(state)); 64 dccp_state_name(oldstate), dccp_state_name(state));
66 WARN_ON(state == oldstate); 65 WARN_ON(state == oldstate);
67 66
@@ -72,7 +71,8 @@ void dccp_set_state(struct sock *sk, const int state)
72 break; 71 break;
73 72
74 case DCCP_CLOSED: 73 case DCCP_CLOSED:
75 if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN) 74 if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
75 oldstate == DCCP_CLOSING)
76 DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); 76 DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
77 77
78 sk->sk_prot->unhash(sk); 78 sk->sk_prot->unhash(sk);
@@ -93,6 +93,24 @@ void dccp_set_state(struct sock *sk, const int state)
93 93
94EXPORT_SYMBOL_GPL(dccp_set_state); 94EXPORT_SYMBOL_GPL(dccp_set_state);
95 95
96static void dccp_finish_passive_close(struct sock *sk)
97{
98 switch (sk->sk_state) {
99 case DCCP_PASSIVE_CLOSE:
100 /* Node (client or server) has received Close packet. */
101 dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
102 dccp_set_state(sk, DCCP_CLOSED);
103 break;
104 case DCCP_PASSIVE_CLOSEREQ:
105 /*
106 * Client received CloseReq. We set the `active' flag so that
107 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
108 */
109 dccp_send_close(sk, 1);
110 dccp_set_state(sk, DCCP_CLOSING);
111 }
112}
113
96void dccp_done(struct sock *sk) 114void dccp_done(struct sock *sk)
97{ 115{
98 dccp_set_state(sk, DCCP_CLOSED); 116 dccp_set_state(sk, DCCP_CLOSED);
@@ -134,14 +152,17 @@ EXPORT_SYMBOL_GPL(dccp_packet_name);
134const char *dccp_state_name(const int state) 152const char *dccp_state_name(const int state)
135{ 153{
136 static char *dccp_state_names[] = { 154 static char *dccp_state_names[] = {
137 [DCCP_OPEN] = "OPEN", 155 [DCCP_OPEN] = "OPEN",
138 [DCCP_REQUESTING] = "REQUESTING", 156 [DCCP_REQUESTING] = "REQUESTING",
139 [DCCP_PARTOPEN] = "PARTOPEN", 157 [DCCP_PARTOPEN] = "PARTOPEN",
140 [DCCP_LISTEN] = "LISTEN", 158 [DCCP_LISTEN] = "LISTEN",
141 [DCCP_RESPOND] = "RESPOND", 159 [DCCP_RESPOND] = "RESPOND",
142 [DCCP_CLOSING] = "CLOSING", 160 [DCCP_CLOSING] = "CLOSING",
143 [DCCP_TIME_WAIT] = "TIME_WAIT", 161 [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
144 [DCCP_CLOSED] = "CLOSED", 162 [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
163 [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
164 [DCCP_TIME_WAIT] = "TIME_WAIT",
165 [DCCP_CLOSED] = "CLOSED",
145 }; 166 };
146 167
147 if (state >= DCCP_MAX_STATES) 168 if (state >= DCCP_MAX_STATES)
@@ -174,6 +195,19 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
174 195
175 dccp_minisock_init(&dp->dccps_minisock); 196 dccp_minisock_init(&dp->dccps_minisock);
176 197
198 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
199 icsk->icsk_syn_retries = sysctl_dccp_request_retries;
200 sk->sk_state = DCCP_CLOSED;
201 sk->sk_write_space = dccp_write_space;
202 icsk->icsk_sync_mss = dccp_sync_mss;
203 dp->dccps_mss_cache = 536;
204 dp->dccps_rate_last = jiffies;
205 dp->dccps_role = DCCP_ROLE_UNDEFINED;
206 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
207 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
208
209 dccp_init_xmit_timers(sk);
210
177 /* 211 /*
178 * FIXME: We're hardcoding the CCID, and doing this at this point makes 212 * FIXME: We're hardcoding the CCID, and doing this at this point makes
179 * the listening (master) sock get CCID control blocks, which is not 213 * the listening (master) sock get CCID control blocks, which is not
@@ -213,18 +247,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
213 INIT_LIST_HEAD(&dmsk->dccpms_conf); 247 INIT_LIST_HEAD(&dmsk->dccpms_conf);
214 } 248 }
215 249
216 dccp_init_xmit_timers(sk);
217 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
218 icsk->icsk_syn_retries = sysctl_dccp_request_retries;
219 sk->sk_state = DCCP_CLOSED;
220 sk->sk_write_space = dccp_write_space;
221 icsk->icsk_sync_mss = dccp_sync_mss;
222 dp->dccps_mss_cache = 536;
223 dp->dccps_rate_last = jiffies;
224 dp->dccps_role = DCCP_ROLE_UNDEFINED;
225 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
226 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
227
228 return 0; 250 return 0;
229} 251}
230 252
@@ -275,6 +297,12 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
275 return inet_csk_listen_start(sk, backlog); 297 return inet_csk_listen_start(sk, backlog);
276} 298}
277 299
300static inline int dccp_need_reset(int state)
301{
302 return state != DCCP_CLOSED && state != DCCP_LISTEN &&
303 state != DCCP_REQUESTING;
304}
305
278int dccp_disconnect(struct sock *sk, int flags) 306int dccp_disconnect(struct sock *sk, int flags)
279{ 307{
280 struct inet_connection_sock *icsk = inet_csk(sk); 308 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -285,10 +313,15 @@ int dccp_disconnect(struct sock *sk, int flags)
285 if (old_state != DCCP_CLOSED) 313 if (old_state != DCCP_CLOSED)
286 dccp_set_state(sk, DCCP_CLOSED); 314 dccp_set_state(sk, DCCP_CLOSED);
287 315
288 /* ABORT function of RFC793 */ 316 /*
317 * This corresponds to the ABORT function of RFC793, sec. 3.8
318 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
319 */
289 if (old_state == DCCP_LISTEN) { 320 if (old_state == DCCP_LISTEN) {
290 inet_csk_listen_stop(sk); 321 inet_csk_listen_stop(sk);
291 /* FIXME: do the active reset thing */ 322 } else if (dccp_need_reset(old_state)) {
323 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
324 sk->sk_err = ECONNRESET;
292 } else if (old_state == DCCP_REQUESTING) 325 } else if (old_state == DCCP_REQUESTING)
293 sk->sk_err = ECONNRESET; 326 sk->sk_err = ECONNRESET;
294 327
@@ -518,6 +551,12 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
518 (struct dccp_so_feat __user *) 551 (struct dccp_so_feat __user *)
519 optval); 552 optval);
520 break; 553 break;
554 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
555 if (dp->dccps_role != DCCP_ROLE_SERVER)
556 err = -EOPNOTSUPP;
557 else
558 dp->dccps_server_timewait = (val != 0);
559 break;
521 case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ 560 case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */
522 if (val < 0 || val > 15) 561 if (val < 0 || val > 15)
523 err = -EINVAL; 562 err = -EINVAL;
@@ -618,15 +657,15 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
618 (__be32 __user *)optval, optlen); 657 (__be32 __user *)optval, optlen);
619 case DCCP_SOCKOPT_GET_CUR_MPS: 658 case DCCP_SOCKOPT_GET_CUR_MPS:
620 val = dp->dccps_mss_cache; 659 val = dp->dccps_mss_cache;
621 len = sizeof(val); 660 break;
661 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
662 val = dp->dccps_server_timewait;
622 break; 663 break;
623 case DCCP_SOCKOPT_SEND_CSCOV: 664 case DCCP_SOCKOPT_SEND_CSCOV:
624 val = dp->dccps_pcslen; 665 val = dp->dccps_pcslen;
625 len = sizeof(val);
626 break; 666 break;
627 case DCCP_SOCKOPT_RECV_CSCOV: 667 case DCCP_SOCKOPT_RECV_CSCOV:
628 val = dp->dccps_pcrlen; 668 val = dp->dccps_pcrlen;
629 len = sizeof(val);
630 break; 669 break;
631 case 128 ... 191: 670 case 128 ... 191:
632 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, 671 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
@@ -638,6 +677,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
638 return -ENOPROTOOPT; 677 return -ENOPROTOOPT;
639 } 678 }
640 679
680 len = sizeof(val);
641 if (put_user(len, optlen) || copy_to_user(optval, &val, len)) 681 if (put_user(len, optlen) || copy_to_user(optval, &val, len))
642 return -EFAULT; 682 return -EFAULT;
643 683
@@ -748,19 +788,26 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
748 788
749 dh = dccp_hdr(skb); 789 dh = dccp_hdr(skb);
750 790
751 if (dh->dccph_type == DCCP_PKT_DATA || 791 switch (dh->dccph_type) {
752 dh->dccph_type == DCCP_PKT_DATAACK) 792 case DCCP_PKT_DATA:
793 case DCCP_PKT_DATAACK:
753 goto found_ok_skb; 794 goto found_ok_skb;
754 795
755 if (dh->dccph_type == DCCP_PKT_RESET || 796 case DCCP_PKT_CLOSE:
756 dh->dccph_type == DCCP_PKT_CLOSE) { 797 case DCCP_PKT_CLOSEREQ:
757 dccp_pr_debug("found fin ok!\n"); 798 if (!(flags & MSG_PEEK))
799 dccp_finish_passive_close(sk);
800 /* fall through */
801 case DCCP_PKT_RESET:
802 dccp_pr_debug("found fin (%s) ok!\n",
803 dccp_packet_name(dh->dccph_type));
758 len = 0; 804 len = 0;
759 goto found_fin_ok; 805 goto found_fin_ok;
806 default:
807 dccp_pr_debug("packet_type=%s\n",
808 dccp_packet_name(dh->dccph_type));
809 sk_eat_skb(sk, skb, 0);
760 } 810 }
761 dccp_pr_debug("packet_type=%s\n",
762 dccp_packet_name(dh->dccph_type));
763 sk_eat_skb(sk, skb, 0);
764verify_sock_status: 811verify_sock_status:
765 if (sock_flag(sk, SOCK_DONE)) { 812 if (sock_flag(sk, SOCK_DONE)) {
766 len = 0; 813 len = 0;
@@ -862,34 +909,38 @@ out:
862 909
863EXPORT_SYMBOL_GPL(inet_dccp_listen); 910EXPORT_SYMBOL_GPL(inet_dccp_listen);
864 911
865static const unsigned char dccp_new_state[] = { 912static void dccp_terminate_connection(struct sock *sk)
866 /* current state: new state: action: */
867 [0] = DCCP_CLOSED,
868 [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
869 [DCCP_REQUESTING] = DCCP_CLOSED,
870 [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
871 [DCCP_LISTEN] = DCCP_CLOSED,
872 [DCCP_RESPOND] = DCCP_CLOSED,
873 [DCCP_CLOSING] = DCCP_CLOSED,
874 [DCCP_TIME_WAIT] = DCCP_CLOSED,
875 [DCCP_CLOSED] = DCCP_CLOSED,
876};
877
878static int dccp_close_state(struct sock *sk)
879{ 913{
880 const int next = dccp_new_state[sk->sk_state]; 914 u8 next_state = DCCP_CLOSED;
881 const int ns = next & DCCP_STATE_MASK;
882 915
883 if (ns != sk->sk_state) 916 switch (sk->sk_state) {
884 dccp_set_state(sk, ns); 917 case DCCP_PASSIVE_CLOSE:
918 case DCCP_PASSIVE_CLOSEREQ:
919 dccp_finish_passive_close(sk);
920 break;
921 case DCCP_PARTOPEN:
922 dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
923 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
924 /* fall through */
925 case DCCP_OPEN:
926 dccp_send_close(sk, 1);
885 927
886 return next & DCCP_ACTION_FIN; 928 if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
929 !dccp_sk(sk)->dccps_server_timewait)
930 next_state = DCCP_ACTIVE_CLOSEREQ;
931 else
932 next_state = DCCP_CLOSING;
933 /* fall through */
934 default:
935 dccp_set_state(sk, next_state);
936 }
887} 937}
888 938
889void dccp_close(struct sock *sk, long timeout) 939void dccp_close(struct sock *sk, long timeout)
890{ 940{
891 struct dccp_sock *dp = dccp_sk(sk); 941 struct dccp_sock *dp = dccp_sk(sk);
892 struct sk_buff *skb; 942 struct sk_buff *skb;
943 u32 data_was_unread = 0;
893 int state; 944 int state;
894 945
895 lock_sock(sk); 946 lock_sock(sk);
@@ -912,16 +963,21 @@ void dccp_close(struct sock *sk, long timeout)
912 * descriptor close, not protocol-sourced closes, because the 963 * descriptor close, not protocol-sourced closes, because the
913 *reader process may not have drained the data yet! 964 *reader process may not have drained the data yet!
914 */ 965 */
915 /* FIXME: check for unread data */
916 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { 966 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
967 data_was_unread += skb->len;
917 __kfree_skb(skb); 968 __kfree_skb(skb);
918 } 969 }
919 970
920 if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 971 if (data_was_unread) {
972 /* Unread data was tossed, send an appropriate Reset Code */
973 DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
974 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
975 dccp_set_state(sk, DCCP_CLOSED);
976 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
921 /* Check zero linger _after_ checking for unread data. */ 977 /* Check zero linger _after_ checking for unread data. */
922 sk->sk_prot->disconnect(sk, 0); 978 sk->sk_prot->disconnect(sk, 0);
923 } else if (dccp_close_state(sk)) { 979 } else if (sk->sk_state != DCCP_CLOSED) {
924 dccp_send_close(sk, 1); 980 dccp_terminate_connection(sk);
925 } 981 }
926 982
927 sk_stream_wait_close(sk, timeout); 983 sk_stream_wait_close(sk, timeout);
@@ -948,24 +1004,6 @@ adjudge_to_death:
948 if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) 1004 if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
949 goto out; 1005 goto out;
950 1006
951 /*
952 * The last release_sock may have processed the CLOSE or RESET
953 * packet moving sock to CLOSED state, if not we have to fire
954 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
955 * in draft-ietf-dccp-spec-11. -acme
956 */
957 if (sk->sk_state == DCCP_CLOSING) {
958 /* FIXME: should start at 2 * RTT */
959 /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
960 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
961 inet_csk(sk)->icsk_rto,
962 DCCP_RTO_MAX);
963#if 0
964 /* Yeah, we should use sk->sk_prot->orphan_count, etc */
965 dccp_set_state(sk, DCCP_CLOSED);
966#endif
967 }
968
969 if (sk->sk_state == DCCP_CLOSED) 1007 if (sk->sk_state == DCCP_CLOSED)
970 inet_csk_destroy_sock(sk); 1008 inet_csk_destroy_sock(sk);
971 1009
@@ -981,7 +1019,7 @@ EXPORT_SYMBOL_GPL(dccp_close);
981 1019
982void dccp_shutdown(struct sock *sk, int how) 1020void dccp_shutdown(struct sock *sk, int how)
983{ 1021{
984 dccp_pr_debug("entry\n"); 1022 dccp_pr_debug("called shutdown(%x)\n", how);
985} 1023}
986 1024
987EXPORT_SYMBOL_GPL(dccp_shutdown); 1025EXPORT_SYMBOL_GPL(dccp_shutdown);
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index c62c05039f69..21295993fdb8 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -100,41 +100,19 @@ static struct ctl_table dccp_default_table[] = {
100 { .ctl_name = 0, } 100 { .ctl_name = 0, }
101}; 101};
102 102
103static struct ctl_table dccp_table[] = { 103static struct ctl_path dccp_path[] = {
104 { 104 { .procname = "net", .ctl_name = CTL_NET, },
105 .ctl_name = NET_DCCP_DEFAULT, 105 { .procname = "dccp", .ctl_name = NET_DCCP, },
106 .procname = "default", 106 { .procname = "default", .ctl_name = NET_DCCP_DEFAULT, },
107 .mode = 0555, 107 { }
108 .child = dccp_default_table,
109 },
110 { .ctl_name = 0, },
111};
112
113static struct ctl_table dccp_dir_table[] = {
114 {
115 .ctl_name = NET_DCCP,
116 .procname = "dccp",
117 .mode = 0555,
118 .child = dccp_table,
119 },
120 { .ctl_name = 0, },
121};
122
123static struct ctl_table dccp_root_table[] = {
124 {
125 .ctl_name = CTL_NET,
126 .procname = "net",
127 .mode = 0555,
128 .child = dccp_dir_table,
129 },
130 { .ctl_name = 0, },
131}; 108};
132 109
133static struct ctl_table_header *dccp_table_header; 110static struct ctl_table_header *dccp_table_header;
134 111
135int __init dccp_sysctl_init(void) 112int __init dccp_sysctl_init(void)
136{ 113{
137 dccp_table_header = register_sysctl_table(dccp_root_table); 114 dccp_table_header = register_sysctl_paths(dccp_path,
115 dccp_default_table);
138 116
139 return dccp_table_header != NULL ? 0 : -ENOMEM; 117 return dccp_table_header != NULL ? 0 : -ENOMEM;
140} 118}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 3af067354bd4..8703a792b560 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -280,9 +280,8 @@ static void dccp_init_write_xmit_timer(struct sock *sk)
280{ 280{
281 struct dccp_sock *dp = dccp_sk(sk); 281 struct dccp_sock *dp = dccp_sk(sk);
282 282
283 init_timer(&dp->dccps_xmit_timer); 283 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
284 dp->dccps_xmit_timer.data = (unsigned long)sk; 284 (unsigned long)sk);
285 dp->dccps_xmit_timer.function = dccp_write_xmit_timer;
286} 285}
287 286
288void dccp_init_xmit_timers(struct sock *sk) 287void dccp_init_xmit_timers(struct sock *sk)