Diffstat (limited to 'net/dccp')
-rw-r--r--  net/dccp/Makefile       |   4
-rw-r--r--  net/dccp/ackvec.c       | 415
-rw-r--r--  net/dccp/ackvec.h       |  38
-rw-r--r--  net/dccp/ccids/ccid2.c  | 134
-rw-r--r--  net/dccp/ccids/ccid2.h  |   2
-rw-r--r--  net/dccp/dccp.h         |  21
-rw-r--r--  net/dccp/input.c        |  34
-rw-r--r--  net/dccp/ipv4.c         |  13
-rw-r--r--  net/dccp/options.c      |  43
-rw-r--r--  net/dccp/output.c       |  22
-rw-r--r--  net/dccp/proto.c        |  71
-rw-r--r--  net/dccp/qpolicy.c      | 137
12 files changed, 566 insertions(+), 368 deletions(-)
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2991efcc8dea..5c8362b037ed 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
 
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
-
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+	   qpolicy.o
 #
 # CCID algorithms to be used by dccp.ko
 #
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index abaf241c7353..25b7a8d1ad58 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -9,18 +9,10 @@
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; version 2 of the License;
  */
-
-#include "ackvec.h"
 #include "dccp.h"
-
-#include <linux/init.h>
-#include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
-#include <net/sock.h>
-
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
@@ -92,6 +84,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 	return 0;
 }
 
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+						     const u64 ackno)
+{
+	struct dccp_ackvec_record *avr;
+	/*
+	 * Exploit that records are inserted in descending order of sequence
+	 * number, start with the oldest record first. If @ackno is `before'
+	 * the earliest ack_ackno, the packet is too old to be considered.
+	 */
+	list_for_each_entry_reverse(avr, av_list, avr_node) {
+		if (avr->avr_ack_seqno == ackno)
+			return avr;
+		if (before48(ackno, avr->avr_ack_seqno))
+			break;
+	}
+	return NULL;
+}
+
 /*
  * Buffer index and length computation using modulo-buffersize arithmetic.
  * Note that, as pointers move from right to left, head is `before' tail.
@@ -113,248 +123,253 @@ u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
 	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
 }
 
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
  */
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
-						 const unsigned int packets,
-						 const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+				   u64 seqno, enum dccp_ackvec_states state)
 {
-	long gap;
-	long new_head;
+	u16 ptr = av->av_buf_head;
 
-	if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN)
-		return -ENOBUFS;
+	BUG_ON(distance > 0);
+	if (unlikely(dccp_ackvec_is_empty(av)))
+		return;
 
-	gap = packets - 1;
-	new_head = av->av_buf_head - packets;
+	do {
+		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
 
-	if (new_head < 0) {
-		if (gap > 0) {
-			memset(av->av_buf, DCCPAV_NOT_RECEIVED,
-			       gap + new_head + 1);
-			gap = -new_head;
+		if (distance + runlen >= 0) {
+			/*
+			 * Only update the state if packet has not been received
+			 * yet. This is OK as per the second table in RFC 4340,
+			 * 11.4.1; i.e. here we are using the following table:
+			 *                     RECEIVED
+			 *                      0   1   3
+			 *              S     +---+---+---+
+			 *              T   0 | 0 | 0 | 0 |
+			 *              O     +---+---+---+
+			 *              R   1 | 1 | 1 | 1 |
+			 *              E     +---+---+---+
+			 *              D   3 | 0 | 1 | 3 |
+			 *                    +---+---+---+
+			 * The "Not Received" state was set by reserve_seats().
+			 */
+			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+				av->av_buf[ptr] = state;
+			else
+				dccp_pr_debug("Not changing %llu state to %u\n",
+					      (unsigned long long)seqno, state);
+			break;
 		}
-		new_head += DCCPAV_MAX_ACKVEC_LEN;
-	}
 
-	av->av_buf_head = new_head;
+		distance += runlen + 1;
+		ptr = __ackvec_idx_add(ptr, 1);
 
-	if (gap > 0)
-		memset(av->av_buf + av->av_buf_head + 1,
-		       DCCPAV_NOT_RECEIVED, gap);
+	} while (ptr != av->av_buf_tail);
+}
 
-	av->av_buf[av->av_buf_head] = state;
-	av->av_vec_len += packets;
-	return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+	    len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+	/* check for buffer wrap-around */
+	if (num > len) {
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+		start = 0;
+		num -= len;
+	}
+	if (num)
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
 }
 
-/*
- * Implements the RFC 4340, Appendix A
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
  */
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-		    const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+				u64 seqno, enum dccp_ackvec_states state)
 {
-	u8 *cur_head = av->av_buf + av->av_buf_head,
-	   *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-	/*
-	 * Check at the right places if the buffer is full, if it is, tell the
-	 * caller to start dropping packets till the HC-Sender acks our ACK
-	 * vectors, when we will free up space in av_buf.
-	 *
-	 * We may well decide to do buffer compression, etc, but for now lets
-	 * just drop.
-	 *
-	 * From Appendix A.1.1 (`New Packets'):
-	 *
-	 *	Of course, the circular buffer may overflow, either when the
-	 *	HC-Sender is sending data at a very high rate, when the
-	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
-	 *	or when the HC-Sender is forgetting to acknowledge those acks
-	 *	(so the HC-Receiver is unable to clean up old state). In this
-	 *	case, the HC-Receiver should either compress the buffer (by
-	 *	increasing run lengths when possible), transfer its state to
-	 *	a larger buffer, or, as a last resort, drop all received
-	 *	packets, without processing them whatsoever, until its buffer
-	 *	shrinks again.
-	 */
+	u32 num_cells = num_packets;
 
-	/* See if this is the first ackno being inserted */
-	if (av->av_vec_len == 0) {
-		*cur_head = state;
-		av->av_vec_len = 1;
-	} else if (after48(ackno, av->av_buf_ackno)) {
-		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
+	if (num_packets > DCCPAV_BURST_THRESH) {
+		u32 lost_packets = num_packets - 1;
 
+		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
 		/*
-		 * Look if the state of this packet is the same as the
-		 * previous ackno and if so if we can bump the head len.
+		 * We received 1 packet and have a loss of size "num_packets-1"
+		 * which we squeeze into num_cells-1 rather than reserving an
+		 * entire byte for each lost packet.
+		 * The reason is that the vector grows in O(burst_length); when
+		 * it grows too large there will be no room left for the payload.
+		 * This is a trade-off: if a few packets out of the burst show
+		 * up later, their state will not be changed; it is simply too
+		 * costly to reshuffle/reallocate/copy the buffer each time.
+		 * Should such problems persist, we will need to switch to a
+		 * different underlying data structure.
 		 */
-		if (delta == 1 && dccp_ackvec_state(cur_head) == state &&
-		    dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN)
-			*cur_head += 1;
-		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
-			return -ENOBUFS;
-	} else {
-		/*
-		 * A.1.2. Old Packets
-		 *
-		 *	When a packet with Sequence Number S <= buf_ackno
-		 *	arrives, the HC-Receiver will scan the table for
-		 *	the byte corresponding to S. (Indexing structures
-		 *	could reduce the complexity of this scan.)
-		 */
-		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
+		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+			u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
 
-		while (1) {
-			const u8 len = dccp_ackvec_runlen(cur_head);
-			/*
-			 * valid packets not yet in av_buf have a reserved
-			 * entry, with a len equal to 0.
-			 */
-			if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) {
-				dccp_pr_debug("Found %llu reserved seat!\n",
-					      (unsigned long long)ackno);
-				*cur_head = state;
-				goto out;
-			}
-			/* len == 0 means one packet */
-			if (delta < len + 1)
-				goto out_duplicate;
-
-			delta -= len + 1;
-			if (++cur_head == buf_end)
-				cur_head = av->av_buf;
+			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+			lost_packets -= len;
 		}
 	}
 
-	av->av_buf_ackno = ackno;
-out:
-	return 0;
+	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+		av->av_overflow = true;
+	}
+
+	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+	if (av->av_overflow)
+		av->av_buf_tail = av->av_buf_head;
+
+	av->av_buf[av->av_buf_head] = state;
+	av->av_buf_ackno = seqno;
 
-out_duplicate:
-	/* Duplicate packet */
-	dccp_pr_debug("Received a dup or already considered lost "
-		      "packet: %llu\n", (unsigned long long)ackno);
-	return -EILSEQ;
+	if (num_packets > 1)
+		dccp_ackvec_reserve_seats(av, num_packets - 1);
 }
 
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
-				     struct dccp_ackvec_record *avr)
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
 {
-	struct dccp_ackvec_record *next;
+	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
 
-	/* sort out vector length */
-	if (av->av_buf_head <= avr->avr_ack_ptr)
-		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
-	else
-		av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 -
-				 av->av_buf_head + avr->avr_ack_ptr;
+	if (dccp_ackvec_is_empty(av)) {
+		dccp_ackvec_add_new(av, 1, seqno, state);
+		av->av_tail_ackno = seqno;
 
-	/* free records */
-	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del(&avr->avr_node);
-		kmem_cache_free(dccp_ackvec_record_slab, avr);
-	}
-}
-
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
-				 const u64 ackno)
-{
-	struct dccp_ackvec_record *avr;
+	} else {
+		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+		u8 *current_head = av->av_buf + av->av_buf_head;
 
-	/*
-	 * If we traverse backwards, it should be faster when we have large
-	 * windows. We will be receiving ACKs for stuff we sent a while back
-	 * -sorbo.
-	 */
-	list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
-		if (ackno == avr->avr_ack_seqno) {
-			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
-				      "ack_ackno=%llu, ACKED!\n",
-				      dccp_role(sk), avr->avr_ack_runlen,
-				      (unsigned long long)avr->avr_ack_seqno,
-				      (unsigned long long)avr->avr_ack_ackno);
-			dccp_ackvec_throw_record(av, avr);
-			break;
-		} else if (avr->avr_ack_seqno > ackno)
-			break; /* old news */
+		if (num_packets == 1 &&
+		    dccp_ackvec_state(current_head) == state &&
+		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
+
+			*current_head += 1;
+			av->av_buf_ackno = seqno;
+
+		} else if (num_packets > 0) {
+			dccp_ackvec_add_new(av, num_packets, seqno, state);
+		} else {
+			dccp_ackvec_update_old(av, num_packets, seqno, state);
+		}
 	}
 }
 
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
-					    struct sock *sk, u64 *ackno,
-					    const unsigned char len,
-					    const unsigned char *vector)
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
 {
-	unsigned char i;
-	struct dccp_ackvec_record *avr;
+	struct dccp_ackvec_record *avr, *next;
+	u8 runlen_now, eff_runlen;
+	s64 delta;
 
-	/* Check if we actually sent an ACK vector */
-	if (list_empty(&av->av_records))
-		return;
+	avr = dccp_ackvec_lookup(&av->av_records, ackno);
+	if (avr == NULL)
+		return;
+	/*
+	 * Deal with outdated acknowledgments: this arises when e.g. there are
+	 * several old records and the acks from the peer come in slowly. In
+	 * that case we may still have records that pre-date tail_ackno.
+	 */
+	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+	if (delta < 0)
+		goto free_records;
+	/*
+	 * Deal with overlapping Ack Vectors: don't subtract more than the
+	 * number of packets between tail_ackno and ack_ackno.
+	 */
+	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
 
-	i = len;
+	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
 	/*
-	 * XXX
-	 * I think it might be more efficient to work backwards. See comment on
-	 * rcv_ackno. -sorbo.
+	 * The run length of Ack Vector cells does not decrease over time. If
+	 * the run length is the same as at the time the Ack Vector was sent, we
+	 * free the ack_ptr cell. That cell can however not be freed if the run
+	 * length has increased: in this case we need to move the tail pointer
+	 * backwards (towards higher indices), to its next-oldest neighbour.
 	 */
-	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
-	while (i--) {
-		const u8 rl = dccp_ackvec_runlen(vector);
-		u64 ackno_end_rl;
+	if (runlen_now > eff_runlen) {
 
-		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
+		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
 
+		/* This move may not have cleared the overflow flag. */
+		if (av->av_overflow)
+			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+	} else {
+		av->av_buf_tail = avr->avr_ack_ptr;
 		/*
-		 * If our AVR sequence number is greater than the ack, go
-		 * forward in the AVR list until it is not so.
+		 * We have made sure that avr points to a valid cell within the
+		 * buffer. This cell is either older than head, or equals head
+		 * (empty buffer): in both cases we no longer have any overflow.
 		 */
-		list_for_each_entry_from(avr, &av->av_records, avr_node) {
-			if (!after48(avr->avr_ack_seqno, *ackno))
-				goto found;
-		}
-		/* End of the av_records list, not found, exit */
-		break;
-found:
-		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
-			if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) {
-				dccp_pr_debug("%s ACK vector 0, len=%d, "
-					      "ack_seqno=%llu, ack_ackno=%llu, "
-					      "ACKED!\n",
-					      dccp_role(sk), len,
-					      (unsigned long long)
-					      avr->avr_ack_seqno,
-					      (unsigned long long)
-					      avr->avr_ack_ackno);
-				dccp_ackvec_throw_record(av, avr);
-				break;
-			}
-			/*
-			 * If it wasn't received, continue scanning... we might
-			 * find another one.
-			 */
-		}
+		av->av_overflow = 0;
+	}
 
-		dccp_set_seqno(ackno, ackno_end_rl - 1);
-		++vector;
+	/*
+	 * The peer has acknowledged up to and including ack_ackno. Hence the
+	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+	 */
+	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
 	}
 }
 
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
 {
-	if (len > DCCP_SINGLE_OPT_MAXLEN)
-		return -1;
+	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+	if (new == NULL)
+		return -ENOBUFS;
+	new->vec = vec;
+	new->len = len;
+	new->nonce = nonce;
 
-	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
-	dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
-					ackno, len, value);
-	return 0;
+	list_add_tail(&new->node, head);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+	struct dccp_ackvec_parsed *cur, *next;
+
+	list_for_each_entry_safe(cur, next, parsed_chunks, node)
+		kfree(cur);
+	INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
 
 int __init dccp_ackvec_init(void)
 {
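
Each byte in av_buf packs the RFC 4340, 11.4 receive state into its top two bits (0 = received, 1 = ECN-marked, 3 = not yet received) and a run length into the lower six, which is what dccp_ackvec_state() and dccp_ackvec_runlen() extract in the code above. A standalone sketch of this encoding, illustration only and not part of the patch:

/*
 * Illustrative sketch of an Ack Vector cell, mirroring the
 * dccp_ackvec_state()/dccp_ackvec_runlen() helpers in ackvec.h.
 */
#include <stdio.h>

#define DCCPAV_MAX_RUNLEN 0x3F	/* low 6 bits: run length 0..63 */

static unsigned char cell_state(unsigned char cell)  { return cell & 0xC0; }
static unsigned char cell_runlen(unsigned char cell) { return cell & DCCPAV_MAX_RUNLEN; }

int main(void)
{
	/* 0x40 = ECN-marked (DCCPAV_ECN_MARKED), run length 4 */
	unsigned char cell = 0x40 | 4;

	printf("state=0x%02x runlen=%u (covers %u packets)\n",
	       cell_state(cell), cell_runlen(cell), cell_runlen(cell) + 1);
	return 0;
}

A cell with run length r thus covers r+1 consecutive sequence numbers, which is how the buffer summarises long stretches of received packets in single bytes.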
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 23880be8fc29..e2ab0627a5ff 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -29,6 +29,9 @@
 /* Estimated minimum average Ack Vector length - used for updating MPS */
 #define DCCPAV_MIN_OPTLEN	16
 
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
+
 enum dccp_ackvec_states {
 	DCCPAV_RECEIVED =	0x00,
 	DCCPAV_ECN_MARKED =	0x40,
@@ -61,7 +64,6 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
  * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
  * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
  * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
- * @av_veclen: length of the live portion of @av_buf
  */
 struct dccp_ackvec {
 	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
@@ -72,7 +74,6 @@ struct dccp_ackvec {
 	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
 	u8			av_overflow:1;
 	struct list_head	av_records;
-	u16			av_vec_len;
 };
 
 /** struct dccp_ackvec_record - Records information about sent Ack Vectors
@@ -98,29 +99,38 @@ struct dccp_ackvec_record {
 	u8		 avr_ack_nonce:1;
 };
 
-struct sock;
-struct sk_buff;
-
 extern int dccp_ackvec_init(void);
 extern void dccp_ackvec_exit(void);
 
 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
 extern void dccp_ackvec_free(struct dccp_ackvec *av);
 
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-			   const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
-					struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-			     u64 *ackno, const u8 opt,
-			     const u8 *value, const u8 len);
-
+extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
 extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
 extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
 
 static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
 	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
+
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+	u8		 *vec,
+			 len,
+			 nonce:1;
+	struct list_head node;
+};
+
+extern int dccp_ackvec_parsed_add(struct list_head *head,
+				  u8 *vec, u8 len, u8 nonce);
+extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
 #endif /* _ACKVEC_H */
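
The head/tail indices above move through av_buf modulo DCCPAV_MAX_ACKVEC_LEN, with the head travelling towards lower indices as new packets arrive. A minimal sketch of that modulo arithmetic, mirroring the in-tree __ackvec_idx_add()/__ackvec_idx_sub() helpers (buffer length shortened for illustration):

/* Sketch of the modulo-buffersize index arithmetic used by the Ack
 * Vector buffer; LEN is a stand-in for DCCPAV_MAX_ACKVEC_LEN.
 */
#include <assert.h>

#define LEN 8

static unsigned idx_add(unsigned a, unsigned b) { return (a + b) % LEN; }
static unsigned idx_sub(unsigned a, unsigned b) { return idx_add(a, LEN - b); }

int main(void)
{
	/* head moves "down" as entries are added, wrapping at 0 */
	unsigned head = 1;

	head = idx_sub(head, 3);	/* reserve three new cells */
	assert(head == LEN - 2);
	assert(idx_add(head, 3) == 1);	/* and back again */
	return 0;
}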
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index cb1b4a0d1877..e96d5e810039 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -246,68 +246,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
246#endif 246#endif
247} 247}
248 248
249/* XXX Lame code duplication!
250 * returns -1 if none was found.
251 * else returns the next offset to use in the function call.
252 */
253static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
254 unsigned char **vec, unsigned char *veclen)
255{
256 const struct dccp_hdr *dh = dccp_hdr(skb);
257 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
258 unsigned char *opt_ptr;
259 const unsigned char *opt_end = (unsigned char *)dh +
260 (dh->dccph_doff * 4);
261 unsigned char opt, len;
262 unsigned char *value;
263
264 BUG_ON(offset < 0);
265 options += offset;
266 opt_ptr = options;
267 if (opt_ptr >= opt_end)
268 return -1;
269
270 while (opt_ptr != opt_end) {
271 opt = *opt_ptr++;
272 len = 0;
273 value = NULL;
274
275 /* Check if this isn't a single byte option */
276 if (opt > DCCPO_MAX_RESERVED) {
277 if (opt_ptr == opt_end)
278 goto out_invalid_option;
279
280 len = *opt_ptr++;
281 if (len < 3)
282 goto out_invalid_option;
283 /*
284 * Remove the type and len fields, leaving
285 * just the value size
286 */
287 len -= 2;
288 value = opt_ptr;
289 opt_ptr += len;
290
291 if (opt_ptr > opt_end)
292 goto out_invalid_option;
293 }
294
295 switch (opt) {
296 case DCCPO_ACK_VECTOR_0:
297 case DCCPO_ACK_VECTOR_1:
298 *vec = value;
299 *veclen = len;
300 return offset + (opt_ptr - options);
301 }
302 }
303
304 return -1;
305
306out_invalid_option:
307 DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
308 return -1;
309}
310
311/** 249/**
312 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm 250 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
313 * This code is almost identical with TCP's tcp_rtt_estimator(), since 251 * This code is almost identical with TCP's tcp_rtt_estimator(), since
@@ -432,16 +370,28 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
 		ccid2_change_l_ack_ratio(sk, hc->tx_cwnd);
 }
 
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+	switch (option) {
+	case DCCPO_ACK_VECTOR_0:
+	case DCCPO_ACK_VECTOR_1:
+		return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+					      option - DCCPO_ACK_VECTOR_0);
+	}
+	return 0;
+}
+
 static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
 	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+	struct dccp_ackvec_parsed *avp;
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
-	unsigned char *vector;
-	unsigned char veclen;
-	int offset = 0;
 	int done = 0;
 	unsigned int maxincr = 0;
 
@@ -475,17 +425,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* check forward path congestion */
-	/* still didn't send out new data packets */
-	if (hc->tx_seqh == hc->tx_seqt)
+	if (dccp_packet_without_ack(skb))
 		return;
 
-	switch (DCCP_SKB_CB(skb)->dccpd_type) {
-	case DCCP_PKT_ACK:
-	case DCCP_PKT_DATAACK:
-		break;
-	default:
-		return;
-	}
+	/* still didn't send out new data packets */
+	if (hc->tx_seqh == hc->tx_seqt)
+		goto done;
 
 	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
 	if (after48(ackno, hc->tx_high_ack))
@@ -509,15 +454,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
 
 	/* go through all ack vectors */
-	while ((offset = ccid2_ackvector(sk, skb, offset,
-					 &vector, &veclen)) != -1) {
+	list_for_each_entry(avp, &hc->tx_av_chunks, node) {
 		/* go through this ack vector */
-		while (veclen--) {
-			u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector));
+		for (; avp->len--; avp->vec++) {
+			u64 ackno_end_rl = SUB48(ackno,
+						 dccp_ackvec_runlen(avp->vec));
 
-			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
 				       (unsigned long long)ackno,
-				       (unsigned long long)ackno_end_rl);
+				       dccp_ackvec_state(avp->vec) >> 6,
+				       dccp_ackvec_runlen(avp->vec));
 			/* if the seqno we are analyzing is larger than the
 			 * current ackno, then move towards the tail of our
 			 * seqnos.
@@ -536,7 +482,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = dccp_ackvec_state(vector);
+				const u8 state = dccp_ackvec_state(avp->vec);
 
 				/* new packet received or marked */
 				if (state != DCCPAV_NOT_RECEIVED &&
@@ -563,7 +509,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 					break;
 
 				ackno = SUB48(ackno_end_rl, 1);
-				vector++;
 			}
 			if (done)
 				break;
@@ -631,10 +576,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		sk_stop_timer(sk, &hc->tx_rtotimer);
 	else
 		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-
+done:
 	/* check if incoming Acks allow pending packets to be sent */
 	if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
 		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -663,6 +609,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	hc->tx_last_cong = ccid2_time_stamp;
 	setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
 			(unsigned long)sk);
+	INIT_LIST_HEAD(&hc->tx_av_chunks);
 	return 0;
 }
 
@@ -696,16 +643,17 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 }
 
 struct ccid_operations ccid2_ops = {
-	.ccid_id		= DCCPC_CCID2,
-	.ccid_name		= "TCP-like",
-	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
-	.ccid_hc_tx_init	= ccid2_hc_tx_init,
-	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
-	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
-	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
-	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
-	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
-	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
+	.ccid_id		  = DCCPC_CCID2,
+	.ccid_name		  = "TCP-like",
+	.ccid_hc_tx_obj_size	  = sizeof(struct ccid2_hc_tx_sock),
+	.ccid_hc_tx_init	  = ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	  = ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	  = ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	  = ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+	.ccid_hc_tx_packet_recv	  = ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_obj_size	  = sizeof(struct ccid2_hc_rx_sock),
+	.ccid_hc_rx_packet_recv	  = ccid2_hc_rx_packet_recv,
 };
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
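
The CCID-2 flow after this change: dccp_parse_options() hands each Ack Vector to ccid2_hc_tx_parse_options(), which queues it on tx_av_chunks via dccp_ackvec_parsed_add(); ccid2_hc_tx_packet_recv() then walks the list, and dccp_ackvec_parsed_cleanup() frees it once the skb is fully processed. A userspace model of that lifecycle, using a simplified singly-linked list instead of the kernel's list_head (illustration only):

#include <stdio.h>
#include <stdlib.h>

struct parsed {
	unsigned char *vec, len, nonce;
	struct parsed *next;
};

static struct parsed *chunks, **tail = &chunks;

/* mirrors dccp_ackvec_parsed_add(): append in arrival order (FIFO) */
static int parsed_add(unsigned char *vec, unsigned char len, unsigned char nonce)
{
	struct parsed *new = malloc(sizeof(*new));

	if (new == NULL)
		return -1;
	new->vec = vec; new->len = len; new->nonce = nonce; new->next = NULL;
	*tail = new;
	tail = &new->next;
	return 0;
}

/* mirrors dccp_ackvec_parsed_cleanup(): free and reset the list */
static void parsed_cleanup(void)
{
	while (chunks) {
		struct parsed *cur = chunks;
		chunks = cur->next;
		free(cur);
	}
	tail = &chunks;
}

int main(void)
{
	unsigned char av0[] = { 0x04 }, av1[] = { 0x40, 0x02 };

	parsed_add(av0, sizeof(av0), 0);	/* "option parsing" phase */
	parsed_add(av1, sizeof(av1), 1);
	for (struct parsed *p = chunks; p; p = p->next)	/* "packet_recv" */
		printf("%u cell(s), nonce %u\n", p->len, p->nonce);
	parsed_cleanup();			/* end-of-skb housekeeping */
	return 0;
}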
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 25cb6b216eda..e9985dafc2c7 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -55,6 +55,7 @@ struct ccid2_seq {
  * @tx_rtt_seq: to decay RTTVAR at most once per flight
  * @tx_rpseq: last consecutive seqno
  * @tx_rpdupack: dupacks since rpseq
+ * @tx_av_chunks: list of Ack Vectors received on current skb
  */
 struct ccid2_hc_tx_sock {
 	u32			tx_cwnd;
@@ -79,6 +80,7 @@ struct ccid2_hc_tx_sock {
 	int			tx_rpdupack;
 	u32			tx_last_cong;
 	u64			tx_high_ack;
+	struct list_head	tx_av_chunks;
 };
 
 static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 19fafd597465..45087052d894 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -93,9 +93,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
 #define DCCP_FALLBACK_RTT	(USEC_PER_SEC / 5)
 #define DCCP_SANE_RTT_MAX	(3 * USEC_PER_SEC)
 
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
-
 /* sysctl variables for DCCP */
 extern int sysctl_dccp_request_retries;
 extern int sysctl_dccp_retries1;
@@ -203,12 +200,7 @@ struct dccp_mib {
 DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
 #define DCCP_INC_STATS(field)	   SNMP_INC_STATS(dccp_statistics, field)
 #define DCCP_INC_STATS_BH(field)   SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
 #define DCCP_DEC_STATS(field)	   SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
-			SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
-			SNMP_ADD_STATS_USER(dccp_statistics, field, val)
 
 /*
  * Checksumming routines
@@ -243,6 +235,19 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
+/*
+ * TX Packet Dequeueing Interface
+ */
+extern void		dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+extern bool		dccp_qpolicy_full(struct sock *sk);
+extern void		dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+extern struct sk_buff	*dccp_qpolicy_top(struct sock *sk);
+extern struct sk_buff	*dccp_qpolicy_pop(struct sock *sk);
+extern bool		dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
 extern void dccp_write_xmit(struct sock *sk);
 extern void dccp_write_space(struct sock *sk);
 extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index c7aeeba859d4..15af247ea007 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -160,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
 	dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
 }
 
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
 
-	if (dp->dccps_hc_rx_ackvec != NULL)
-		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
-					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	if (av == NULL)
+		return;
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	dccp_ackvec_input(av, skb);
 }
 
 static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -239,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
 		dccp_update_gsr(sk, seqno);
 
 		if (dh->dccph_type != DCCP_PKT_SYNC &&
-		    (ackno != DCCP_PKT_WITHOUT_ACK_SEQ))
+		    ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+		    after48(ackno, dp->dccps_gar))
 			dp->dccps_gar = ackno;
 	} else {
 		unsigned long now = jiffies;
@@ -365,21 +368,13 @@ discard:
 int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			 const struct dccp_hdr *dh, const unsigned len)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-
 	if (dccp_check_seqno(sk, skb))
 		goto discard;
 
 	if (dccp_parse_options(sk, NULL, skb))
 		return 1;
 
-	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-		dccp_event_ack_recv(sk, skb);
-
-	if (dp->dccps_hc_rx_ackvec != NULL &&
-	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-			    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
-		goto discard;
-
+	dccp_handle_ackvec_processing(sk, skb);
 	dccp_deliver_input_to_ccids(sk, skb);
 
 	return __dccp_rcv_established(sk, skb, dh, len);
@@ -631,14 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dccp_parse_options(sk, NULL, skb))
 			return 1;
 
-		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-			dccp_event_ack_recv(sk, skb);
-
-		if (dp->dccps_hc_rx_ackvec != NULL &&
-		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
-			goto discard;
-
+		dccp_handle_ackvec_processing(sk, skb);
 		dccp_deliver_input_to_ccids(sk, skb);
 	}
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3f69ea114829..45a434f94169 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -462,15 +462,12 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
 {
 	struct rtable *rt;
 	struct flowi fl = { .oif = skb_rtable(skb)->rt_iif,
-			    .nl_u = { .ip4_u =
-				      { .daddr = ip_hdr(skb)->saddr,
-					.saddr = ip_hdr(skb)->daddr,
-					.tos = RT_CONN_FLAGS(sk) } },
+			    .fl4_dst = ip_hdr(skb)->saddr,
+			    .fl4_src = ip_hdr(skb)->daddr,
+			    .fl4_tos = RT_CONN_FLAGS(sk),
 			    .proto = sk->sk_protocol,
-			    .uli_u = { .ports =
-				       { .sport = dccp_hdr(skb)->dccph_dport,
-					 .dport = dccp_hdr(skb)->dccph_sport }
-				     }
+			    .fl_ip_sport = dccp_hdr(skb)->dccph_dport,
+			    .fl_ip_dport = dccp_hdr(skb)->dccph_sport
 			  };
 
 	security_skb_classify_flow(skb, &fl);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 5adeeed5e0d2..f06ffcfc8d71 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
-	u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
 	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
 	unsigned char *opt_ptr = options;
 	const unsigned char *opt_end = (unsigned char *)dh +
@@ -129,14 +128,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			if (rc)
 				goto out_featneg_failed;
 			break;
-		case DCCPO_ACK_VECTOR_0:
-		case DCCPO_ACK_VECTOR_1:
-			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
-				break;
-			if (dp->dccps_hc_rx_ackvec != NULL &&
-			    dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
-				goto out_invalid_option;
-			break;
 		case DCCPO_TIMESTAMP:
 			if (len != 4)
 				goto out_invalid_option;
@@ -226,6 +217,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 						     pkt_type, opt, value, len))
 				goto out_invalid_option;
 			break;
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
+				break;
+			/*
+			 * Ack vectors are processed by the TX CCID if it is
+			 * interested. The RX CCID need not parse Ack Vectors,
+			 * since it is only interested in clearing old state.
+			 * Fall through.
+			 */
 		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
 						     pkt_type, opt, value, len))
@@ -429,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 	const u16 buflen = dccp_ackvec_buflen(av);
 	/* Figure out how many options do we need to represent the ackvec */
 	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
@@ -437,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	const unsigned char *tail, *from;
 	unsigned char *to;
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+	if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+			  dccp_packet_name(dcb->dccpd_type));
 		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+	}
+	/*
+	 * Since Ack Vectors are variable-length, we can not always predict
+	 * their size. To catch exception cases where the space is running out
+	 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+	 */
+	if (len > DCCPAV_MIN_OPTLEN &&
+	    len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+		DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+			  "MPS=%u ==> reduce payload size?\n", len, skb->len,
+			  dcb->dccpd_opt_len, dp->dccps_mss_cache);
+		dp->dccps_sync_scheduled = 1;
+		return 0;
+	}
+	dcb->dccpd_opt_len += len;
 
 	to  = skb_push(skb, len);
 	len = buflen;
@@ -481,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
 	 */
-	if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce))
+	if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
 		return -ENOBUFS;
 	return 0;
 }
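
For reference, the option length checked above is derived from the buffer length: an Ack Vector of buflen cells is split into nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN) options, and assuming each option carries a two-byte (type, length) header, the total is len = buflen + 2 * nr_opts. A worked example (values illustrative):

#include <stdio.h>

#define DCCP_SINGLE_OPT_MAXLEN 253	/* 255-byte option minus 2-byte header */

int main(void)
{
	unsigned buflen = 300;	/* live cells in av_buf (example value) */
	unsigned nr_opts = (buflen + DCCP_SINGLE_OPT_MAXLEN - 1) /
			   DCCP_SINGLE_OPT_MAXLEN;
	unsigned len = buflen + 2 * nr_opts;

	/* 300 cells -> 2 options, 304 bytes total */
	printf("%u cells -> %u option(s), %u bytes total\n",
	       buflen, nr_opts, len);
	return 0;
}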
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 45b91853f5ae..784d30210543 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -242,7 +242,7 @@ static void dccp_xmit_packet(struct sock *sk)
 {
 	int err, len;
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
+	struct sk_buff *skb = dccp_qpolicy_pop(sk);
 
 	if (unlikely(skb == NULL))
 		return;
@@ -283,6 +283,15 @@ static void dccp_xmit_packet(struct sock *sk)
 	 * any local drop will eventually be reported via receiver feedback.
 	 */
 	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+	/*
+	 * If the CCID needs to transfer additional header options out-of-band
+	 * (e.g. Ack Vectors or feature-negotiation options), it activates this
+	 * flag to schedule a Sync. The Sync will automatically incorporate all
+	 * currently pending header options, thus clearing the backlog.
+	 */
+	if (dp->dccps_sync_scheduled)
+		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
 }
 
 /**
@@ -336,7 +345,7 @@ void dccp_write_xmit(struct sock *sk)
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
 
-	while ((skb = skb_peek(&sk->sk_write_queue))) {
+	while ((skb = dccp_qpolicy_top(sk))) {
 		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 
 		switch (ccid_packet_dequeue_eval(rc)) {
@@ -350,8 +359,7 @@ void dccp_write_xmit(struct sock *sk)
 			dccp_xmit_packet(sk);
 			break;
 		case CCID_PACKET_ERR:
-			skb_dequeue(&sk->sk_write_queue);
-			kfree_skb(skb);
+			dccp_qpolicy_drop(sk, skb);
 			dccp_pr_debug("packet discarded due to err=%d\n", rc);
 		}
 	}
@@ -636,6 +644,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
 	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
 	DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
 
+	/*
+	 * Clear the flag in case the Sync was scheduled for out-of-band data,
+	 * such as carrying a long Ack Vector.
+	 */
+	dccp_sk(sk)->dccps_sync_scheduled = 0;
+
 	dccp_transmit_skb(sk, skb);
 }
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ef343d53fcea..152975d942d9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -185,6 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
 	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
+	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
 
 	dccp_init_xmit_timers(sk);
 
@@ -532,6 +533,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_RECV_CSCOV:
 		err = dccp_setsockopt_cscov(sk, val, true);
 		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		if (sk->sk_state != DCCP_CLOSED)
+			err = -EISCONN;
+		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+			err = -EINVAL;
+		else
+			dp->dccps_qpolicy = val;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		if (val < 0)
+			err = -EINVAL;
+		else
+			dp->dccps_tx_qlen = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -639,6 +654,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_RECV_CSCOV:
 		val = dp->dccps_pcrlen;
 		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		val = dp->dccps_qpolicy;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		val = dp->dccps_tx_qlen;
+		break;
 	case 128 ... 191:
 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
 					     len, (u32 __user *)optval, optlen);
@@ -681,6 +702,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
 #endif
 
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+	/*
+	 * Assign an (opaque) qpolicy priority value to skb->priority.
+	 *
+	 * We are overloading this skb field for use with the qpolicy subsystem.
+	 * The skb->priority is normally used for the SO_PRIORITY option, which
+	 * is initialised from sk_priority. Since the assignment of sk_priority
+	 * to skb->priority happens later (on layer 3), we overload this field
+	 * for use with queueing priorities as long as the skb is on layer 4.
+	 * The default priority value (if nothing is set) is 0.
+	 */
+	skb->priority = 0;
+
+	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_DCCP)
+			continue;
+
+		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+			return -EINVAL;
+
+		switch (cmsg->cmsg_type) {
+		case DCCP_SCM_PRIORITY:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+				return -EINVAL;
+			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len)
 {
@@ -696,8 +758,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	lock_sock(sk);
 
-	if (sysctl_dccp_tx_qlen &&
-	    (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+	if (dccp_qpolicy_full(sk)) {
 		rc = -EAGAIN;
 		goto out_release;
 	}
@@ -725,7 +786,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (rc != 0)
 		goto out_discard;
 
-	skb_queue_tail(&sk->sk_write_queue, skb);
+	rc = dccp_msghdr_parse(msg, skb);
+	if (rc != 0)
+		goto out_discard;
+
+	dccp_qpolicy_push(sk, skb);
 	/*
 	 * The xmit_timer is set if the TX CCID is rate-based and will expire
 	 * when congestion control permits to release further packets into the
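
From userspace, the two new socket options and the DCCP_SCM_PRIORITY cmsg combine as in the sketch below. It assumes the uapi constants added by this patchset in <linux/dccp.h> (DCCP_SOCKOPT_QPOLICY_ID, DCCP_SOCKOPT_QPOLICY_TXQLEN, DCCPQ_POLICY_PRIO, DCCP_SCM_PRIORITY); error handling is trimmed:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/dccp.h>

#ifndef SOL_DCCP
#define SOL_DCCP 269		/* value from linux/socket.h */
#endif

/* send one packet tagged with an (opaque) queueing priority */
static ssize_t send_prio(int s, const void *data, size_t len, __u32 prio)
{
	char cbuf[CMSG_SPACE(sizeof(prio))];
	struct iovec iov = { .iov_base = (void *)data, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf,
			      .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_DCCP;
	cmsg->cmsg_type	 = DCCP_SCM_PRIORITY;
	cmsg->cmsg_len	 = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));
	return sendmsg(s, &msg, 0);
}

int setup_qpolicy(int s)
{
	int policy = DCCPQ_POLICY_PRIO, qlen = 16;

	/* must be set before the connection is established (-EISCONN) */
	if (setsockopt(s, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_ID,
		       &policy, sizeof(policy)) < 0)
		return -1;
	return setsockopt(s, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_TXQLEN,
			  &qlen, sizeof(qlen));
}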
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 000000000000..63c30bfa4703
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ *  net/dccp/qpolicy.c
+ *
+ *  Policy-based packet dequeueing interface for DCCP.
+ *
+ *  Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License v2
+ *  as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ *	Simple Dequeueing Policy:
+ *	If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+	skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_tx_qlen &&
+	       sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+	return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ *	Priority-based Dequeueing Policy:
+ *	If tx_qlen is different from 0 and the queue has reached its upper bound
+ *	of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+	struct sk_buff *skb, *best = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, skb)
+		if (best == NULL || skb->priority > best->priority)
+			best = skb;
+	return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+	struct sk_buff *skb, *worst = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, skb)
+		if (worst == NULL || skb->priority < worst->priority)
+			worst = skb;
+	return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+	if (qpolicy_simple_full(sk))
+		dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+	return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top:  peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+	void		(*push)	(struct sock *sk, struct sk_buff *skb);
+	bool		(*full) (struct sock *sk);
+	struct sk_buff*	(*top)  (struct sock *sk);
+	__be32		params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+	[DCCPQ_POLICY_SIMPLE] = {
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_simple_full,
+		.top    = qpolicy_simple_top,
+		.params = 0,
+	},
+	[DCCPQ_POLICY_PRIO] = {
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_prio_full,
+		.top    = qpolicy_prio_best_skb,
+		.params = DCCP_SCM_PRIORITY,
+	},
+};
+
+/*
+ *	Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+	qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb != NULL) {
+		skb_unlink(skb, &sk->sk_write_queue);
+		kfree_skb(skb);
+	}
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+	struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+	if (skb != NULL) {
+		/* Clear any skb fields that we used internally */
+		skb->priority = 0;
+		skb_unlink(skb, &sk->sk_write_queue);
+	}
+	return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+	/* check if exactly one bit is set */
+	if (!param || (param & (param - 1)))
+		return false;
+	return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
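
Note on the prio policy's admission rule: qpolicy_prio_full() drops the lowest-priority packet already queued and then reports the queue as not full, so a newly written packet is always admitted. A standalone illustration of that rule, not part of the patch:

#include <stdio.h>

#define QLEN 3

int main(void)
{
	unsigned queue[QLEN] = { 5, 1, 3 };	/* priorities, queue full */
	unsigned incoming = 4, worst = 0;

	for (unsigned i = 1; i < QLEN; i++)	/* find lowest priority */
		if (queue[i] < queue[worst])
			worst = i;
	printf("dropping prio %u, admitting prio %u\n",
	       queue[worst], incoming);
	queue[worst] = incoming;		/* slot reused for new skb */
	return 0;
}

The qpol_table dispatch also keeps the design extensible: a new policy only needs its own push/full/top operations plus a params bit naming the cmsg types it accepts.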