author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/dccp
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent    6a00f206debf8a5c8899055726ad127dbeeed098
Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'net/dccp')
 net/dccp/Kconfig                     |    4
 net/dccp/Makefile                    |    4
 net/dccp/ackvec.c                    |  616
 net/dccp/ackvec.h                    |  151
 net/dccp/ccid.h                      |   86
 net/dccp/ccids/Kconfig               |   31
 net/dccp/ccids/ccid2.c               |  444
 net/dccp/ccids/ccid2.h               |   42
 net/dccp/ccids/ccid3.c               |  268
 net/dccp/ccids/ccid3.h               |   51
 net/dccp/ccids/lib/loss_interval.c   |    2
 net/dccp/ccids/lib/packet_history.c  |   39
 net/dccp/ccids/lib/packet_history.h  |   22
 net/dccp/ccids/lib/tfrc.h            |    1
 net/dccp/ccids/lib/tfrc_equation.c   |   14
 net/dccp/dccp.h                      |   84
 net/dccp/feat.c                      |   10
 net/dccp/feat.h                      |    1
 net/dccp/input.c                     |   65
 net/dccp/ipv4.c                      |  102
 net/dccp/ipv6.c                      |  208
 net/dccp/minisocks.c                 |   30
 net/dccp/options.c                   |  133
 net/dccp/output.c                    |  251
 net/dccp/probe.c                     |    1
 net/dccp/proto.c                     |  142
 net/dccp/qpolicy.c                   |  137
 net/dccp/sysctl.c                    |    4
 net/dccp/timer.c                     |   27
 29 files changed, 1531 insertions(+), 1439 deletions(-)
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index ad6dffd9070e..b75968a04017 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -49,7 +49,9 @@ config NET_DCCPPROBE
 	  what was just said, you don't need it: say N.
 
 	  Documentation on how to use DCCP connection probing can be found
-	  at http://linux-net.osdl.org/index.php/DccpProbe
+	  at:
+
+	  http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
 
 	  To compile this code as a module, choose M here: the
 	  module will be called dccp_probe.
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2991efcc8dea..5c8362b037ed 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
 
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
-
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+	   qpolicy.o
 #
 # CCID algorithms to be used by dccp.ko
 #
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 92a6fcb40d7d..25b7a8d1ad58 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,444 +1,375 @@
 /*
  *  net/dccp/ackvec.c
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  *
  *	This program is free software; you can redistribute it and/or modify it
  *	under the terms of the GNU General Public License as published by the
  *	Free Software Foundation; version 2 of the License;
  */
-
-#include "ackvec.h"
 #include "dccp.h"
-
-#include <linux/init.h>
-#include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
-#include <net/sock.h>
-
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 {
-	struct dccp_ackvec_record *avr =
-			kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
 
-	if (avr != NULL)
-		INIT_LIST_HEAD(&avr->avr_node);
-
-	return avr;
+	if (av != NULL) {
+		av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+		INIT_LIST_HEAD(&av->av_records);
+	}
+	return av;
 }
 
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
 {
-	if (unlikely(avr == NULL))
-		return;
-	/* Check if deleting a linked record */
-	WARN_ON(!list_empty(&avr->avr_node));
-	kmem_cache_free(dccp_ackvec_record_slab, avr);
+	struct dccp_ackvec_record *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+		kmem_cache_free(dccp_ackvec_record_slab, cur);
+	INIT_LIST_HEAD(&av->av_records);
 }
 
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
-				   struct dccp_ackvec_record *avr)
+void dccp_ackvec_free(struct dccp_ackvec *av)
 {
-	/*
-	 * AVRs are sorted by seqno. Since we are sending them in order, we
-	 * just add the AVR at the head of the list.
-	 * -sorbo.
-	 */
-	if (!list_empty(&av->av_records)) {
-		const struct dccp_ackvec_record *head =
-					list_entry(av->av_records.next,
-						   struct dccp_ackvec_record,
-						   avr_node);
-		BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
+	if (likely(av != NULL)) {
+		dccp_ackvec_purge_records(av);
+		kmem_cache_free(dccp_ackvec_slab, av);
 	}
-
-	list_add(&avr->avr_node, &av->av_records);
 }
 
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
+ * @av:		Ack Vector records to update
+ * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
-	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts, i;
-	u32 elapsed_time;
-	const unsigned char *tail, *from;
-	unsigned char *to;
 	struct dccp_ackvec_record *avr;
-	suseconds_t delta;
-
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
-		return -1;
-
-	delta = ktime_us_delta(ktime_get_real(), av->av_time);
-	elapsed_time = delta / 10;
 
-	if (elapsed_time != 0 &&
-	    dccp_insert_option_elapsed_time(skb, elapsed_time))
-		return -1;
-
-	avr = dccp_ackvec_record_new();
+	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 	if (avr == NULL)
-		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
-	to   = skb_push(skb, len);
-	len  = av->av_vec_len;
-	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
-
-	for (i = 0; i < nr_opts; ++i) {
-		int copylen = len;
-
-		if (len > DCCP_SINGLE_OPT_MAXLEN)
-			copylen = DCCP_SINGLE_OPT_MAXLEN;
-
-		*to++ = DCCPO_ACK_VECTOR_0;
-		*to++ = copylen + 2;
-
-		/* Check if buf_head wraps */
-		if (from + copylen > tail) {
-			const u16 tailsize = tail - from;
-
-			memcpy(to, from, tailsize);
-			to	+= tailsize;
-			len	-= tailsize;
-			copylen	-= tailsize;
-			from	= av->av_buf;
-		}
-
-		memcpy(to, from, copylen);
-		from += copylen;
-		to   += copylen;
-		len  -= copylen;
-	}
+		return -ENOBUFS;
 
+	avr->avr_ack_seqno  = seqno;
+	avr->avr_ack_ptr    = av->av_buf_head;
+	avr->avr_ack_ackno  = av->av_buf_ackno;
+	avr->avr_ack_nonce  = nonce_sum;
+	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
 	/*
-	 * From RFC 4340, A.2:
-	 *
-	 *	For each acknowledgement it sends, the HC-Receiver will add an
-	 *	acknowledgement record. ack_seqno will equal the HC-Receiver
-	 *	sequence number it used for the ack packet; ack_ptr will equal
-	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
-	 *	equal buf_nonce.
+	 * When the buffer overflows, we keep no more than one record. This is
+	 * the simplest way of disambiguating sender-Acks dating from before the
+	 * overflow from sender-Acks which refer to after the overflow; a simple
+	 * solution is preferable here since we are handling an exception.
 	 */
-	avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-	avr->avr_ack_ptr   = av->av_buf_head;
-	avr->avr_ack_ackno = av->av_buf_ackno;
-	avr->avr_ack_nonce = av->av_buf_nonce;
-	avr->avr_sent_len  = av->av_vec_len;
-
-	dccp_ackvec_insert_avr(av, avr);
+	if (av->av_overflow)
+		dccp_ackvec_purge_records(av);
+	/*
+	 * Since GSS is incremented for each packet, the list is automatically
+	 * arranged in descending order of @ack_seqno.
+	 */
+	list_add(&avr->avr_node, &av->av_records);
 
-	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
-		      "ack_ackno=%llu\n",
-		      dccp_role(sk), avr->avr_sent_len,
+	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
 		      (unsigned long long)avr->avr_ack_seqno,
-		      (unsigned long long)avr->avr_ack_ackno);
+		      (unsigned long long)avr->avr_ack_ackno,
+		      avr->avr_ack_runlen);
 	return 0;
 }
 
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+						     const u64 ackno)
 {
-	struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
-	if (av != NULL) {
-		av->av_buf_head  = DCCP_MAX_ACKVEC_LEN - 1;
-		av->av_buf_ackno = UINT48_MAX + 1;
-		av->av_buf_nonce = 0;
-		av->av_time	 = ktime_set(0, 0);
-		av->av_vec_len	 = 0;
-		INIT_LIST_HEAD(&av->av_records);
+	struct dccp_ackvec_record *avr;
+	/*
+	 * Exploit that records are inserted in descending order of sequence
+	 * number, start with the oldest record first. If @ackno is `before'
+	 * the earliest ack_ackno, the packet is too old to be considered.
+	 */
+	list_for_each_entry_reverse(avr, av_list, avr_node) {
+		if (avr->avr_ack_seqno == ackno)
+			return avr;
+		if (before48(ackno, avr->avr_ack_seqno))
+			break;
 	}
-
-	return av;
+	return NULL;
 }
 
-void dccp_ackvec_free(struct dccp_ackvec *av)
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
 {
-	if (unlikely(av == NULL))
-		return;
-
-	if (!list_empty(&av->av_records)) {
-		struct dccp_ackvec_record *avr, *next;
-
-		list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
-			list_del_init(&avr->avr_node);
-			dccp_ackvec_record_delete(avr);
-		}
-	}
-
-	kmem_cache_free(dccp_ackvec_slab, av);
+	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
 }
 
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
-				   const u32 index)
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
 {
-	return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
+	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
 }
 
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
-				 const u32 index)
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
 {
-	return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
+	if (unlikely(av->av_overflow))
+		return DCCPAV_MAX_ACKVEC_LEN;
+	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
 }
 
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old  -  Update previous state as per RFC 4340, 11.4.1
+ * @av:		non-empty buffer to update
+ * @distance:	negative or zero distance of @seqno from buf_ackno downward
+ * @seqno:	the (old) sequence number whose record is to be updated
+ * @state:	state in which packet carrying @seqno was received
  */
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
-						 const unsigned int packets,
-						 const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+				   u64 seqno, enum dccp_ackvec_states state)
 {
-	long gap;
-	long new_head;
+	u16 ptr = av->av_buf_head;
 
-	if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
-		return -ENOBUFS;
+	BUG_ON(distance > 0);
+	if (unlikely(dccp_ackvec_is_empty(av)))
+		return;
 
-	gap	 = packets - 1;
-	new_head = av->av_buf_head - packets;
+	do {
+		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
 
-	if (new_head < 0) {
-		if (gap > 0) {
-			memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
-			       gap + new_head + 1);
-			gap = -new_head;
+		if (distance + runlen >= 0) {
+			/*
+			 * Only update the state if packet has not been received
+			 * yet. This is OK as per the second table in RFC 4340,
+			 * 11.4.1; i.e. here we are using the following table:
+			 *                     RECEIVED
+			 *                      0   1   3
+			 *              S     +---+---+---+
+			 *              T   0 | 0 | 0 | 0 |
+			 *              O     +---+---+---+
+			 *              R   1 | 1 | 1 | 1 |
+			 *              E     +---+---+---+
+			 *              D   3 | 0 | 1 | 3 |
+			 *                    +---+---+---+
+			 * The "Not Received" state was set by reserve_seats().
+			 */
+			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+				av->av_buf[ptr] = state;
+			else
+				dccp_pr_debug("Not changing %llu state to %u\n",
+					      (unsigned long long)seqno, state);
+			break;
 		}
-		new_head += DCCP_MAX_ACKVEC_LEN;
-	}
 
-	av->av_buf_head = new_head;
+		distance += runlen + 1;
+		ptr	  = __ackvec_idx_add(ptr, 1);
 
-	if (gap > 0)
-		memset(av->av_buf + av->av_buf_head + 1,
-		       DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+	} while (ptr != av->av_buf_tail);
+}
 
-	av->av_buf[av->av_buf_head] = state;
-	av->av_vec_len += packets;
-	return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+	    len	  = DCCPAV_MAX_ACKVEC_LEN - start;
+
+	/* check for buffer wrap-around */
+	if (num > len) {
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+		start = 0;
+		num  -= len;
+	}
+	if (num)
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
 }
 
-/*
- * Implements the RFC 4340, Appendix A
+/**
+ * dccp_ackvec_add_new  -  Record one or more new entries in Ack Vector buffer
+ * @av:		 container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno:	 sequence number of the first packet in @num_packets
+ * @state:	 state in which packet carrying @seqno was received
  */
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-		    const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+				u64 seqno, enum dccp_ackvec_states state)
 {
-	/*
-	 * Check at the right places if the buffer is full, if it is, tell the
-	 * caller to start dropping packets till the HC-Sender acks our ACK
-	 * vectors, when we will free up space in av_buf.
-	 *
-	 * We may well decide to do buffer compression, etc, but for now lets
-	 * just drop.
-	 *
-	 * From Appendix A.1.1 (`New Packets'):
-	 *
-	 *	Of course, the circular buffer may overflow, either when the
-	 *	HC-Sender is sending data at a very high rate, when the
-	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
-	 *	or when the HC-Sender is forgetting to acknowledge those acks
-	 *	(so the HC-Receiver is unable to clean up old state). In this
-	 *	case, the HC-Receiver should either compress the buffer (by
-	 *	increasing run lengths when possible), transfer its state to
-	 *	a larger buffer, or, as a last resort, drop all received
-	 *	packets, without processing them whatsoever, until its buffer
-	 *	shrinks again.
-	 */
+	u32 num_cells = num_packets;
 
-	/* See if this is the first ackno being inserted */
-	if (av->av_vec_len == 0) {
-		av->av_buf[av->av_buf_head] = state;
-		av->av_vec_len = 1;
-	} else if (after48(ackno, av->av_buf_ackno)) {
-		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
+	if (num_packets > DCCPAV_BURST_THRESH) {
+		u32 lost_packets = num_packets - 1;
 
+		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
 		/*
-		 * Look if the state of this packet is the same as the
-		 * previous ackno and if so if we can bump the head len.
+		 * We received 1 packet and have a loss of size "num_packets-1"
+		 * which we squeeze into num_cells-1 rather than reserving an
+		 * entire byte for each lost packet.
+		 * The reason is that the vector grows in O(burst_length); when
+		 * it grows too large there will be no room left for the payload.
+		 * This is a trade-off: if a few packets out of the burst show
+		 * up later, their state will not be changed; it is simply too
+		 * costly to reshuffle/reallocate/copy the buffer each time.
+		 * Should such problems persist, we will need to switch to a
+		 * different underlying data structure.
 		 */
-		if (delta == 1 &&
-		    dccp_ackvec_state(av, av->av_buf_head) == state &&
-		    dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
-			av->av_buf[av->av_buf_head]++;
-		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
-			return -ENOBUFS;
-	} else {
-		/*
-		 * A.1.2. Old Packets
-		 *
-		 *	When a packet with Sequence Number S <= buf_ackno
-		 *	arrives, the HC-Receiver will scan the table for
-		 *	the byte corresponding to S. (Indexing structures
-		 *	could reduce the complexity of this scan.)
-		 */
-		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
-		u32 index = av->av_buf_head;
+		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+			u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
 
-		while (1) {
-			const u8 len = dccp_ackvec_len(av, index);
-			const u8 av_state = dccp_ackvec_state(av, index);
-			/*
-			 * valid packets not yet in av_buf have a reserved
-			 * entry, with a len equal to 0.
-			 */
-			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
-			    len == 0 && delta == 0) { /* Found our
-							 reserved seat! */
-				dccp_pr_debug("Found %llu reserved seat!\n",
-					      (unsigned long long)ackno);
-				av->av_buf[index] = state;
-				goto out;
-			}
-			/* len == 0 means one packet */
-			if (delta < len + 1)
-				goto out_duplicate;
-
-			delta -= len + 1;
-			if (++index == DCCP_MAX_ACKVEC_LEN)
-				index = 0;
+			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+			lost_packets -= len;
 		}
 	}
 
-	av->av_buf_ackno = ackno;
-	av->av_time = ktime_get_real();
-out:
-	return 0;
+	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+		av->av_overflow = true;
+	}
+
+	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+	if (av->av_overflow)
+		av->av_buf_tail = av->av_buf_head;
 
-out_duplicate:
-	/* Duplicate packet */
-	dccp_pr_debug("Received a dup or already considered lost "
-		      "packet: %llu\n", (unsigned long long)ackno);
-	return -EILSEQ;
+	av->av_buf[av->av_buf_head] = state;
+	av->av_buf_ackno	    = seqno;
+
+	if (num_packets > 1)
+		dccp_ackvec_reserve_seats(av, num_packets - 1);
 }
 
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
-				     struct dccp_ackvec_record *avr)
+/**
+ * dccp_ackvec_input  -  Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
 {
-	struct dccp_ackvec_record *next;
+	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
 
-	/* sort out vector length */
-	if (av->av_buf_head <= avr->avr_ack_ptr)
-		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
-	else
-		av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
-				 av->av_buf_head + avr->avr_ack_ptr;
+	if (dccp_ackvec_is_empty(av)) {
+		dccp_ackvec_add_new(av, 1, seqno, state);
+		av->av_tail_ackno = seqno;
 
-	/* free records */
-	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del_init(&avr->avr_node);
-		dccp_ackvec_record_delete(avr);
-	}
-}
+	} else {
+		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+		u8 *current_head = av->av_buf + av->av_buf_head;
 
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
-				 const u64 ackno)
-{
-	struct dccp_ackvec_record *avr;
+		if (num_packets == 1 &&
+		    dccp_ackvec_state(current_head) == state &&
+		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
 
-	/*
-	 * If we traverse backwards, it should be faster when we have large
-	 * windows. We will be receiving ACKs for stuff we sent a while back
-	 * -sorbo.
-	 */
-	list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
-		if (ackno == avr->avr_ack_seqno) {
-			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
-				      "ack_ackno=%llu, ACKED!\n",
-				      dccp_role(sk), 1,
-				      (unsigned long long)avr->avr_ack_seqno,
-				      (unsigned long long)avr->avr_ack_ackno);
-			dccp_ackvec_throw_record(av, avr);
-			break;
-		} else if (avr->avr_ack_seqno > ackno)
-			break; /* old news */
+			*current_head	+= 1;
+			av->av_buf_ackno = seqno;
+
+		} else if (num_packets > 0) {
+			dccp_ackvec_add_new(av, num_packets, seqno, state);
+		} else {
+			dccp_ackvec_update_old(av, num_packets, seqno, state);
+		}
 	}
 }
 
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
-					    struct sock *sk, u64 *ackno,
-					    const unsigned char len,
-					    const unsigned char *vector)
+/**
+ * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
 {
-	unsigned char i;
-	struct dccp_ackvec_record *avr;
+	struct dccp_ackvec_record *avr, *next;
+	u8 runlen_now, eff_runlen;
+	s64 delta;
 
-	/* Check if we actually sent an ACK vector */
-	if (list_empty(&av->av_records))
+	avr = dccp_ackvec_lookup(&av->av_records, ackno);
+	if (avr == NULL)
 		return;
+	/*
+	 * Deal with outdated acknowledgments: this arises when e.g. there are
+	 * several old records and the acks from the peer come in slowly. In
+	 * that case we may still have records that pre-date tail_ackno.
+	 */
+	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+	if (delta < 0)
+		goto free_records;
+	/*
+	 * Deal with overlapping Ack Vectors: don't subtract more than the
+	 * number of packets between tail_ackno and ack_ackno.
+	 */
+	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
 
-	i = len;
+	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
 	/*
-	 * XXX
-	 * I think it might be more efficient to work backwards. See comment on
-	 * rcv_ackno. -sorbo.
+	 * The run length of Ack Vector cells does not decrease over time. If
+	 * the run length is the same as at the time the Ack Vector was sent, we
+	 * free the ack_ptr cell. That cell can however not be freed if the run
+	 * length has increased: in this case we need to move the tail pointer
+	 * backwards (towards higher indices), to its next-oldest neighbour.
 	 */
-	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
-	while (i--) {
-		const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-		u64 ackno_end_rl;
+	if (runlen_now > eff_runlen) {
 
-		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
+		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
 
+		/* This move may not have cleared the overflow flag. */
+		if (av->av_overflow)
+			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+	} else {
+		av->av_buf_tail = avr->avr_ack_ptr;
 		/*
-		 * If our AVR sequence number is greater than the ack, go
-		 * forward in the AVR list until it is not so.
+		 * We have made sure that avr points to a valid cell within the
+		 * buffer. This cell is either older than head, or equals head
+		 * (empty buffer): in both cases we no longer have any overflow.
 		 */
-		list_for_each_entry_from(avr, &av->av_records, avr_node) {
-			if (!after48(avr->avr_ack_seqno, *ackno))
-				goto found;
-		}
-		/* End of the av_records list, not found, exit */
-		break;
-found:
-		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
-			const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
-			if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
-				dccp_pr_debug("%s ACK vector 0, len=%d, "
-					      "ack_seqno=%llu, ack_ackno=%llu, "
-					      "ACKED!\n",
-					      dccp_role(sk), len,
-					      (unsigned long long)
-					      avr->avr_ack_seqno,
-					      (unsigned long long)
-					      avr->avr_ack_ackno);
-				dccp_ackvec_throw_record(av, avr);
-				break;
-			}
-			/*
-			 * If it wasn't received, continue scanning... we might
-			 * find another one.
-			 */
-		}
+		av->av_overflow = 0;
+	}
 
-		dccp_set_seqno(ackno, ackno_end_rl - 1);
-		++vector;
+	/*
+	 * The peer has acknowledged up to and including ack_ackno. Hence the
+	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+	 */
+	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
 	}
 }
 
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
 {
-	if (len > DCCP_SINGLE_OPT_MAXLEN)
-		return -1;
+	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+	if (new == NULL)
+		return -ENOBUFS;
+	new->vec   = vec;
+	new->len   = len;
+	new->nonce = nonce;
 
-	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
-	dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
-					ackno, len, value);
+	list_add_tail(&new->node, head);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+	struct dccp_ackvec_parsed *cur, *next;
+
+	list_for_each_entry_safe(cur, next, parsed_chunks, node)
+		kfree(cur);
+	INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
 
 int __init dccp_ackvec_init(void)
 {
@@ -448,10 +379,9 @@ int __init dccp_ackvec_init(void)
 	if (dccp_ackvec_slab == NULL)
 		goto out_err;
 
-	dccp_ackvec_record_slab =
-			kmem_cache_create("dccp_ackvec_record",
-					  sizeof(struct dccp_ackvec_record),
-					  0, SLAB_HWCACHE_ALIGN, NULL);
+	dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+					     sizeof(struct dccp_ackvec_record),
+					     0, SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_record_slab == NULL)
 		goto out_destroy_slab;
 
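A note on the modulo index arithmetic introduced above: since the live region of av_buf grows downwards from tail to head, buffer length is computed as (tail - head) mod DCCPAV_MAX_ACKVEC_LEN. The following stand-alone user-space sketch exercises the same math; the buffer-size constant mirrors DCCPAV_MAX_ACKVEC_LEN from ackvec.h (2 * DCCP_SINGLE_OPT_MAXLEN = 2 * 253), everything else is illustrative only and not kernel code.

#include <stdio.h>

#define DCCPAV_MAX_ACKVEC_LEN (2 * 253)

/* mirrors __ackvec_idx_add() */
static unsigned short idx_add(unsigned short a, unsigned short b)
{
	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
}

/* mirrors __ackvec_idx_sub(): subtraction via the modular complement */
static unsigned short idx_sub(unsigned short a, unsigned short b)
{
	return idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
}

int main(void)
{
	unsigned short head = 3, tail = 500;

	/* head is `before' tail: buflen = (tail - head) mod bufsize */
	printf("buflen   = %u\n", (unsigned)idx_sub(tail, head)); /* 497 */

	/* registering 4 packets moves head 4 cells down, wrapping at 0 */
	head = idx_sub(head, 4);
	printf("new head = %u\n", (unsigned)head);                /* 505 */
	return 0;
}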
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 7ea557b7c6b1..e2ab0627a5ff 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,9 +3,9 @@
 /*
  *  net/dccp/ackvec.h
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
  *	This program is free software; you can redistribute it and/or modify it
  *	under the terms of the GNU General Public License version 2 as
  *	published by the Free Software Foundation.
@@ -13,99 +13,124 @@
 
 #include <linux/dccp.h>
 #include <linux/compiler.h>
-#include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS	2
+#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
 
 /* Estimated minimum average Ack Vector length - used for updating MPS */
 #define DCCPAV_MIN_OPTLEN	16
 
-#define DCCP_ACKVEC_STATE_RECEIVED	0
-#define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED	(3 << 6)
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
 
-#define DCCP_ACKVEC_STATE_MASK		0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK		0x3F /* 00111111 */
+enum dccp_ackvec_states {
+	DCCPAV_RECEIVED =	0x00,
+	DCCPAV_ECN_MARKED =	0x40,
+	DCCPAV_RESERVED =	0x80,
+	DCCPAV_NOT_RECEIVED =	0xC0
+};
+#define DCCPAV_MAX_RUNLEN	0x3F
 
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @av_buf_head - circular buffer head
- * @av_buf_tail - circular buffer tail
- * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
- *		       buffer (i.e. %av_buf_head)
- * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- *		       by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+	return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+	return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/** struct dccp_ackvec - Ack Vector main data structure
  *
- * @av_records - list of dccp_ackvec_record
- * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
  *
- * @av_time - the time in usecs
- * @av_buf - circular buffer of acknowledgeable packets
+ * @av_buf:	   circular buffer storage area
+ * @av_buf_head:   head index; begin of live portion in @av_buf
+ * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
+ *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
  */
 struct dccp_ackvec {
-	u64			av_buf_ackno;
-	struct list_head	av_records;
-	ktime_t			av_time;
+	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
 	u16			av_buf_head;
-	u16			av_vec_len;
-	u8			av_buf_nonce;
-	u8			av_ack_nonce;
-	u8			av_buf[DCCP_MAX_ACKVEC_LEN];
+	u16			av_buf_tail;
+	u64			av_buf_ackno:48;
+	u64			av_tail_ackno:48;
+	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	u8			av_overflow:1;
+	struct list_head	av_records;
 };
 
-/** struct dccp_ackvec_record - ack vector record
+/** struct dccp_ackvec_record - Records information about sent Ack Vectors
  *
- * ACK vector record as defined in Appendix A of spec.
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
  *
- * The list is sorted by avr_ack_seqno
+ * @avr_node:	    the list node in @av_records
+ * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr:    pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
  *
- * @avr_node - node in av_records
- * @avr_ack_seqno - sequence number of the packet this record was sent on
- * @avr_ack_ackno - sequence number being acknowledged
- * @avr_ack_ptr - pointer into av_buf where this record starts
- * @avr_ack_nonce - av_ack_nonce at the time this record was sent
- * @avr_sent_len - lenght of the record in av_buf
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
  */
 struct dccp_ackvec_record {
 	struct list_head avr_node;
-	u64		 avr_ack_seqno;
-	u64		 avr_ack_ackno;
+	u64		 avr_ack_seqno:48;
+	u64		 avr_ack_ackno:48;
 	u16		 avr_ack_ptr;
-	u16		 avr_sent_len;
-	u8		 avr_ack_nonce;
+	u8		 avr_ack_runlen;
+	u8		 avr_ack_nonce:1;
 };
 
-struct sock;
-struct sk_buff;
-
 extern int dccp_ackvec_init(void);
 extern void dccp_ackvec_exit(void);
 
 extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
 extern void dccp_ackvec_free(struct dccp_ackvec *av);
 
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
-			   const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
-					struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
-			     u64 *ackno, const u8 opt,
-			     const u8 *value, const u8 len);
+extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
 
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
-
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
-	return av->av_vec_len;
+	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
+
+/**
+ * struct dccp_ackvec_parsed  -  Record offsets of Ack Vectors in skb
+ * @vec:	start of vector (offset into skb)
+ * @len:	length of @vec
+ * @nonce:	whether @vec had an ECN nonce of 0 or 1
+ * @node:	FIFO - arranged in descending order of ack_ackno
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+	u8		 *vec,
+			 len,
+			 nonce:1;
+	struct list_head node;
+};
+
+extern int dccp_ackvec_parsed_add(struct list_head *head,
+				  u8 *vec, u8 len, u8 nonce);
+extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
 #endif /* _ACKVEC_H */
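Each buffer cell packs a 2-bit state into the upper bits and a 6-bit run length into the lower bits, which is what dccp_ackvec_runlen() and dccp_ackvec_state() above decode. A minimal user-space sketch of that encoding follows; the constants are copied from the header, while cell_pack() is a local helper for illustration only, not a kernel function.

#include <assert.h>

#define DCCPAV_RECEIVED     0x00
#define DCCPAV_ECN_MARKED   0x40
#define DCCPAV_NOT_RECEIVED 0xC0
#define DCCPAV_MAX_RUNLEN   0x3F

/* pack a state (top 2 bits) and a run length (bottom 6 bits) into one cell */
static unsigned char cell_pack(unsigned char state, unsigned char runlen)
{
	return state | (runlen & DCCPAV_MAX_RUNLEN);
}

int main(void)
{
	/* "Received; this cell covers 1 + 5 = 6 consecutive packets" */
	unsigned char cell = cell_pack(DCCPAV_RECEIVED, 5);

	assert((cell & DCCPAV_MAX_RUNLEN) == 5);                 /* runlen */
	assert((cell & ~DCCPAV_MAX_RUNLEN) == DCCPAV_RECEIVED);  /* state  */
	return 0;
}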
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6df6f8ac9636..75c3582a7678 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -62,22 +62,18 @@ struct ccid_operations {
 	void		(*ccid_hc_tx_exit)(struct sock *sk);
 	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
 						     struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
 						  struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
-						  int more, unsigned int len);
+						  unsigned int len);
 	void		(*ccid_hc_rx_get_info)(struct sock *sk,
 					       struct tcp_info *info);
 	void		(*ccid_hc_tx_get_info)(struct sock *sk,
@@ -138,20 +134,48 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
 
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ *  - timer-based pacing    (CCID returns a delay value in milliseconds);
+ *  - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
+	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs */
+	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
+	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
+	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+	if (return_code < 0)
+		return CCID_PACKET_ERR;
+	if (return_code == 0)
+		return CCID_PACKET_SEND_AT_ONCE;
+	if (return_code <= CCID_PACKET_DELAY_MAX)
+		return CCID_PACKET_DELAY;
+	return return_code;
+}
+
 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 					 struct sk_buff *skb)
 {
-	int rc = 0;
 	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
-	return rc;
+		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
-					  int more, unsigned int len)
+					  unsigned int len)
 {
 	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
 }
 
 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -168,27 +192,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
 }
 
+/**
+ * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
-						    value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
 }
 
+/**
+ * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
 }
 
 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
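ccid_packet_dequeue_eval() folds the three dequeueing modes into one return-code convention: a negative value is an error, 0 means send at once, 1..0xFFFF is a pacing delay in milliseconds, and the larger constants are mode flags. The following user-space sketch shows how a caller could act on it; the enum and helper are copied from the header above, while main() is illustrative only (the real dispatch presumably lives in dccp_write_xmit() in net/dccp/output.c, which this patch also touches).

#include <stdio.h>

enum ccid_dequeueing_decision {
	CCID_PACKET_SEND_AT_ONCE       = 0x00000,
	CCID_PACKET_DELAY_MAX          = 0x0FFFF,
	CCID_PACKET_DELAY              = 0x10000,
	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,
	CCID_PACKET_ERR                = 0xF0000,
};

static int ccid_packet_dequeue_eval(const int return_code)
{
	if (return_code < 0)
		return CCID_PACKET_ERR;
	if (return_code == 0)
		return CCID_PACKET_SEND_AT_ONCE;
	if (return_code <= CCID_PACKET_DELAY_MAX)
		return CCID_PACKET_DELAY;
	return return_code;
}

int main(void)
{
	int rc = 250;	/* e.g. a CCID asking for a 250 ms pacing delay */

	switch (ccid_packet_dequeue_eval(rc)) {
	case CCID_PACKET_SEND_AT_ONCE:
		printf("transmit immediately\n");
		break;
	case CCID_PACKET_DELAY:
		printf("wait %d ms, then try again\n", rc);
		break;
	case CCID_PACKET_WILL_DEQUEUE_LATER:
		printf("leave queued; the CCID schedules dccps_xmitlet\n");
		break;
	default:	/* CCID_PACKET_ERR */
		printf("error: drop or requeue the packet\n");
		break;
	}
	return 0;
}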
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 8408398cd44e..0581143cb800 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG
 
 	  If in doubt, say N.
 
-config IP_DCCP_CCID3_RTO
-	  int "Use higher bound for nofeedback timer"
-	  default 100
-	  depends on IP_DCCP_CCID3 && EXPERIMENTAL
-	  ---help---
-	    Use higher lower bound for nofeedback timer expiration.
-
-	    The TFRC nofeedback timer normally expires after the maximum of 4
-	    RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
-	    with a small RTT this can mean a high processing load and reduced
-	    performance, since then the nofeedback timer is triggered very
-	    frequently.
-
-	    This option enables to set a higher lower bound for the nofeedback
-	    value. Values in units of milliseconds can be set here.
-
-	    A value of 0 disables this feature by enforcing the value specified
-	    in RFC 3448. The following values have been suggested as bounds for
-	    experimental use:
-		* 16-20ms to match the typical multimedia inter-frame interval
-		* 100ms as a reasonable compromise [default]
-		* 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
-
-	    The default of 100ms is a compromise between a large value for
-	    efficient DCCP implementations, and a small value to avoid disrupting
-	    the network in times of congestion.
-
-	    The purpose of the nofeedback timer is to slow DCCP down when there
-	    is serious network congestion: experimenting with larger values should
-	    therefore not be performed on WANs.
-
 config IP_DCCP_TFRC_LIB
 	def_bool y if IP_DCCP_CCID3
 
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9b3ae9922be1..fadecd20d75b 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,59 +25,14 @@
  */
 #include <linux/slab.h>
 #include "../feat.h"
-#include "../ccid.h"
-#include "../dccp.h"
 #include "ccid2.h"
 
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc)
-{
-	int len = 0;
-	int pipe = 0;
-	struct ccid2_seq *seqp = hc->tx_seqh;
-
-	/* there is data in the chain */
-	if (seqp != hc->tx_seqt) {
-		seqp = seqp->ccid2s_prev;
-		len++;
-		if (!seqp->ccid2s_acked)
-			pipe++;
-
-		while (seqp != hc->tx_seqt) {
-			struct ccid2_seq *prev = seqp->ccid2s_prev;
-
-			len++;
-			if (!prev->ccid2s_acked)
-				pipe++;
-
-			/* packets are sent sequentially */
-			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
-						prev->ccid2s_seq ) >= 0);
-			BUG_ON(time_before(seqp->ccid2s_sent,
-					   prev->ccid2s_sent));
-
-			seqp = prev;
-		}
-	}
-
-	BUG_ON(pipe != hc->tx_pipe);
-	ccid2_pr_debug("len of chain=%d\n", len);
-
-	do {
-		seqp = seqp->ccid2s_prev;
-		len++;
-	} while (seqp != hc->tx_seqh);
-
-	ccid2_pr_debug("total len=%d\n", len);
-	BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN);
-}
 #else
 #define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hc)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
@@ -123,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	if (hc->tx_pipe < hc->tx_cwnd)
-		return 0;
-
-	return 1; /* XXX CCID should dequeue when ready instead of polling */
+	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+		return CCID_PACKET_WILL_DEQUEUE_LATER;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
@@ -156,19 +108,11 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
-{
-	ccid2_pr_debug("change SRTT to %ld\n", val);
-	hc->tx_srtt = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-	long s;
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -178,23 +122,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
-	ccid2_hc_tx_check_sanity(hc);
-
 	/* back-off timer */
 	hc->tx_rto <<= 1;
-
-	s = hc->tx_rto / HZ;
-	if (s > 60)
-		hc->tx_rto = 60 * HZ;
-
-	ccid2_start_rto_timer(sk);
+	if (hc->tx_rto > DCCP_RTO_MAX)
+		hc->tx_rto = DCCP_RTO_MAX;
 
 	/* adjust pipe, cwnd etc */
 	hc->tx_ssthresh = hc->tx_cwnd / 2;
 	if (hc->tx_ssthresh < 2)
 		hc->tx_ssthresh = 2;
 	hc->tx_cwnd	= 1;
 	hc->tx_pipe	= 0;
 
 	/* clear state about stuff we sent */
 	hc->tx_seqt = hc->tx_seqh;
@@ -204,23 +142,18 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	hc->tx_rpseq    = 0;
 	hc->tx_rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-	ccid2_hc_tx_check_sanity(hc);
+
+	/* if we were blocked before, we may now send cwnd=1 packet */
+	if (sender_was_blocked)
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	/* restart backed-off timer */
+	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_start_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto);
-
-	BUG_ON(timer_pending(&hc->tx_rtotimer));
-	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
@@ -230,7 +163,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 
 	hc->tx_seqh->ccid2s_seq   = dp->dccps_gss;
 	hc->tx_seqh->ccid2s_acked = 0;
-	hc->tx_seqh->ccid2s_sent  = jiffies;
+	hc->tx_seqh->ccid2s_sent  = ccid2_time_stamp;
 
 	next = hc->tx_seqh->ccid2s_next;
 	/* check if we need to alloc more space */
@@ -296,99 +229,104 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
296 } 229 }
297#endif 230#endif
298 231
299 /* setup RTO timer */ 232 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
300 if (!timer_pending(&hc->tx_rtotimer))
301 ccid2_start_rto_timer(sk);
302 233
303#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 234#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
304 do { 235 do {
305 struct ccid2_seq *seqp = hc->tx_seqt; 236 struct ccid2_seq *seqp = hc->tx_seqt;
306 237
307 while (seqp != hc->tx_seqh) { 238 while (seqp != hc->tx_seqh) {
308 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 239 ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
309 (unsigned long long)seqp->ccid2s_seq, 240 (unsigned long long)seqp->ccid2s_seq,
310 seqp->ccid2s_acked, seqp->ccid2s_sent); 241 seqp->ccid2s_acked, seqp->ccid2s_sent);
311 seqp = seqp->ccid2s_next; 242 seqp = seqp->ccid2s_next;
312 } 243 }
313 } while (0); 244 } while (0);
314 ccid2_pr_debug("=========\n"); 245 ccid2_pr_debug("=========\n");
315 ccid2_hc_tx_check_sanity(hc);
316#endif 246#endif
317} 247}
318 248
319/* XXX Lame code duplication! 249/**
320 * returns -1 if none was found. 250 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
321 * else returns the next offset to use in the function call. 251 * This code is almost identical with TCP's tcp_rtt_estimator(), since
252 * - it has a higher sampling frequency (recommended by RFC 1323),
253 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
254 * - it is simple (cf. more complex proposals such as Eifel timer or research
255 * which suggests that the gain should be set according to window size),
256 * - in tests it was found to work well with CCID2 [gerrit].
322 */ 257 */
323static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, 258static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
324 unsigned char **vec, unsigned char *veclen)
325{ 259{
326 const struct dccp_hdr *dh = dccp_hdr(skb); 260 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
327 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 261 long m = mrtt ? : 1;
328 unsigned char *opt_ptr; 262
329 const unsigned char *opt_end = (unsigned char *)dh + 263 if (hc->tx_srtt == 0) {
330 (dh->dccph_doff * 4); 264 /* First measurement m */
331 unsigned char opt, len; 265 hc->tx_srtt = m << 3;
332 unsigned char *value; 266 hc->tx_mdev = m << 1;
333 267
334 BUG_ON(offset < 0); 268 hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
335 options += offset; 269 hc->tx_rttvar = hc->tx_mdev_max;
336 opt_ptr = options; 270
337 if (opt_ptr >= opt_end) 271 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
338 return -1; 272 } else {
339 273 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
340 while (opt_ptr != opt_end) { 274 m -= (hc->tx_srtt >> 3);
341 opt = *opt_ptr++; 275 hc->tx_srtt += m;
342 len = 0; 276
343 value = NULL; 277 /* Similarly, update scaled mdev with regard to |m| */
344 278 if (m < 0) {
345 /* Check if this isn't a single byte option */ 279 m = -m;
346 if (opt > DCCPO_MAX_RESERVED) { 280 m -= (hc->tx_mdev >> 2);
347 if (opt_ptr == opt_end)
348 goto out_invalid_option;
349
350 len = *opt_ptr++;
351 if (len < 3)
352 goto out_invalid_option;
353 /* 281 /*
354 * Remove the type and len fields, leaving 282 * This neutralises RTO increase when RTT < SRTT - mdev
355 * just the value size 283 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
284 * in Linux TCP", USENIX 2002, pp. 49-62).
356 */ 285 */
357 len -= 2; 286 if (m > 0)
358 value = opt_ptr; 287 m >>= 3;
359 opt_ptr += len; 288 } else {
289 m -= (hc->tx_mdev >> 2);
290 }
291 hc->tx_mdev += m;
360 292
361 if (opt_ptr > opt_end) 293 if (hc->tx_mdev > hc->tx_mdev_max) {
362 goto out_invalid_option; 294 hc->tx_mdev_max = hc->tx_mdev;
295 if (hc->tx_mdev_max > hc->tx_rttvar)
296 hc->tx_rttvar = hc->tx_mdev_max;
363 } 297 }
364 298
365 switch (opt) { 299 /*
366 case DCCPO_ACK_VECTOR_0: 300 * Decay RTTVAR at most once per flight, exploiting that
367 case DCCPO_ACK_VECTOR_1: 301 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
368 *vec = value; 302 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
369 *veclen = len; 303 * GAR is a useful bound for FlightSize = pipe.
370 return offset + (opt_ptr - options); 304 * AWL is probably too low here, as it over-estimates pipe.
305 */
306 if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
307 if (hc->tx_mdev_max < hc->tx_rttvar)
308 hc->tx_rttvar -= (hc->tx_rttvar -
309 hc->tx_mdev_max) >> 2;
310 hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
311 hc->tx_mdev_max = tcp_rto_min(sk);
371 } 312 }
372 } 313 }
373 314
374 return -1; 315 /*
375 316 * Set RTO from SRTT and RTTVAR
376out_invalid_option: 317 * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
377 DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); 318 * This agrees with RFC 4341, 5:
378 return -1; 319 * "Because DCCP does not retransmit data, DCCP does not require
379} 320 * TCP's recommended minimum timeout of one second".
380 321 */
381static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) 322 hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
382{
383 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
384 323
385 sk_stop_timer(sk, &hc->tx_rtotimer); 324 if (hc->tx_rto > DCCP_RTO_MAX)
386 ccid2_pr_debug("deleted RTO timer\n"); 325 hc->tx_rto = DCCP_RTO_MAX;
387} 326}
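
The replacement estimator above is TCP's Van Jacobson filter in fixed-point form: tx_srtt holds 8*SRTT and tx_mdev holds 4*mdev, so the 1/8 and 1/4 gains reduce to shifts, and RTO = SRTT + 4*RTTVAR falls out as (srtt >> 3) + rttvar. A minimal userspace sketch of the core update, omitting the per-flight mdev_max/RTTVAR decay and the RTO-increase neutralisation that the patch also carries (field names are illustrative, not the kernel's):

    #include <assert.h>

    struct rtt_est {
        unsigned int srtt;   /* smoothed RTT, scaled by 2^3 */
        unsigned int mdev;   /* mean deviation, scaled by 2^2 */
        unsigned int rttvar; /* running max of mdev (decay omitted) */
        unsigned int rto;
    };

    static void rtt_sample(struct rtt_est *e, long m)
    {
        if (m <= 0)                  /* clock granularity floor */
            m = 1;
        if (e->srtt == 0) {          /* first measurement */
            e->srtt = m << 3;        /* SRTT = m */
            e->mdev = m << 1;        /* mdev = m/2, scaled by 4 */
            e->rttvar = e->mdev;
        } else {
            m -= (e->srtt >> 3);     /* m becomes the error term */
            e->srtt += m;            /* SRTT += error/8 */
            if (m < 0)
                m = -m;
            m -= (e->mdev >> 2);
            e->mdev += m;            /* mdev += (|error| - mdev)/4 */
            if (e->mdev > e->rttvar)
                e->rttvar = e->mdev;
        }
        /* RTO = SRTT + 4*mdev; rttvar already carries the factor 4 */
        e->rto = (e->srtt >> 3) + e->rttvar;
    }

    int main(void)
    {
        struct rtt_est e = { 0, 0, 0, 0 };

        rtt_sample(&e, 100);         /* SRTT=100, mdev=50: RTO=300 */
        assert(e.rto == 300);
        rtt_sample(&e, 100);         /* steady input: SRTT stays 100 */
        assert((e.srtt >> 3) == 100);
        return 0;
    }
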
388 327
389static inline void ccid2_new_ack(struct sock *sk, 328static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
390 struct ccid2_seq *seqp, 329 unsigned int *maxincr)
391 unsigned int *maxincr)
392{ 330{
393 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 331 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
394 332
@@ -402,93 +340,27 @@ static inline void ccid2_new_ack(struct sock *sk,
402 hc->tx_cwnd += 1; 340 hc->tx_cwnd += 1;
403 hc->tx_packets_acked = 0; 341 hc->tx_packets_acked = 0;
404 } 342 }
405 343 /*
406 /* update RTO */ 344 * FIXME: RTT is sampled several times per acknowledgment (for each
407 if (hc->tx_srtt == -1 || 345 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
408 time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { 346 * This causes the RTT to be over-estimated, since the older entries
409 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 347 * in the Ack Vector have earlier sending times.
410 int s; 348 * The cleanest solution is to not use the ccid2s_sent field at all
411 349 * and instead use DCCP timestamps: requires changes in other places.
412 /* first measurement */ 350 */
413 if (hc->tx_srtt == -1) { 351 ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
414 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
415 r, jiffies,
416 (unsigned long long)seqp->ccid2s_seq);
417 ccid2_change_srtt(hc, r);
418 hc->tx_rttvar = r >> 1;
419 } else {
420 /* RTTVAR */
421 long tmp = hc->tx_srtt - r;
422 long srtt;
423
424 if (tmp < 0)
425 tmp *= -1;
426
427 tmp >>= 2;
428 hc->tx_rttvar *= 3;
429 hc->tx_rttvar >>= 2;
430 hc->tx_rttvar += tmp;
431
432 /* SRTT */
433 srtt = hc->tx_srtt;
434 srtt *= 7;
435 srtt >>= 3;
436 tmp = r >> 3;
437 srtt += tmp;
438 ccid2_change_srtt(hc, srtt);
439 }
440 s = hc->tx_rttvar << 2;
441 /* clock granularity is 1 when based on jiffies */
442 if (!s)
443 s = 1;
444 hc->tx_rto = hc->tx_srtt + s;
445
446 /* must be at least a second */
447 s = hc->tx_rto / HZ;
448 /* DCCP doesn't require this [but I like it cuz my code sux] */
449#if 1
450 if (s < 1)
451 hc->tx_rto = HZ;
452#endif
453 /* max 60 seconds */
454 if (s > 60)
455 hc->tx_rto = HZ * 60;
456
457 hc->tx_lastrtt = jiffies;
458
459 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
460 hc->tx_srtt, hc->tx_rttvar,
461 hc->tx_rto, HZ, r);
462 }
463
464 /* we got a new ack, so re-start RTO timer */
465 ccid2_hc_tx_kill_rto_timer(sk);
466 ccid2_start_rto_timer(sk);
467}
468
469static void ccid2_hc_tx_dec_pipe(struct sock *sk)
470{
471 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
472
473 if (hc->tx_pipe == 0)
474 DCCP_BUG("pipe == 0");
475 else
476 hc->tx_pipe--;
477
478 if (hc->tx_pipe == 0)
479 ccid2_hc_tx_kill_rto_timer(sk);
480} 352}
481 353
482static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 354static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
483{ 355{
484 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 356 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
485 357
486 if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { 358 if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
487 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 359 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
488 return; 360 return;
489 } 361 }
490 362
491 hc->tx_last_cong = jiffies; 363 hc->tx_last_cong = ccid2_time_stamp;
492 364
493 hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; 365 hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
494 hc->tx_ssthresh = max(hc->tx_cwnd, 2U); 366 hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
@@ -498,19 +370,31 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
498 ccid2_change_l_ack_ratio(sk, hc->tx_cwnd); 370 ccid2_change_l_ack_ratio(sk, hc->tx_cwnd);
499} 371}
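
Note the switch from time_before() on jiffies to a signed 32-bit difference: with the new u32 ccid2_time_stamp clock, subtracting in unsigned arithmetic and reading the result as s32 yields a "before" test that survives counter wrap-around. A self-contained illustration (the helper name is ours, not the kernel's):

    #include <assert.h>
    #include <stdint.h>

    /* true if a is earlier than b, robust to u32 wrap-around */
    static int before32(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) < 0;
    }

    int main(void)
    {
        assert(before32(1, 2));
        /* 0xFFFFFFF0 was stamped just before the counter wrapped */
        assert(before32(0xFFFFFFF0u, 0x10u));
        assert(!before32(0x10u, 0xFFFFFFF0u));
        return 0;
    }

This works as long as the two timestamps are less than 2^31 ticks apart, which is comfortably true for per-RTT congestion events.
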
500 372
373static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
374 u8 option, u8 *optval, u8 optlen)
375{
376 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
377
378 switch (option) {
379 case DCCPO_ACK_VECTOR_0:
380 case DCCPO_ACK_VECTOR_1:
381 return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
382 option - DCCPO_ACK_VECTOR_0);
383 }
384 return 0;
385}
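
The new hook does no processing of its own: each Ack Vector chunk seen while parsing a packet is merely queued on tx_av_chunks, and packet_recv() below walks the list and frees it in one pass. A rough userspace model of that collect-now/process-later pattern, using a plain singly-linked list where the kernel uses struct list_head (all names here are illustrative; the real layout of dccp_ackvec_parsed lives in ackvec.h):

    #include <stdlib.h>
    #include <string.h>

    struct av_chunk {
        unsigned char *vec;      /* copy of the option payload */
        unsigned char len;
        struct av_chunk *next;
    };

    static int av_chunk_add(struct av_chunk **head,
                            const unsigned char *optval, unsigned char optlen)
    {
        struct av_chunk *c = malloc(sizeof(*c) + optlen);

        if (c == NULL)
            return -1;
        c->vec = (unsigned char *)(c + 1);
        memcpy(c->vec, optval, optlen);
        c->len  = optlen;
        c->next = *head;
        *head   = c;
        return 0;
    }

    static void av_chunks_cleanup(struct av_chunk **head)
    {
        while (*head != NULL) {
            struct av_chunk *c = *head;

            *head = c->next;
            free(c);
        }
    }

    int main(void)
    {
        struct av_chunk *head = NULL;
        unsigned char opt[] = { 0xc3 };

        if (av_chunk_add(&head, opt, sizeof(opt)) == 0)
            av_chunks_cleanup(&head);  /* packet fully processed */
        return 0;
    }
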
386
501static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 387static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
502{ 388{
503 struct dccp_sock *dp = dccp_sk(sk); 389 struct dccp_sock *dp = dccp_sk(sk);
504 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 390 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
391 const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
392 struct dccp_ackvec_parsed *avp;
505 u64 ackno, seqno; 393 u64 ackno, seqno;
506 struct ccid2_seq *seqp; 394 struct ccid2_seq *seqp;
507 unsigned char *vector;
508 unsigned char veclen;
509 int offset = 0;
510 int done = 0; 395 int done = 0;
511 unsigned int maxincr = 0; 396 unsigned int maxincr = 0;
512 397
513 ccid2_hc_tx_check_sanity(hc);
514 /* check reverse path congestion */ 398 /* check reverse path congestion */
515 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 399 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
516 400
@@ -541,17 +425,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
541 } 425 }
542 426
543 /* check forward path congestion */ 427 /* check forward path congestion */
544 /* still didn't send out new data packets */ 428 if (dccp_packet_without_ack(skb))
545 if (hc->tx_seqh == hc->tx_seqt)
546 return; 429 return;
547 430
548 switch (DCCP_SKB_CB(skb)->dccpd_type) { 431 /* still didn't send out new data packets */
549 case DCCP_PKT_ACK: 432 if (hc->tx_seqh == hc->tx_seqt)
550 case DCCP_PKT_DATAACK: 433 goto done;
551 break;
552 default:
553 return;
554 }
555 434
556 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 435 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
557 if (after48(ackno, hc->tx_high_ack)) 436 if (after48(ackno, hc->tx_high_ack))
@@ -575,16 +454,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
575 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); 454 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
576 455
577 /* go through all ack vectors */ 456 /* go through all ack vectors */
578 while ((offset = ccid2_ackvector(sk, skb, offset, 457 list_for_each_entry(avp, &hc->tx_av_chunks, node) {
579 &vector, &veclen)) != -1) {
580 /* go through this ack vector */ 458 /* go through this ack vector */
581 while (veclen--) { 459 for (; avp->len--; avp->vec++) {
582 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 460 u64 ackno_end_rl = SUB48(ackno,
583 u64 ackno_end_rl = SUB48(ackno, rl); 461 dccp_ackvec_runlen(avp->vec));
584 462
585 ccid2_pr_debug("ackvec start:%llu end:%llu\n", 463 ccid2_pr_debug("ackvec %llu |%u,%u|\n",
586 (unsigned long long)ackno, 464 (unsigned long long)ackno,
587 (unsigned long long)ackno_end_rl); 465 dccp_ackvec_state(avp->vec) >> 6,
466 dccp_ackvec_runlen(avp->vec));
588 /* if the seqno we are analyzing is larger than the 467 /* if the seqno we are analyzing is larger than the
589 * current ackno, then move towards the tail of our 468 * current ackno, then move towards the tail of our
590 * seqnos. 469 * seqnos.
@@ -603,24 +482,22 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
603 * run length 482 * run length
604 */ 483 */
605 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { 484 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
606 const u8 state = *vector & 485 const u8 state = dccp_ackvec_state(avp->vec);
607 DCCP_ACKVEC_STATE_MASK;
608 486
609 /* new packet received or marked */ 487 /* new packet received or marked */
610 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && 488 if (state != DCCPAV_NOT_RECEIVED &&
611 !seqp->ccid2s_acked) { 489 !seqp->ccid2s_acked) {
612 if (state == 490 if (state == DCCPAV_ECN_MARKED)
613 DCCP_ACKVEC_STATE_ECN_MARKED) {
614 ccid2_congestion_event(sk, 491 ccid2_congestion_event(sk,
615 seqp); 492 seqp);
616 } else 493 else
617 ccid2_new_ack(sk, seqp, 494 ccid2_new_ack(sk, seqp,
618 &maxincr); 495 &maxincr);
619 496
620 seqp->ccid2s_acked = 1; 497 seqp->ccid2s_acked = 1;
621 ccid2_pr_debug("Got ack for %llu\n", 498 ccid2_pr_debug("Got ack for %llu\n",
622 (unsigned long long)seqp->ccid2s_seq); 499 (unsigned long long)seqp->ccid2s_seq);
623 ccid2_hc_tx_dec_pipe(sk); 500 hc->tx_pipe--;
624 } 501 }
625 if (seqp == hc->tx_seqt) { 502 if (seqp == hc->tx_seqt) {
626 done = 1; 503 done = 1;
@@ -632,7 +509,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
632 break; 509 break;
633 510
634 ackno = SUB48(ackno_end_rl, 1); 511 ackno = SUB48(ackno_end_rl, 1);
635 vector++;
636 } 512 }
637 if (done) 513 if (done)
638 break; 514 break;
@@ -677,7 +553,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
677 * one ack vector. 553 * one ack vector.
678 */ 554 */
679 ccid2_congestion_event(sk, seqp); 555 ccid2_congestion_event(sk, seqp);
680 ccid2_hc_tx_dec_pipe(sk); 556 hc->tx_pipe--;
681 } 557 }
682 if (seqp == hc->tx_seqt) 558 if (seqp == hc->tx_seqt)
683 break; 559 break;
@@ -695,7 +571,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
695 hc->tx_seqt = hc->tx_seqt->ccid2s_next; 571 hc->tx_seqt = hc->tx_seqt->ccid2s_next;
696 } 572 }
697 573
698 ccid2_hc_tx_check_sanity(hc); 574 /* restart RTO timer if not all outstanding data has been acked */
575 if (hc->tx_pipe == 0)
576 sk_stop_timer(sk, &hc->tx_rtotimer);
577 else
578 sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
579done:
580 /* check if incoming Acks allow pending packets to be sent */
581 if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
582 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
583 dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
584}
585
586/*
587 * Convert RFC 3390 larger initial window into an equivalent number of packets.
588 * This is based on the numbers specified in RFC 5681, 3.1.
589 */
590static inline u32 rfc3390_bytes_to_packets(const u32 smss)
591{
592 return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
699} 593}
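
This folds RFC 3390's byte formula, IW = min(4*SMSS, max(2*SMSS, 4380)), into the segment counts of RFC 5681, 3.1. A few spot checks of the mapping:

    #include <assert.h>

    /* same expression as rfc3390_bytes_to_packets() above */
    static unsigned int iw_packets(unsigned int smss)
    {
        return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
    }

    int main(void)
    {
        assert(iw_packets(536)  == 4);  /* 4*536 = 2144 <= 4380 bytes */
        assert(iw_packets(1460) == 3);  /* Ethernet MSS: 3*1460 = 4380 */
        assert(iw_packets(4096) == 2);  /* large MSS: capped at 2 */
        return 0;
    }
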
700 594
701static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) 595static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -707,12 +601,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
707 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 601 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
708 hc->tx_ssthresh = ~0U; 602 hc->tx_ssthresh = ~0U;
709 603
710 /* 604 /* Use larger initial windows (RFC 4341, section 5). */
711 * RFC 4341, 5: "The cwnd parameter is initialized to at most four 605 hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
712 * packets for new connections, following the rules from [RFC3390]".
713 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
714 */
715 hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
716 606
717 /* Make sure that Ack Ratio is enabled and within bounds. */ 607 /* Make sure that Ack Ratio is enabled and within bounds. */
718 max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); 608 max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
@@ -723,15 +613,12 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
723 if (ccid2_hc_tx_alloc_seq(hc)) 613 if (ccid2_hc_tx_alloc_seq(hc))
724 return -ENOMEM; 614 return -ENOMEM;
725 615
726 hc->tx_rto = 3 * HZ; 616 hc->tx_rto = DCCP_TIMEOUT_INIT;
727 ccid2_change_srtt(hc, -1);
728 hc->tx_rttvar = -1;
729 hc->tx_rpdupack = -1; 617 hc->tx_rpdupack = -1;
730 hc->tx_last_cong = jiffies; 618 hc->tx_last_cong = ccid2_time_stamp;
731 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 619 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
732 (unsigned long)sk); 620 (unsigned long)sk);
733 621 INIT_LIST_HEAD(&hc->tx_av_chunks);
734 ccid2_hc_tx_check_sanity(hc);
735 return 0; 622 return 0;
736} 623}
737 624
@@ -740,7 +627,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
740 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 627 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
741 int i; 628 int i;
742 629
743 ccid2_hc_tx_kill_rto_timer(sk); 630 sk_stop_timer(sk, &hc->tx_rtotimer);
744 631
745 for (i = 0; i < hc->tx_seqbufc; i++) 632 for (i = 0; i < hc->tx_seqbufc; i++)
746 kfree(hc->tx_seqbuf[i]); 633 kfree(hc->tx_seqbuf[i]);
@@ -765,16 +652,17 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
765} 652}
766 653
767struct ccid_operations ccid2_ops = { 654struct ccid_operations ccid2_ops = {
768 .ccid_id = DCCPC_CCID2, 655 .ccid_id = DCCPC_CCID2,
769 .ccid_name = "TCP-like", 656 .ccid_name = "TCP-like",
770 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 657 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
771 .ccid_hc_tx_init = ccid2_hc_tx_init, 658 .ccid_hc_tx_init = ccid2_hc_tx_init,
772 .ccid_hc_tx_exit = ccid2_hc_tx_exit, 659 .ccid_hc_tx_exit = ccid2_hc_tx_exit,
773 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 660 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
774 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 661 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
775 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 662 .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
776 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 663 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
777 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 664 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
665 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
778}; 666};
779 667
780#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 668#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103bb..e9985dafc2c7 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -18,18 +18,23 @@
18#ifndef _DCCP_CCID2_H_ 18#ifndef _DCCP_CCID2_H_
19#define _DCCP_CCID2_H_ 19#define _DCCP_CCID2_H_
20 20
21#include <linux/dccp.h>
22#include <linux/timer.h> 21#include <linux/timer.h>
23#include <linux/types.h> 22#include <linux/types.h>
24#include "../ccid.h" 23#include "../ccid.h"
24#include "../dccp.h"
25
26/*
27 * CCID-2 timestamping faces the same issues as TCP timestamping.
28 * Hence we reuse/share as much of the code as possible.
29 */
30#define ccid2_time_stamp tcp_time_stamp
31
25/* NUMDUPACK parameter from RFC 4341, p. 6 */ 32/* NUMDUPACK parameter from RFC 4341, p. 6 */
26#define NUMDUPACK 3 33#define NUMDUPACK 3
27 34
28struct sock;
29
30struct ccid2_seq { 35struct ccid2_seq {
31 u64 ccid2s_seq; 36 u64 ccid2s_seq;
32 unsigned long ccid2s_sent; 37 u32 ccid2s_sent;
33 int ccid2s_acked; 38 int ccid2s_acked;
34 struct ccid2_seq *ccid2s_prev; 39 struct ccid2_seq *ccid2s_prev;
35 struct ccid2_seq *ccid2s_next; 40 struct ccid2_seq *ccid2s_next;
@@ -42,9 +47,15 @@ struct ccid2_seq {
42 * struct ccid2_hc_tx_sock - CCID2 TX half connection 47 * struct ccid2_hc_tx_sock - CCID2 TX half connection
43 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 48 * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
44 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 49 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
45 * @tx_lastrtt: time RTT was last measured 50 * @tx_srtt: smoothed RTT estimate, scaled by 2^3
51 * @tx_mdev: smoothed RTT variation, scaled by 2^2
52 * @tx_mdev_max: maximum of @mdev during one flight
53 * @tx_rttvar: moving average/maximum of @mdev_max
54 * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
55 * @tx_rtt_seq: to decay RTTVAR at most once per flight
46 * @tx_rpseq: last consecutive seqno 56 * @tx_rpseq: last consecutive seqno
47 * @tx_rpdupack: dupacks since rpseq 57 * @tx_rpdupack: dupacks since rpseq
58 * @tx_av_chunks: list of Ack Vectors received on current skb
48 */ 59 */
49struct ccid2_hc_tx_sock { 60struct ccid2_hc_tx_sock {
50 u32 tx_cwnd; 61 u32 tx_cwnd;
@@ -55,17 +66,28 @@ struct ccid2_hc_tx_sock {
55 int tx_seqbufc; 66 int tx_seqbufc;
56 struct ccid2_seq *tx_seqh; 67 struct ccid2_seq *tx_seqh;
57 struct ccid2_seq *tx_seqt; 68 struct ccid2_seq *tx_seqt;
58 long tx_rto; 69
59 long tx_srtt; 70 /* RTT measurement: variables/principles are the same as in TCP */
60 long tx_rttvar; 71 u32 tx_srtt,
61 unsigned long tx_lastrtt; 72 tx_mdev,
73 tx_mdev_max,
74 tx_rttvar,
75 tx_rto;
76 u64 tx_rtt_seq:48;
62 struct timer_list tx_rtotimer; 77 struct timer_list tx_rtotimer;
78
63 u64 tx_rpseq; 79 u64 tx_rpseq;
64 int tx_rpdupack; 80 int tx_rpdupack;
65 unsigned long tx_last_cong; 81 u32 tx_last_cong;
66 u64 tx_high_ack; 82 u64 tx_high_ack;
83 struct list_head tx_av_chunks;
67}; 84};
68 85
86static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
87{
88 return hc->tx_pipe >= hc->tx_cwnd;
89}
90
69struct ccid2_hc_rx_sock { 91struct ccid2_hc_rx_sock {
70 int rx_data; 92 int rx_data;
71}; 93};
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 95f752986497..3d604e1349c0 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
54 [TFRC_SSTATE_NO_SENT] = "NO_SENT", 54 [TFRC_SSTATE_NO_SENT] = "NO_SENT",
55 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", 55 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
56 [TFRC_SSTATE_FBACK] = "FBACK", 56 [TFRC_SSTATE_FBACK] = "FBACK",
57 [TFRC_SSTATE_TERM] = "TERM",
58 }; 57 };
59 58
60 return ccid3_state_names[state]; 59 return ccid3_state_names[state];
@@ -91,19 +90,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
91 return scaled_div(w_init << 6, hc->tx_rtt); 90 return scaled_div(w_init << 6, hc->tx_rtt);
92} 91}
93 92
94/* 93/**
95 * Recalculate t_ipi and delta (should be called whenever X changes) 94 * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
95 * This respects the granularity of X_inst (64 * bytes/second).
96 */ 96 */
97static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) 97static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
98{ 98{
99 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
100 hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); 99 hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
101 100
102 /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ 101 ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
103 hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); 102 hc->tx_s, (unsigned)(hc->tx_x >> 6));
104
105 ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
106 hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
107} 103}
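
Since tx_x keeps the allowed rate in units of 64 bytes/second, the << 6 on s cancels that scaling, and, assuming scaled_div32(a, b) computes a * 10^6 / b as elsewhere in the TFRC library, t_ipi comes out directly in microseconds. A worked check under that assumption:

    #include <assert.h>
    #include <stdint.h>

    /* assumed behaviour of scaled_div32(): a * 10^6 / b */
    static uint32_t scaled_div32(uint64_t a, uint64_t b)
    {
        return (uint32_t)(a * 1000000 / b);
    }

    int main(void)
    {
        uint64_t x = 125000ULL << 6;  /* X = 125000 bytes/s, stored scaled */
        uint16_t s = 1000;            /* segment size in bytes */

        /* t_ipi = s/X seconds: 1000 B at 125 kB/s = 8 ms = 8000 us */
        assert(scaled_div32((uint64_t)s << 6, x) == 8000);
        return 0;
    }
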
108 104
109static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) 105static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -211,16 +207,19 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
211 ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, 207 ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
212 ccid3_tx_state_name(hc->tx_state)); 208 ccid3_tx_state_name(hc->tx_state));
213 209
210 /* Ignore and do not restart after leaving the established state */
211 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
212 goto out;
213
214 /* Reset feedback state to "no feedback received" */
214 if (hc->tx_state == TFRC_SSTATE_FBACK) 215 if (hc->tx_state == TFRC_SSTATE_FBACK)
215 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 216 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
216 else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
217 goto out;
218 217
219 /* 218 /*
220 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 219 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
220 * RTO is 0 if and only if no feedback has been received yet.
221 */ 221 */
222 if (hc->tx_t_rto == 0 || /* no feedback received yet */ 222 if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
223 hc->tx_p == 0) {
224 223
225 /* halve send rate directly */ 224 /* halve send rate directly */
226 hc->tx_x = max(hc->tx_x / 2, 225 hc->tx_x = max(hc->tx_x / 2,
@@ -256,7 +255,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
256 * Set new timeout for the nofeedback timer. 255 * Set new timeout for the nofeedback timer.
257 * See comments in packet_recv() regarding the value of t_RTO. 256 * See comments in packet_recv() regarding the value of t_RTO.
258 */ 257 */
259 if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ 258 if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
260 t_nfb = TFRC_INITIAL_TIMEOUT; 259 t_nfb = TFRC_INITIAL_TIMEOUT;
261 else 260 else
262 t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); 261 t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -269,11 +268,11 @@ out:
269 sock_put(sk); 268 sock_put(sk);
270} 269}
271 270
272/* 271/**
273 * returns 272 * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
274 * > 0: delay (in msecs) that should pass before actually sending 273 * @skb: next packet candidate to send on @sk
275 * = 0: can send immediately 274 * This function uses the convention of ccid_packet_dequeue_eval() and
276 * < 0: error condition; do not send packet 275 * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
277 */ 276 */
278static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 277static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
279{ 278{
@@ -290,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
290 if (unlikely(skb->len == 0)) 289 if (unlikely(skb->len == 0))
291 return -EBADMSG; 290 return -EBADMSG;
292 291
293 switch (hc->tx_state) { 292 if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
294 case TFRC_SSTATE_NO_SENT:
295 sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + 293 sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
296 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 294 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
297 hc->tx_last_win_count = 0; 295 hc->tx_last_win_count = 0;
@@ -326,27 +324,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
326 ccid3_update_send_interval(hc); 324 ccid3_update_send_interval(hc);
327 325
328 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 326 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
329 break; 327
330 case TFRC_SSTATE_NO_FBACK: 328 } else {
331 case TFRC_SSTATE_FBACK:
332 delay = ktime_us_delta(hc->tx_t_nom, now); 329 delay = ktime_us_delta(hc->tx_t_nom, now);
333 ccid3_pr_debug("delay=%ld\n", (long)delay); 330 ccid3_pr_debug("delay=%ld\n", (long)delay);
334 /* 331 /*
335 * Scheduling of packet transmissions [RFC 3448, 4.6] 332 * Scheduling of packet transmissions (RFC 5348, 8.3)
336 * 333 *
337 * if (t_now > t_nom - delta) 334 * if (t_now > t_nom - delta)
338 * // send the packet now 335 * // send the packet now
339 * else 336 * else
340 * // send the packet in (t_nom - t_now) milliseconds. 337 * // send the packet in (t_nom - t_now) milliseconds.
341 */ 338 */
342 if (delay - (s64)hc->tx_delta >= 1000) 339 if (delay >= TFRC_T_DELTA)
343 return (u32)delay / 1000L; 340 return (u32)delay / USEC_PER_MSEC;
344 341
345 ccid3_hc_tx_update_win_count(hc, now); 342 ccid3_hc_tx_update_win_count(hc, now);
346 break;
347 case TFRC_SSTATE_TERM:
348 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
349 return -EINVAL;
350 } 343 }
351 344
352 /* prepare to send now (add options etc.) */ 345 /* prepare to send now (add options etc.) */
@@ -355,11 +348,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
355 348
356 /* set the nominal send time for the next following packet */ 349 /* set the nominal send time for the next following packet */
357 hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); 350 hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
358 return 0; 351 return CCID_PACKET_SEND_AT_ONCE;
359} 352}
360 353
361static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 354static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
362 unsigned int len)
363{ 355{
364 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 356 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
365 357
@@ -372,48 +364,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
372static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 364static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
373{ 365{
374 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 366 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
375 struct ccid3_options_received *opt_recv; 367 struct tfrc_tx_hist_entry *acked;
376 ktime_t now; 368 ktime_t now;
377 unsigned long t_nfb; 369 unsigned long t_nfb;
378 u32 pinv, r_sample; 370 u32 r_sample;
379 371
380 /* we are only interested in ACKs */ 372 /* we are only interested in ACKs */
381 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 373 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
382 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 374 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
383 return; 375 return;
384 /* ... and only in the established state */
385 if (hc->tx_state != TFRC_SSTATE_FBACK &&
386 hc->tx_state != TFRC_SSTATE_NO_FBACK)
387 return;
388
389 opt_recv = &hc->tx_options_received;
390 now = ktime_get_real();
391
392 /* Estimate RTT from history if ACK number is valid */
393 r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
394 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
395 if (r_sample == 0) {
396 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
397 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
398 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
399 return;
400 }
401
402 /* Update receive rate in units of 64 * bytes/second */
403 hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
404 hc->tx_x_recv <<= 6;
405
406 /* Update loss event rate (which is scaled by 1e6) */
407 pinv = opt_recv->ccid3or_loss_event_rate;
408 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
409 hc->tx_p = 0;
410 else /* can not exceed 100% */
411 hc->tx_p = scaled_div(1, pinv);
412 /* 376 /*
413 * Validate new RTT sample and update moving average 377 * Locate the acknowledged packet in the TX history.
378 *
379 * Returning "entry not found" here can for instance happen when
380 * - the host has not sent out anything (e.g. a passive server),
381 * - the Ack is outdated (packet with higher Ack number was received),
382 * - it is a bogus Ack (for a packet not sent on this connection).
414 */ 383 */
415 r_sample = dccp_sample_rtt(sk, r_sample); 384 acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
385 if (acked == NULL)
386 return;
387 /* For the sake of RTT sampling, ignore/remove all older entries */
388 tfrc_tx_hist_purge(&acked->next);
389
390 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
391 now = ktime_get_real();
392 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
416 hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); 393 hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
394
417 /* 395 /*
418 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 396 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
419 */ 397 */
@@ -461,13 +439,12 @@ done_computing_x:
461 sk->sk_write_space(sk); 439 sk->sk_write_space(sk);
462 440
463 /* 441 /*
464 * Update timeout interval for the nofeedback timer. 442 * Update timeout interval for the nofeedback timer. In order to control
465 * We use a configuration option to increase the lower bound. 443 * rate halving on networks with very low RTTs (<= 1 ms), use per-route
466 * This can help avoid triggering the nofeedback timer too 444 * tunable RTAX_RTO_MIN value as the lower bound.
467 * often ('spinning') on LANs with small RTTs.
468 */ 445 */
469 hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * 446 hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
470 (USEC_PER_SEC / 1000))); 447 USEC_PER_SEC/HZ * tcp_rto_min(sk));
471 /* 448 /*
472 * Schedule no feedback timer to expire in 449 * Schedule no feedback timer to expire in
473 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 450 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
@@ -482,66 +459,41 @@ done_computing_x:
482 jiffies + usecs_to_jiffies(t_nfb)); 459 jiffies + usecs_to_jiffies(t_nfb));
483} 460}
484 461
485static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 462static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
486 unsigned char len, u16 idx, 463 u8 option, u8 *optval, u8 optlen)
487 unsigned char *value)
488{ 464{
489 int rc = 0;
490 const struct dccp_sock *dp = dccp_sk(sk);
491 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 465 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
492 struct ccid3_options_received *opt_recv;
493 __be32 opt_val; 466 __be32 opt_val;
494 467
495 opt_recv = &hc->tx_options_received;
496
497 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
498 opt_recv->ccid3or_seqno = dp->dccps_gsr;
499 opt_recv->ccid3or_loss_event_rate = ~0;
500 opt_recv->ccid3or_loss_intervals_idx = 0;
501 opt_recv->ccid3or_loss_intervals_len = 0;
502 opt_recv->ccid3or_receive_rate = 0;
503 }
504
505 switch (option) { 468 switch (option) {
469 case TFRC_OPT_RECEIVE_RATE:
506 case TFRC_OPT_LOSS_EVENT_RATE: 470 case TFRC_OPT_LOSS_EVENT_RATE:
507 if (unlikely(len != 4)) { 471 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
508 DCCP_WARN("%s(%p), invalid len %d " 472 if (packet_type == DCCP_PKT_DATA)
509 "for TFRC_OPT_LOSS_EVENT_RATE\n", 473 break;
510 dccp_role(sk), sk, len); 474 if (unlikely(optlen != 4)) {
511 rc = -EINVAL; 475 DCCP_WARN("%s(%p), invalid len %d for %u\n",
512 } else { 476 dccp_role(sk), sk, optlen, option);
513 opt_val = get_unaligned((__be32 *)value); 477 return -EINVAL;
514 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
515 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
516 dccp_role(sk), sk,
517 opt_recv->ccid3or_loss_event_rate);
518 } 478 }
519 break; 479 opt_val = ntohl(get_unaligned((__be32 *)optval));
520 case TFRC_OPT_LOSS_INTERVALS: 480
521 opt_recv->ccid3or_loss_intervals_idx = idx; 481 if (option == TFRC_OPT_RECEIVE_RATE) {
522 opt_recv->ccid3or_loss_intervals_len = len; 482 /* Receive Rate is kept in units of 64 bytes/second */
523 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 483 hc->tx_x_recv = opt_val;
524 dccp_role(sk), sk, 484 hc->tx_x_recv <<= 6;
525 opt_recv->ccid3or_loss_intervals_idx, 485
526 opt_recv->ccid3or_loss_intervals_len);
527 break;
528 case TFRC_OPT_RECEIVE_RATE:
529 if (unlikely(len != 4)) {
530 DCCP_WARN("%s(%p), invalid len %d "
531 "for TFRC_OPT_RECEIVE_RATE\n",
532 dccp_role(sk), sk, len);
533 rc = -EINVAL;
534 } else {
535 opt_val = get_unaligned((__be32 *)value);
536 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
537 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 486 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
538 dccp_role(sk), sk, 487 dccp_role(sk), sk, opt_val);
539 opt_recv->ccid3or_receive_rate); 488 } else {
489 /* Update the fixpoint Loss Event Rate fraction */
490 hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
491
492 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
493 dccp_role(sk), sk, opt_val);
540 } 494 }
541 break;
542 } 495 }
543 496 return 0;
544 return rc;
545} 497}
546 498
547static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 499static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -559,42 +511,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
559{ 511{
560 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); 512 struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
561 513
562 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
563 sk_stop_timer(sk, &hc->tx_no_feedback_timer); 514 sk_stop_timer(sk, &hc->tx_no_feedback_timer);
564
565 tfrc_tx_hist_purge(&hc->tx_hist); 515 tfrc_tx_hist_purge(&hc->tx_hist);
566} 516}
567 517
568static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 518static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
569{ 519{
570 struct ccid3_hc_tx_sock *hc; 520 info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
571 521 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
572 /* Listen socks doesn't have a private CCID block */
573 if (sk->sk_state == DCCP_LISTEN)
574 return;
575
576 hc = ccid3_hc_tx_sk(sk);
577 info->tcpi_rto = hc->tx_t_rto;
578 info->tcpi_rtt = hc->tx_rtt;
579} 522}
580 523
581static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 524static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
582 u32 __user *optval, int __user *optlen) 525 u32 __user *optval, int __user *optlen)
583{ 526{
584 const struct ccid3_hc_tx_sock *hc; 527 const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
528 struct tfrc_tx_info tfrc;
585 const void *val; 529 const void *val;
586 530
587 /* Listen socks doesn't have a private CCID block */
588 if (sk->sk_state == DCCP_LISTEN)
589 return -EINVAL;
590
591 hc = ccid3_hc_tx_sk(sk);
592 switch (optname) { 531 switch (optname) {
593 case DCCP_SOCKOPT_CCID_TX_INFO: 532 case DCCP_SOCKOPT_CCID_TX_INFO:
594 if (len < sizeof(hc->tx_tfrc)) 533 if (len < sizeof(tfrc))
595 return -EINVAL; 534 return -EINVAL;
596 len = sizeof(hc->tx_tfrc); 535 tfrc.tfrctx_x = hc->tx_x;
597 val = &hc->tx_tfrc; 536 tfrc.tfrctx_x_recv = hc->tx_x_recv;
537 tfrc.tfrctx_x_calc = hc->tx_x_calc;
538 tfrc.tfrctx_rtt = hc->tx_rtt;
539 tfrc.tfrctx_p = hc->tx_p;
540 tfrc.tfrctx_rto = hc->tx_t_rto;
541 tfrc.tfrctx_ipi = hc->tx_t_ipi;
542 len = sizeof(tfrc);
543 val = &tfrc;
598 break; 544 break;
599 default: 545 default:
600 return -ENOPROTOOPT; 546 return -ENOPROTOOPT;
@@ -624,7 +570,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
624 static const char *const ccid3_rx_state_names[] = { 570 static const char *const ccid3_rx_state_names[] = {
625 [TFRC_RSTATE_NO_DATA] = "NO_DATA", 571 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
626 [TFRC_RSTATE_DATA] = "DATA", 572 [TFRC_RSTATE_DATA] = "DATA",
627 [TFRC_RSTATE_TERM] = "TERM",
628 }; 573 };
629 574
630 return ccid3_rx_state_names[state]; 575 return ccid3_rx_state_names[state];
@@ -650,14 +595,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
650{ 595{
651 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 596 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
652 struct dccp_sock *dp = dccp_sk(sk); 597 struct dccp_sock *dp = dccp_sk(sk);
653 ktime_t now; 598 ktime_t now = ktime_get_real();
654 s64 delta = 0; 599 s64 delta = 0;
655 600
656 if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
657 return;
658
659 now = ktime_get_real();
660
661 switch (fbtype) { 601 switch (fbtype) {
662 case CCID3_FBACK_INITIAL: 602 case CCID3_FBACK_INITIAL:
663 hc->rx_x_recv = 0; 603 hc->rx_x_recv = 0;
@@ -701,14 +641,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
701 641
702static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 642static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
703{ 643{
704 const struct ccid3_hc_rx_sock *hc; 644 const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
705 __be32 x_recv, pinv; 645 __be32 x_recv, pinv;
706 646
707 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 647 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
708 return 0; 648 return 0;
709 649
710 hc = ccid3_hc_rx_sk(sk);
711
712 if (dccp_packet_without_ack(skb)) 650 if (dccp_packet_without_ack(skb))
713 return 0; 651 return 0;
714 652
@@ -749,10 +687,11 @@ static u32 ccid3_first_li(struct sock *sk)
749 x_recv = scaled_div32(hc->rx_bytes_recv, delta); 687 x_recv = scaled_div32(hc->rx_bytes_recv, delta);
750 if (x_recv == 0) { /* would also trigger divide-by-zero */ 688 if (x_recv == 0) { /* would also trigger divide-by-zero */
751 DCCP_WARN("X_recv==0\n"); 689 DCCP_WARN("X_recv==0\n");
752 if ((x_recv = hc->rx_x_recv) == 0) { 690 if (hc->rx_x_recv == 0) {
753 DCCP_BUG("stored value of X_recv is zero"); 691 DCCP_BUG("stored value of X_recv is zero");
754 return ~0U; 692 return ~0U;
755 } 693 }
694 x_recv = hc->rx_x_recv;
756 } 695 }
757 696
758 fval = scaled_div(hc->rx_s, hc->rx_rtt); 697 fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +801,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
862{ 801{
863 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); 802 struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
864 803
865 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
866
867 tfrc_rx_hist_purge(&hc->rx_hist); 804 tfrc_rx_hist_purge(&hc->rx_hist);
868 tfrc_lh_cleanup(&hc->rx_li_hist); 805 tfrc_lh_cleanup(&hc->rx_li_hist);
869} 806}
870 807
871static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 808static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
872{ 809{
873 const struct ccid3_hc_rx_sock *hc; 810 info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
874
875 /* Listen socks doesn't have a private CCID block */
876 if (sk->sk_state == DCCP_LISTEN)
877 return;
878
879 hc = ccid3_hc_rx_sk(sk);
880 info->tcpi_ca_state = hc->rx_state;
881 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 811 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
882 info->tcpi_rcv_rtt = hc->rx_rtt; 812 info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
883} 813}
884 814
885static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 815static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
886 u32 __user *optval, int __user *optlen) 816 u32 __user *optval, int __user *optlen)
887{ 817{
888 const struct ccid3_hc_rx_sock *hc; 818 const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
889 struct tfrc_rx_info rx_info; 819 struct tfrc_rx_info rx_info;
890 const void *val; 820 const void *val;
891 821
892 /* Listen socks doesn't have a private CCID block */
893 if (sk->sk_state == DCCP_LISTEN)
894 return -EINVAL;
895
896 hc = ccid3_hc_rx_sk(sk);
897 switch (optname) { 822 switch (optname) {
898 case DCCP_SOCKOPT_CCID_RX_INFO: 823 case DCCP_SOCKOPT_CCID_RX_INFO:
899 if (len < sizeof(rx_info)) 824 if (len < sizeof(rx_info))
900 return -EINVAL; 825 return -EINVAL;
901 rx_info.tfrcrx_x_recv = hc->rx_x_recv; 826 rx_info.tfrcrx_x_recv = hc->rx_x_recv;
902 rx_info.tfrcrx_rtt = hc->rx_rtt; 827 rx_info.tfrcrx_rtt = hc->rx_rtt;
903 rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U : 828 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
904 scaled_div(1, hc->rx_pinv);
905 len = sizeof(rx_info); 829 len = sizeof(rx_info);
906 val = &rx_info; 830 val = &rx_info;
907 break; 831 break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 032635776653..1a9933c29672 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -42,35 +42,36 @@
42#include "lib/tfrc.h" 42#include "lib/tfrc.h"
43#include "../ccid.h" 43#include "../ccid.h"
44 44
45/* Two seconds as per RFC 3448 4.2 */ 45/* Two seconds as per RFC 5348, 4.2 */
46#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 46#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
47 47
48/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
49#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
50
51/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 48/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
52#define TFRC_T_MBI 64 49#define TFRC_T_MBI 64
53 50
51/*
52 * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
53 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
54 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
55 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
56 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
57 */
58#if (HZ >= 500)
59# define TFRC_T_DELTA USEC_PER_MSEC
60#else
61# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
62#endif
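
Concretely, for HZ = 1000 the timer tick is 1 ms and t_delta stays at the 1 ms floor, while coarser clocks fall back to half a tick (parameterised below purely for illustration):

    #include <assert.h>

    #define USEC_PER_SEC   1000000L
    #define USEC_PER_MSEC  1000L

    static long t_delta(long hz)
    {
        return hz >= 500 ? USEC_PER_MSEC : USEC_PER_SEC / (2 * hz);
    }

    int main(void)
    {
        assert(t_delta(1000) == 1000);  /* 1 ms floor */
        assert(t_delta(250)  == 2000);  /* half of a 4 ms tick */
        assert(t_delta(100)  == 5000);  /* half of a 10 ms tick */
        return 0;
    }
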
63
54enum ccid3_options { 64enum ccid3_options {
55 TFRC_OPT_LOSS_EVENT_RATE = 192, 65 TFRC_OPT_LOSS_EVENT_RATE = 192,
56 TFRC_OPT_LOSS_INTERVALS = 193, 66 TFRC_OPT_LOSS_INTERVALS = 193,
57 TFRC_OPT_RECEIVE_RATE = 194, 67 TFRC_OPT_RECEIVE_RATE = 194,
58}; 68};
59 69
60struct ccid3_options_received {
61 u64 ccid3or_seqno:48,
62 ccid3or_loss_intervals_idx:16;
63 u16 ccid3or_loss_intervals_len;
64 u32 ccid3or_loss_event_rate;
65 u32 ccid3or_receive_rate;
66};
67
68/* TFRC sender states */ 70/* TFRC sender states */
69enum ccid3_hc_tx_states { 71enum ccid3_hc_tx_states {
70 TFRC_SSTATE_NO_SENT = 1, 72 TFRC_SSTATE_NO_SENT = 1,
71 TFRC_SSTATE_NO_FBACK, 73 TFRC_SSTATE_NO_FBACK,
72 TFRC_SSTATE_FBACK, 74 TFRC_SSTATE_FBACK,
73 TFRC_SSTATE_TERM,
74}; 75};
75 76
76/** 77/**
@@ -90,19 +91,16 @@ enum ccid3_hc_tx_states {
90 * @tx_no_feedback_timer: Handle to no feedback timer 91 * @tx_no_feedback_timer: Handle to no feedback timer
91 * @tx_t_ld: Time last doubled during slow start 92 * @tx_t_ld: Time last doubled during slow start
92 * @tx_t_nom: Nominal send time of next packet 93 * @tx_t_nom: Nominal send time of next packet
93 * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs
94 * @tx_hist: Packet history 94 * @tx_hist: Packet history
95 * @tx_options_received: Parsed set of retrieved options
96 */ 95 */
97struct ccid3_hc_tx_sock { 96struct ccid3_hc_tx_sock {
98 struct tfrc_tx_info tx_tfrc; 97 u64 tx_x;
99#define tx_x tx_tfrc.tfrctx_x 98 u64 tx_x_recv;
100#define tx_x_recv tx_tfrc.tfrctx_x_recv 99 u32 tx_x_calc;
101#define tx_x_calc tx_tfrc.tfrctx_x_calc 100 u32 tx_rtt;
102#define tx_rtt tx_tfrc.tfrctx_rtt 101 u32 tx_p;
103#define tx_p tx_tfrc.tfrctx_p 102 u32 tx_t_rto;
104#define tx_t_rto tx_tfrc.tfrctx_rto 103 u32 tx_t_ipi;
105#define tx_t_ipi tx_tfrc.tfrctx_ipi
106 u16 tx_s; 104 u16 tx_s;
107 enum ccid3_hc_tx_states tx_state:8; 105 enum ccid3_hc_tx_states tx_state:8;
108 u8 tx_last_win_count; 106 u8 tx_last_win_count;
@@ -110,9 +108,7 @@ struct ccid3_hc_tx_sock {
110 struct timer_list tx_no_feedback_timer; 108 struct timer_list tx_no_feedback_timer;
111 ktime_t tx_t_ld; 109 ktime_t tx_t_ld;
112 ktime_t tx_t_nom; 110 ktime_t tx_t_nom;
113 u32 tx_delta;
114 struct tfrc_tx_hist_entry *tx_hist; 111 struct tfrc_tx_hist_entry *tx_hist;
115 struct ccid3_options_received tx_options_received;
116}; 112};
117 113
118static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 114static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -126,21 +122,16 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
126enum ccid3_hc_rx_states { 122enum ccid3_hc_rx_states {
127 TFRC_RSTATE_NO_DATA = 1, 123 TFRC_RSTATE_NO_DATA = 1,
128 TFRC_RSTATE_DATA, 124 TFRC_RSTATE_DATA,
129 TFRC_RSTATE_TERM = 127,
130}; 125};
131 126
132/** 127/**
133 * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 128 * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
134 * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3)
135 * @rx_rtt: Receiver estimate of rtt (non-standard)
136 * @rx_p: Current loss event rate (RFC 3448 5.4)
137 * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) 129 * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
138 * @rx_state: Receiver state, one of %ccid3_hc_rx_states 130 * @rx_state: Receiver state, one of %ccid3_hc_rx_states
139 * @rx_bytes_recv: Total sum of DCCP payload bytes 131 * @rx_bytes_recv: Total sum of DCCP payload bytes
140 * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) 132 * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
141 * @rx_rtt: Receiver estimate of RTT 133 * @rx_rtt: Receiver estimate of RTT
142 * @rx_tstamp_last_feedback: Time at which last feedback was sent 134 * @rx_tstamp_last_feedback: Time at which last feedback was sent
143 * @rx_tstamp_last_ack: Time at which last feedback was sent
144 * @rx_hist: Packet history (loss detection + RTT sampling) 135 * @rx_hist: Packet history (loss detection + RTT sampling)
145 * @rx_li_hist: Loss Interval database 136 * @rx_li_hist: Loss Interval database
146 * @rx_s: Received packet size in bytes 137 * @rx_s: Received packet size in bytes
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf79071..497723c4d4bb 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
116 cur->li_length = len; 116 cur->li_length = len;
117 tfrc_lh_calc_i_mean(lh); 117 tfrc_lh_calc_i_mean(lh);
118 118
119 return (lh->i_mean < old_i_mean); 119 return lh->i_mean < old_i_mean;
120} 120}
121 121
122/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 122/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 3a4f414e94a0..de8fe294bf0b 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -38,18 +38,6 @@
38#include "packet_history.h" 38#include "packet_history.h"
39#include "../../dccp.h" 39#include "../../dccp.h"
40 40
41/**
42 * tfrc_tx_hist_entry - Simple singly-linked TX history list
43 * @next: next oldest entry (LIFO order)
44 * @seqno: sequence number of this entry
45 * @stamp: send time of packet with sequence number @seqno
46 */
47struct tfrc_tx_hist_entry {
48 struct tfrc_tx_hist_entry *next;
49 u64 seqno;
50 ktime_t stamp;
51};
52
53/* 41/*
54 * Transmitter History Routines 42 * Transmitter History Routines
55 */ 43 */
@@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
71 } 59 }
72} 60}
73 61
74static struct tfrc_tx_hist_entry *
75 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
76{
77 while (head != NULL && head->seqno != seqno)
78 head = head->next;
79
80 return head;
81}
82
83int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 62int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
84{ 63{
85 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 64 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
107 *headp = NULL; 86 *headp = NULL;
108} 87}
109 88
110u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
111 const ktime_t now)
112{
113 u32 rtt = 0;
114 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
115
116 if (packet != NULL) {
117 rtt = ktime_us_delta(now, packet->stamp);
118 /*
119 * Garbage-collect older (irrelevant) entries:
120 */
121 tfrc_tx_hist_purge(&packet->next);
122 }
123
124 return rtt;
125}
126
127
128/* 89/*
129 * Receiver History Routines 90 * Receiver History Routines
130 */ 91 */
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 7df6c5299999..7ee4a9d9d335 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h" 41#include "tfrc.h"
42 42
43struct tfrc_tx_hist_entry; 43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55static inline struct tfrc_tx_hist_entry *
56 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
57{
58 while (head != NULL && head->seqno != seqno)
59 head = head->next;
60 return head;
61}
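
Since tfrc_tx_hist_add() pushes new entries at the head, the list is ordered newest-first; a hit during lookup therefore means everything behind it is older and can be purged, which is exactly how the CCID-3 sender samples the RTT after locating an acknowledged entry. A toy model of that ordering (no kernel types):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct hist { struct hist *next; uint64_t seqno; };

    static struct hist *hist_find(struct hist *head, uint64_t seqno)
    {
        while (head != NULL && head->seqno != seqno)
            head = head->next;
        return head;
    }

    int main(void)
    {
        /* sent in order 1, 2, 3 => head of the list is the newest */
        struct hist e1 = { NULL, 1 }, e2 = { &e1, 2 }, e3 = { &e2, 3 };
        struct hist *acked = hist_find(&e3, 2);

        assert(acked == &e2);
        assert(acked->next == &e1);  /* older entries; safe to purge */
        return 0;
    }
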
44 62
45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 63extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 64extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
49 65
50/* Subtraction a-b modulo-16, respects circular wrap-around */ 66/* Subtraction a-b modulo-16, respects circular wrap-around */
51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) 67#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 01bb48e96c2e..f8ee3f549770 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
57 57
58extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 58extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
59extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 59extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
60extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
60 61
61extern int tfrc_tx_packet_history_init(void); 62extern int tfrc_tx_packet_history_init(void);
62extern void tfrc_tx_packet_history_exit(void); 63extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 22ca1cf0eb55..a052a4377e26 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
687 index = tfrc_binsearch(fvalue, 0); 687 index = tfrc_binsearch(fvalue, 0);
688 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 688 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
689} 689}
690
691/**
692 * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
693 * When @loss_event_rate is large, there is a chance that p is truncated to 0.
694 * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
695 */
696u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
697{
698 if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
699 return 0;
700 if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
701 return 1000000;
702 return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
703}
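
Worked values, using a standalone model in which smallest_p stands in for TFRC_SMALLEST_P (whose value is defined in the TFRC headers) and scaled_div(1, x) is taken as 10^6 / x:

    #include <assert.h>
    #include <limits.h>
    #include <stdint.h>

    static uint32_t invert_pinv(uint32_t pinv, uint32_t smallest_p)
    {
        uint32_t p;

        if (pinv == UINT_MAX)     /* no loss seen: RFC 4342, 8.5 */
            return 0;
        if (pinv == 0)            /* map 1/0 to 100% */
            return 1000000;
        p = 1000000 / pinv;
        return p > smallest_p ? p : smallest_p;
    }

    int main(void)
    {
        assert(invert_pinv(UINT_MAX, 8) == 0);
        assert(invert_pinv(0, 8) == 1000000);
        assert(invert_pinv(100, 8) == 10000);   /* 1% loss rate */
        assert(invert_pinv(500000, 8) == 8);    /* truncated, clamped up */
        return 0;
    }
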
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 3ccef1b70fee..5fdb07229017 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -93,9 +93,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
93#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) 93#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
94#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) 94#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
95 95
96/* Maximal interval between probes for local resources. */
97#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
98
99/* sysctl variables for DCCP */ 96/* sysctl variables for DCCP */
100extern int sysctl_dccp_request_retries; 97extern int sysctl_dccp_request_retries;
101extern int sysctl_dccp_retries1; 98extern int sysctl_dccp_retries1;
@@ -153,18 +150,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
153} 150}
154 151
155/** 152/**
156 * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1 153 * dccp_loss_count - Approximate the number of lost data packets in a burst loss
157 * @s1: start sequence number 154 * @s1: last known sequence number before the loss ('hole')
158 * @s2: end sequence number 155 * @s2: first sequence number seen after the 'hole'
159 * @ndp: NDP count on packet with sequence number @s2 156 * @ndp: NDP count on packet with sequence number @s2
160 * Returns true if the sequence range s1...s2 has no data loss.
161 */ 157 */
162static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) 158static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
163{ 159{
164 s64 delta = dccp_delta_seqno(s1, s2); 160 s64 delta = dccp_delta_seqno(s1, s2);
165 161
166 WARN_ON(delta < 0); 162 WARN_ON(delta < 0);
167 return (u64)delta <= ndp + 1; 163 delta -= ndp + 1;
164
165 return delta > 0 ? delta : 0;
166}
167
168/**
169 * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
170 */
171static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
172{
173 return dccp_loss_count(s1, s2, ndp) == 0;
168} 174}
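
A worked example of the loss arithmetic: delta_seqno counts every sequence number between the two edges of the hole, and the NDP count says how many of those belonged to non-data packets, so delta - (ndp + 1) data packets actually went missing:

    #include <assert.h>
    #include <stdint.h>

    /* same arithmetic as dccp_loss_count(), with delta precomputed
     * (real sequence numbers are mod-2^48) */
    static uint64_t loss_count(int64_t delta, uint64_t ndp)
    {
        delta -= ndp + 1;
        return delta > 0 ? (uint64_t)delta : 0;
    }

    int main(void)
    {
        /* s1=10, s2=14: seqnos 11-13 are missing; one of them was a
         * non-data packet (ndp=1), so 4 - 2 = 2 data packets were lost */
        assert(loss_count(14 - 10, 1) == 2);
        /* contiguous delivery: delta=1, ndp=0 => loss-free */
        assert(loss_count(1, 0) == 0);
        return 0;
    }
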
169 175
170enum { 176enum {
@@ -194,12 +200,7 @@ struct dccp_mib {
194DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); 200DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
195#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) 201#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
196#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) 202#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
197#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
198#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) 203#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
199#define DCCP_ADD_STATS_BH(field, val) \
200 SNMP_ADD_STATS_BH(dccp_statistics, field, val)
201#define DCCP_ADD_STATS_USER(field, val) \
202 SNMP_ADD_STATS_USER(dccp_statistics, field, val)
203 204
204/* 205/*
205 * Checksumming routines 206 * Checksumming routines
@@ -234,8 +235,22 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
234extern void dccp_send_sync(struct sock *sk, const u64 seq, 235extern void dccp_send_sync(struct sock *sk, const u64 seq,
235 const enum dccp_pkt_type pkt_type); 236 const enum dccp_pkt_type pkt_type);
236 237
237extern void dccp_write_xmit(struct sock *sk, int block); 238/*
238extern void dccp_write_space(struct sock *sk); 239 * TX Packet Dequeueing Interface
240 */
241extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
242extern bool dccp_qpolicy_full(struct sock *sk);
243extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
244extern struct sk_buff *dccp_qpolicy_top(struct sock *sk);
245extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
246extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
247
248/*
249 * TX Packet Output and TX Timers
250 */
251extern void dccp_write_xmit(struct sock *sk);
252extern void dccp_write_space(struct sock *sk);
253extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
239 254
240extern void dccp_init_xmit_timers(struct sock *sk); 255extern void dccp_init_xmit_timers(struct sock *sk);
241static inline void dccp_clear_xmit_timers(struct sock *sk) 256static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -246,7 +261,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk)
246extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); 261extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
247 262
248extern const char *dccp_packet_name(const int type); 263extern const char *dccp_packet_name(const int type);
249extern const char *dccp_state_name(const int state);
250 264
251extern void dccp_set_state(struct sock *sk, const int state); 265extern void dccp_set_state(struct sock *sk, const int state);
252extern void dccp_done(struct sock *sk); 266extern void dccp_done(struct sock *sk);
@@ -412,9 +426,27 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
412{ 426{
413 struct dccp_sock *dp = dccp_sk(sk); 427 struct dccp_sock *dp = dccp_sk(sk);
414 428
415 dp->dccps_gsr = seq; 429 if (after48(seq, dp->dccps_gsr))
430 dp->dccps_gsr = seq;
416 /* Sequence validity window depends on remote Sequence Window (7.5.1) */ 431 /* Sequence validity window depends on remote Sequence Window (7.5.1) */
417 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); 432 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
433 /*
434 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
435 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
436 * always > 32, so for the first W/W' packets in the lifetime of a
437 * connection we always have to adjust SWL.
438 * A second reason why we are doing this is that the window depends on
439 * the feature-remote value of Sequence Window: nothing stops the peer
440 * from updating this value while we are busy adjusting SWL for the
441 * first W packets (we would have to count from scratch again then).
442 * Therefore it is safer to always make sure that the Sequence Window
443 * is not artificially extended by a peer who grows SWL downwards by
444 * continually updating the feature-remote Sequence-Window.
445 * If sequence numbers wrap it is bad luck. But that will take a while
446 * (48 bit), and this measure prevents Sequence-number attacks.
447 */
448 if (before48(dp->dccps_swl, dp->dccps_isr))
449 dp->dccps_swl = dp->dccps_isr;
418 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); 450 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
419} 451}
420 452
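The interplay of GSR, SWL, and the ISR clamp above is easiest to see with concrete numbers. Below is a small standalone sketch (userspace C, not kernel code) of the same update, assuming 48-bit sequence arithmetic modulo 2^48 and the kernel's shift-and-subtract circular comparison for before48():

#include <stdint.h>
#include <stdio.h>

#define SEQ48_MASK ((1ULL << 48) - 1)
#define ADD48(a, b) (((a) + (b)) & SEQ48_MASK)
#define SUB48(a, b) (((a) - (b)) & SEQ48_MASK)

/* circular "s1 comes before s2" on 48-bit sequence numbers */
static int before48(uint64_t s1, uint64_t s2)
{
	return (int64_t)((s2 << 16) - (s1 << 16)) > 0;
}

static uint64_t gsr, swl, swh;
static const uint64_t isr = 1000;	/* initial sequence number received */
static const uint64_t r_seq_win = 100;	/* feature-remote Sequence Window W */

static void update_gsr(uint64_t seq)
{
	if (before48(gsr, seq))			/* i.e. after48(seq, gsr) */
		gsr = seq;
	swl = SUB48(ADD48(gsr, 1), r_seq_win / 4);
	if (before48(swl, isr))			/* the clamp discussed above */
		swl = isr;
	swh = ADD48(gsr, (3 * r_seq_win) / 4);
}

int main(void)
{
	update_gsr(isr);	/* first packet: raw SWL = 976, clamped to ISR = 1000 */
	printf("SWL=%llu SWH=%llu\n",
	       (unsigned long long)swl, (unsigned long long)swh);
	return 0;
}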
@@ -425,16 +457,21 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
425 dp->dccps_gss = seq; 457 dp->dccps_gss = seq;
426 /* Ack validity window depends on local Sequence Window value (7.5.1) */ 458 /* Ack validity window depends on local Sequence Window value (7.5.1) */
427 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); 459 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
460 /* Adjust AWL so that it is not below ISS - see comment above for SWL */
461 if (before48(dp->dccps_awl, dp->dccps_iss))
462 dp->dccps_awl = dp->dccps_iss;
428 dp->dccps_awh = dp->dccps_gss; 463 dp->dccps_awh = dp->dccps_gss;
429} 464}
430 465
466static inline int dccp_ackvec_pending(const struct sock *sk)
467{
468 return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
469 !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
470}
471
431static inline int dccp_ack_pending(const struct sock *sk) 472static inline int dccp_ack_pending(const struct sock *sk)
432{ 473{
433 const struct dccp_sock *dp = dccp_sk(sk); 474 return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
434 return dp->dccps_timestamp_echo != 0 ||
435 (dp->dccps_hc_rx_ackvec != NULL &&
436 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
437 inet_csk_ack_scheduled(sk);
438} 475}
439 476
440extern int dccp_feat_finalise_settings(struct dccp_sock *dp); 477extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
@@ -449,7 +486,6 @@ extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
449extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); 486extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
450extern u32 dccp_timestamp(void); 487extern u32 dccp_timestamp(void);
451extern void dccp_timestamping_init(void); 488extern void dccp_timestamping_init(void);
452extern int dccp_insert_option_timestamp(struct sk_buff *skb);
453extern int dccp_insert_option(struct sk_buff *skb, unsigned char option, 489extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
454 const void *value, unsigned char len); 490 const void *value, unsigned char len);
455 491
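The block of qpolicy declarations above fixes an interface contract rather than an implementation: dccp_qpolicy_push() enqueues a packet under the socket's chosen policy, dccp_qpolicy_full() gates the producer in dccp_sendmsg(), and top()/pop() let dccp_write_xmit() peek at the head before committing to a dequeue. As a rough mental model only (the real policies live in qpolicy.c, which appears later in this patch), a minimal FIFO honouring that contract could look like:

#include <stdbool.h>
#include <stdio.h>

#define QCAP 4

struct queue {
	int buf[QCAP];
	int head, len;
};

static bool q_full(const struct queue *q)
{
	return q->len >= QCAP;
}

static void q_push(struct queue *q, int pkt)	/* caller checks q_full() first */
{
	q->buf[(q->head + q->len++) % QCAP] = pkt;
}

static int *q_top(struct queue *q)		/* peek without dequeueing */
{
	return q->len ? &q->buf[q->head] : NULL;
}

static int q_pop(struct queue *q)
{
	int pkt = q->buf[q->head];

	q->head = (q->head + 1) % QCAP;
	q->len--;
	return pkt;
}

int main(void)
{
	struct queue q = { .head = 0, .len = 0 };
	int pkt;

	for (pkt = 1; !q_full(&q); pkt++)	/* producer side: dccp_sendmsg() */
		q_push(&q, pkt);
	while (q_top(&q) != NULL)		/* consumer side: dccp_write_xmit() */
		printf("sent %d\n", q_pop(&q));
	return 0;
}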
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index df7dd26cf07e..568def952722 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
730 0, list, len); 730 0, list, len);
731} 731}
732 732
733/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
734int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
735{
736 /* any changes must be registered before establishing the connection */
737 if (sk->sk_state != DCCP_CLOSED)
738 return -EISCONN;
739 if (dccp_feat_type(feat) != FEAT_NN)
740 return -EINVAL;
741 return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
742}
743 733
744/* 734/*
745 * Tracking features whose values depend on the choice of CCID 735
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f96721619def..e56a4e5e634e 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -111,7 +111,6 @@ extern int dccp_feat_init(struct sock *sk);
111extern void dccp_feat_initialise_sysctls(void); 111extern void dccp_feat_initialise_sysctls(void);
112extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 112extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
113 u8 const *list, u8 len); 113 u8 const *list, u8 len);
114extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
115extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, 114extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
116 u8 mand, u8 opt, u8 feat, u8 *val, u8 len); 115 u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
117extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); 116extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 10c957a88f4f..4222e7a654b0 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -160,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
160 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 160 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
161} 161}
162 162
163static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) 163static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
164{ 164{
165 struct dccp_sock *dp = dccp_sk(sk); 165 struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
166 166
167 if (dp->dccps_hc_rx_ackvec != NULL) 167 if (av == NULL)
168 dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, 168 return;
169 DCCP_SKB_CB(skb)->dccpd_ack_seq); 169 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
170 dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
171 dccp_ackvec_input(av, skb);
170} 172}
171 173
172static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) 174static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -239,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
239 dccp_update_gsr(sk, seqno); 241 dccp_update_gsr(sk, seqno);
240 242
241 if (dh->dccph_type != DCCP_PKT_SYNC && 243 if (dh->dccph_type != DCCP_PKT_SYNC &&
242 (ackno != DCCP_PKT_WITHOUT_ACK_SEQ)) 244 ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
245 after48(ackno, dp->dccps_gar))
243 dp->dccps_gar = ackno; 246 dp->dccps_gar = ackno;
244 } else { 247 } else {
245 unsigned long now = jiffies; 248 unsigned long now = jiffies;
@@ -257,9 +260,9 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
257 */ 260 */
258 if (time_before(now, (dp->dccps_rate_last + 261 if (time_before(now, (dp->dccps_rate_last +
259 sysctl_dccp_sync_ratelimit))) 262 sysctl_dccp_sync_ratelimit)))
260 return 0; 263 return -1;
261 264
262 DCCP_WARN("DCCP: Step 6 failed for %s packet, " 265 DCCP_WARN("Step 6 failed for %s packet, "
263 "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " 266 "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
264 "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " 267 "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
265 "sending SYNC...\n", dccp_packet_name(dh->dccph_type), 268 "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
@@ -365,22 +368,13 @@ discard:
365int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 368int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
366 const struct dccp_hdr *dh, const unsigned len) 369 const struct dccp_hdr *dh, const unsigned len)
367{ 370{
368 struct dccp_sock *dp = dccp_sk(sk);
369
370 if (dccp_check_seqno(sk, skb)) 371 if (dccp_check_seqno(sk, skb))
371 goto discard; 372 goto discard;
372 373
373 if (dccp_parse_options(sk, NULL, skb)) 374 if (dccp_parse_options(sk, NULL, skb))
374 return 1; 375 return 1;
375 376
376 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 377 dccp_handle_ackvec_processing(sk, skb);
377 dccp_event_ack_recv(sk, skb);
378
379 if (dp->dccps_hc_rx_ackvec != NULL &&
380 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
381 DCCP_SKB_CB(skb)->dccpd_seq,
382 DCCP_ACKVEC_STATE_RECEIVED))
383 goto discard;
384 dccp_deliver_input_to_ccids(sk, skb); 378 dccp_deliver_input_to_ccids(sk, skb);
385 379
386 return __dccp_rcv_established(sk, skb, dh, len); 380 return __dccp_rcv_established(sk, skb, dh, len);
@@ -441,20 +435,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
441 kfree_skb(sk->sk_send_head); 435 kfree_skb(sk->sk_send_head);
442 sk->sk_send_head = NULL; 436 sk->sk_send_head = NULL;
443 437
444 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
445 dccp_update_gsr(sk, dp->dccps_isr);
446 /* 438 /*
447 * SWL and AWL are initially adjusted so that they are not less than 439 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
448 * the initial Sequence Numbers received and sent, respectively: 440 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
449 * SWL := max(GSR + 1 - floor(W/4), ISR), 441 * is done as part of activating the feature values below, since
450 * AWL := max(GSS - W' + 1, ISS). 442 * these settings depend on the local/remote Sequence Window
451 * These adjustments MUST be applied only at the beginning of the 443 * features, which were undefined or not confirmed until now.
452 * connection.
453 *
454 * AWL was adjusted in dccp_v4_connect -acme
455 */ 444 */
456 dccp_set_seqno(&dp->dccps_swl, 445 dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
457 max48(dp->dccps_swl, dp->dccps_isr));
458 446
459 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 447 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
460 448
@@ -626,6 +614,9 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
626 /* Caller (dccp_v4_do_rcv) will send Reset */ 614 /* Caller (dccp_v4_do_rcv) will send Reset */
627 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 615 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
628 return 1; 616 return 1;
617 } else if (sk->sk_state == DCCP_CLOSED) {
618 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
619 return 1;
629 } 620 }
630 621
631 if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { 622 if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
@@ -638,15 +629,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
638 if (dccp_parse_options(sk, NULL, skb)) 629 if (dccp_parse_options(sk, NULL, skb))
639 return 1; 630 return 1;
640 631
641 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 632 dccp_handle_ackvec_processing(sk, skb);
642 dccp_event_ack_recv(sk, skb);
643
644 if (dp->dccps_hc_rx_ackvec != NULL &&
645 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
646 DCCP_SKB_CB(skb)->dccpd_seq,
647 DCCP_ACKVEC_STATE_RECEIVED))
648 goto discard;
649
650 dccp_deliver_input_to_ccids(sk, skb); 633 dccp_deliver_input_to_ccids(sk, skb);
651 } 634 }
652 635
@@ -688,10 +671,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
688 } 671 }
689 672
690 switch (sk->sk_state) { 673 switch (sk->sk_state) {
691 case DCCP_CLOSED:
692 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
693 return 1;
694
695 case DCCP_REQUESTING: 674 case DCCP_REQUESTING:
696 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); 675 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
697 if (queued >= 0) 676 if (queued >= 0)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d4a166f0f391..8c36adfd1919 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -40,13 +40,15 @@
40 40
41int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 41int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
42{ 42{
43 const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
43 struct inet_sock *inet = inet_sk(sk); 44 struct inet_sock *inet = inet_sk(sk);
44 struct dccp_sock *dp = dccp_sk(sk); 45 struct dccp_sock *dp = dccp_sk(sk);
45 const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 46 __be16 orig_sport, orig_dport;
46 struct rtable *rt;
47 __be32 daddr, nexthop; 47 __be32 daddr, nexthop;
48 int tmp; 48 struct flowi4 *fl4;
49 struct rtable *rt;
49 int err; 50 int err;
51 struct ip_options_rcu *inet_opt;
50 52
51 dp->dccps_role = DCCP_ROLE_CLIENT; 53 dp->dccps_role = DCCP_ROLE_CLIENT;
52 54
@@ -57,37 +59,43 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
57 return -EAFNOSUPPORT; 59 return -EAFNOSUPPORT;
58 60
59 nexthop = daddr = usin->sin_addr.s_addr; 61 nexthop = daddr = usin->sin_addr.s_addr;
60 if (inet->opt != NULL && inet->opt->srr) { 62
63 inet_opt = rcu_dereference_protected(inet->inet_opt,
64 sock_owned_by_user(sk));
65 if (inet_opt != NULL && inet_opt->opt.srr) {
61 if (daddr == 0) 66 if (daddr == 0)
62 return -EINVAL; 67 return -EINVAL;
63 nexthop = inet->opt->faddr; 68 nexthop = inet_opt->opt.faddr;
64 } 69 }
65 70
66 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, 71 orig_sport = inet->inet_sport;
67 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 72 orig_dport = usin->sin_port;
68 IPPROTO_DCCP, 73 fl4 = &inet->cork.fl.u.ip4;
69 inet->inet_sport, usin->sin_port, sk, 1); 74 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
70 if (tmp < 0) 75 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
71 return tmp; 76 IPPROTO_DCCP,
77 orig_sport, orig_dport, sk, true);
78 if (IS_ERR(rt))
79 return PTR_ERR(rt);
72 80
73 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 81 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
74 ip_rt_put(rt); 82 ip_rt_put(rt);
75 return -ENETUNREACH; 83 return -ENETUNREACH;
76 } 84 }
77 85
78 if (inet->opt == NULL || !inet->opt->srr) 86 if (inet_opt == NULL || !inet_opt->opt.srr)
79 daddr = rt->rt_dst; 87 daddr = fl4->daddr;
80 88
81 if (inet->inet_saddr == 0) 89 if (inet->inet_saddr == 0)
82 inet->inet_saddr = rt->rt_src; 90 inet->inet_saddr = fl4->saddr;
83 inet->inet_rcv_saddr = inet->inet_saddr; 91 inet->inet_rcv_saddr = inet->inet_saddr;
84 92
85 inet->inet_dport = usin->sin_port; 93 inet->inet_dport = usin->sin_port;
86 inet->inet_daddr = daddr; 94 inet->inet_daddr = daddr;
87 95
88 inet_csk(sk)->icsk_ext_hdr_len = 0; 96 inet_csk(sk)->icsk_ext_hdr_len = 0;
89 if (inet->opt != NULL) 97 if (inet_opt)
90 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; 98 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
91 /* 99 /*
92 * Socket identity is still unknown (sport may be zero). 100 * Socket identity is still unknown (sport may be zero).
93 * However we set state to DCCP_REQUESTING and not releasing socket 101 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -99,11 +107,12 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
99 if (err != 0) 107 if (err != 0)
100 goto failure; 108 goto failure;
101 109
102 err = ip_route_newports(&rt, IPPROTO_DCCP, inet->inet_sport, 110 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
103 inet->inet_dport, sk); 111 inet->inet_sport, inet->inet_dport, sk);
104 if (err != 0) 112 if (IS_ERR(rt)) {
113 rt = NULL;
105 goto failure; 114 goto failure;
106 115 }
107 /* OK, now commit destination to socket. */ 116 /* OK, now commit destination to socket. */
108 sk_setup_caps(sk, &rt->dst); 117 sk_setup_caps(sk, &rt->dst);
109 118
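Note the error-handling change running through this hunk: ip_route_connect() and ip_route_newports() now hand back the rtable itself and encode failures in the pointer, replacing the old int-return-plus-output-parameter style. The convention is easy to demonstrate in isolation (the lookup function is a stand-in; only ERR_PTR/IS_ERR/PTR_ERR mirror the kernel helpers):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int route_table = 42;

static int *route_lookup(int fail)	/* stand-in for ip_route_connect() */
{
	return fail ? ERR_PTR(-ENETUNREACH) : &route_table;
}

int main(void)
{
	int *rt = route_lookup(1);

	if (IS_ERR(rt))
		printf("lookup failed: %ld\n", PTR_ERR(rt));	/* -101 */
	else
		printf("route: %d\n", *rt);
	return 0;
}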
@@ -387,39 +396,44 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
387 if (sk_acceptq_is_full(sk)) 396 if (sk_acceptq_is_full(sk))
388 goto exit_overflow; 397 goto exit_overflow;
389 398
390 if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
391 goto exit;
392
393 newsk = dccp_create_openreq_child(sk, req, skb); 399 newsk = dccp_create_openreq_child(sk, req, skb);
394 if (newsk == NULL) 400 if (newsk == NULL)
395 goto exit; 401 goto exit_nonewsk;
396
397 sk_setup_caps(newsk, dst);
398 402
399 newinet = inet_sk(newsk); 403 newinet = inet_sk(newsk);
400 ireq = inet_rsk(req); 404 ireq = inet_rsk(req);
401 newinet->inet_daddr = ireq->rmt_addr; 405 newinet->inet_daddr = ireq->rmt_addr;
402 newinet->inet_rcv_saddr = ireq->loc_addr; 406 newinet->inet_rcv_saddr = ireq->loc_addr;
403 newinet->inet_saddr = ireq->loc_addr; 407 newinet->inet_saddr = ireq->loc_addr;
404 newinet->opt = ireq->opt; 408 newinet->inet_opt = ireq->opt;
405 ireq->opt = NULL; 409 ireq->opt = NULL;
406 newinet->mc_index = inet_iif(skb); 410 newinet->mc_index = inet_iif(skb);
407 newinet->mc_ttl = ip_hdr(skb)->ttl; 411 newinet->mc_ttl = ip_hdr(skb)->ttl;
408 newinet->inet_id = jiffies; 412 newinet->inet_id = jiffies;
409 413
414 if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
415 goto put_and_exit;
416
417 sk_setup_caps(newsk, dst);
418
410 dccp_sync_mss(newsk, dst_mtu(dst)); 419 dccp_sync_mss(newsk, dst_mtu(dst));
411 420
421 if (__inet_inherit_port(sk, newsk) < 0)
422 goto put_and_exit;
412 __inet_hash_nolisten(newsk, NULL); 423 __inet_hash_nolisten(newsk, NULL);
413 __inet_inherit_port(sk, newsk);
414 424
415 return newsk; 425 return newsk;
416 426
417exit_overflow: 427exit_overflow:
418 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 428 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
429exit_nonewsk:
430 dst_release(dst);
419exit: 431exit:
420 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 432 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
421 dst_release(dst);
422 return NULL; 433 return NULL;
434put_and_exit:
435 sock_put(newsk);
436 goto exit;
423} 437}
424 438
425EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); 439EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
@@ -457,20 +471,19 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
457 struct sk_buff *skb) 471 struct sk_buff *skb)
458{ 472{
459 struct rtable *rt; 473 struct rtable *rt;
460 struct flowi fl = { .oif = skb_rtable(skb)->rt_iif, 474 struct flowi4 fl4 = {
461 .nl_u = { .ip4_u = 475 .flowi4_oif = skb_rtable(skb)->rt_iif,
462 { .daddr = ip_hdr(skb)->saddr, 476 .daddr = ip_hdr(skb)->saddr,
463 .saddr = ip_hdr(skb)->daddr, 477 .saddr = ip_hdr(skb)->daddr,
464 .tos = RT_CONN_FLAGS(sk) } }, 478 .flowi4_tos = RT_CONN_FLAGS(sk),
465 .proto = sk->sk_protocol, 479 .flowi4_proto = sk->sk_protocol,
466 .uli_u = { .ports = 480 .fl4_sport = dccp_hdr(skb)->dccph_dport,
467 { .sport = dccp_hdr(skb)->dccph_dport, 481 .fl4_dport = dccp_hdr(skb)->dccph_sport,
468 .dport = dccp_hdr(skb)->dccph_sport } 482 };
469 } 483
470 }; 484 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
471 485 rt = ip_route_output_flow(net, &fl4, sk);
472 security_skb_classify_flow(skb, &fl); 486 if (IS_ERR(rt)) {
473 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) {
474 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 487 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
475 return NULL; 488 return NULL;
476 } 489 }
@@ -484,8 +497,9 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
484 int err = -1; 497 int err = -1;
485 struct sk_buff *skb; 498 struct sk_buff *skb;
486 struct dst_entry *dst; 499 struct dst_entry *dst;
500 struct flowi4 fl4;
487 501
488 dst = inet_csk_route_req(sk, req); 502 dst = inet_csk_route_req(sk, &fl4, req);
489 if (dst == NULL) 503 if (dst == NULL)
490 goto out; 504 goto out;
491 505
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6e3f32575df7..8dc4348774a5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -54,8 +54,8 @@ static void dccp_v6_hash(struct sock *sk)
54 54
55/* add pseudo-header to DCCP checksum stored in skb->csum */ 55/* add pseudo-header to DCCP checksum stored in skb->csum */
56static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, 56static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
57 struct in6_addr *saddr, 57 const struct in6_addr *saddr,
58 struct in6_addr *daddr) 58 const struct in6_addr *daddr)
59{ 59{
60 return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); 60 return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
61} 61}
@@ -87,7 +87,7 @@ static inline __u32 dccp_v6_init_sequence(struct sk_buff *skb)
87static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, 87static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
88 u8 type, u8 code, int offset, __be32 info) 88 u8 type, u8 code, int offset, __be32 info)
89{ 89{
90 struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; 90 const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
91 const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); 91 const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
92 struct dccp_sock *dp; 92 struct dccp_sock *dp;
93 struct ipv6_pinfo *np; 93 struct ipv6_pinfo *np;
@@ -147,30 +147,24 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
147 dst = __sk_dst_check(sk, np->dst_cookie); 147 dst = __sk_dst_check(sk, np->dst_cookie);
148 if (dst == NULL) { 148 if (dst == NULL) {
149 struct inet_sock *inet = inet_sk(sk); 149 struct inet_sock *inet = inet_sk(sk);
150 struct flowi fl; 150 struct flowi6 fl6;
151 151
152 /* BUGGG_FUTURE: Again, it is not clear how 152 /* BUGGG_FUTURE: Again, it is not clear how
153 to handle rthdr case. Ignore this complexity 153 to handle rthdr case. Ignore this complexity
154 for now. 154 for now.
155 */ 155 */
156 memset(&fl, 0, sizeof(fl)); 156 memset(&fl6, 0, sizeof(fl6));
157 fl.proto = IPPROTO_DCCP; 157 fl6.flowi6_proto = IPPROTO_DCCP;
158 ipv6_addr_copy(&fl.fl6_dst, &np->daddr); 158 ipv6_addr_copy(&fl6.daddr, &np->daddr);
159 ipv6_addr_copy(&fl.fl6_src, &np->saddr); 159 ipv6_addr_copy(&fl6.saddr, &np->saddr);
160 fl.oif = sk->sk_bound_dev_if; 160 fl6.flowi6_oif = sk->sk_bound_dev_if;
161 fl.fl_ip_dport = inet->inet_dport; 161 fl6.fl6_dport = inet->inet_dport;
162 fl.fl_ip_sport = inet->inet_sport; 162 fl6.fl6_sport = inet->inet_sport;
163 security_sk_classify_flow(sk, &fl); 163 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
164 164
165 err = ip6_dst_lookup(sk, &dst, &fl); 165 dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
166 if (err) { 166 if (IS_ERR(dst)) {
167 sk->sk_err_soft = -err; 167 sk->sk_err_soft = -PTR_ERR(dst);
168 goto out;
169 }
170
171 err = xfrm_lookup(net, &dst, &fl, sk, 0);
172 if (err < 0) {
173 sk->sk_err_soft = -err;
174 goto out; 168 goto out;
175 } 169 }
176 } else 170 } else
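The same consolidation recurs throughout this file: the old shape needed ip6_dst_lookup() to fill a dst pointer, a manual copy of final_p into the flow, and then a separate xfrm_lookup() pass, while ip6_dst_lookup_flow() now performs all three steps and reports failure through the returned pointer (tested with IS_ERR, as in the IPv4 hunks earlier). A schematic contrast with stand-in types, none of which are the real kernel helpers:

#include <stdio.h>

struct dst { int id; };
static struct dst resolved = { 7 };

/* old shape: status code plus output parameter, then a second xfrm pass */
static int old_dst_lookup(struct dst **dstp) { *dstp = &resolved; return 0; }
static int old_xfrm_pass(struct dst **dstp)  { (void)dstp; return 0; }

/* new shape: one call returning the finished dst; NULL models the error
 * pointer that kernel callers would test with IS_ERR() instead */
static struct dst *new_dst_lookup_flow(void) { return &resolved; }

int main(void)
{
	struct dst *dst;
	int err = old_dst_lookup(&dst);

	if (err == 0)
		err = old_xfrm_pass(&dst);
	printf("old: err=%d dst=%d\n", err, dst->id);

	dst = new_dst_lookup_flow();
	if (dst != NULL)
		printf("new: dst=%d\n", dst->id);
	return 0;
}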
@@ -249,34 +243,30 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
249 struct sk_buff *skb; 243 struct sk_buff *skb;
250 struct ipv6_txoptions *opt = NULL; 244 struct ipv6_txoptions *opt = NULL;
251 struct in6_addr *final_p, final; 245 struct in6_addr *final_p, final;
252 struct flowi fl; 246 struct flowi6 fl6;
253 int err = -1; 247 int err = -1;
254 struct dst_entry *dst; 248 struct dst_entry *dst;
255 249
256 memset(&fl, 0, sizeof(fl)); 250 memset(&fl6, 0, sizeof(fl6));
257 fl.proto = IPPROTO_DCCP; 251 fl6.flowi6_proto = IPPROTO_DCCP;
258 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); 252 ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
259 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); 253 ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
260 fl.fl6_flowlabel = 0; 254 fl6.flowlabel = 0;
261 fl.oif = ireq6->iif; 255 fl6.flowi6_oif = ireq6->iif;
262 fl.fl_ip_dport = inet_rsk(req)->rmt_port; 256 fl6.fl6_dport = inet_rsk(req)->rmt_port;
263 fl.fl_ip_sport = inet_rsk(req)->loc_port; 257 fl6.fl6_sport = inet_rsk(req)->loc_port;
264 security_req_classify_flow(req, &fl); 258 security_req_classify_flow(req, flowi6_to_flowi(&fl6));
265 259
266 opt = np->opt; 260 opt = np->opt;
267 261
268 final_p = fl6_update_dst(&fl, opt, &final); 262 final_p = fl6_update_dst(&fl6, opt, &final);
269
270 err = ip6_dst_lookup(sk, &dst, &fl);
271 if (err)
272 goto done;
273
274 if (final_p)
275 ipv6_addr_copy(&fl.fl6_dst, final_p);
276 263
277 err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0); 264 dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
278 if (err < 0) 265 if (IS_ERR(dst)) {
266 err = PTR_ERR(dst);
267 dst = NULL;
279 goto done; 268 goto done;
269 }
280 270
281 skb = dccp_make_response(sk, dst, req); 271 skb = dccp_make_response(sk, dst, req);
282 if (skb != NULL) { 272 if (skb != NULL) {
@@ -285,8 +275,8 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
285 dh->dccph_checksum = dccp_v6_csum_finish(skb, 275 dh->dccph_checksum = dccp_v6_csum_finish(skb,
286 &ireq6->loc_addr, 276 &ireq6->loc_addr,
287 &ireq6->rmt_addr); 277 &ireq6->rmt_addr);
288 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); 278 ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
289 err = ip6_xmit(sk, skb, &fl, opt); 279 err = ip6_xmit(sk, skb, &fl6, opt);
290 err = net_xmit_eval(err); 280 err = net_xmit_eval(err);
291 } 281 }
292 282
@@ -306,9 +296,9 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req)
306 296
307static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) 297static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
308{ 298{
309 struct ipv6hdr *rxip6h; 299 const struct ipv6hdr *rxip6h;
310 struct sk_buff *skb; 300 struct sk_buff *skb;
311 struct flowi fl; 301 struct flowi6 fl6;
312 struct net *net = dev_net(skb_dst(rxskb)->dev); 302 struct net *net = dev_net(skb_dst(rxskb)->dev);
313 struct sock *ctl_sk = net->dccp.v6_ctl_sk; 303 struct sock *ctl_sk = net->dccp.v6_ctl_sk;
314 struct dst_entry *dst; 304 struct dst_entry *dst;
@@ -327,25 +317,24 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
327 dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, 317 dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
328 &rxip6h->daddr); 318 &rxip6h->daddr);
329 319
330 memset(&fl, 0, sizeof(fl)); 320 memset(&fl6, 0, sizeof(fl6));
331 ipv6_addr_copy(&fl.fl6_dst, &rxip6h->saddr); 321 ipv6_addr_copy(&fl6.daddr, &rxip6h->saddr);
332 ipv6_addr_copy(&fl.fl6_src, &rxip6h->daddr); 322 ipv6_addr_copy(&fl6.saddr, &rxip6h->daddr);
333 323
334 fl.proto = IPPROTO_DCCP; 324 fl6.flowi6_proto = IPPROTO_DCCP;
335 fl.oif = inet6_iif(rxskb); 325 fl6.flowi6_oif = inet6_iif(rxskb);
336 fl.fl_ip_dport = dccp_hdr(skb)->dccph_dport; 326 fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
337 fl.fl_ip_sport = dccp_hdr(skb)->dccph_sport; 327 fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
338 security_skb_classify_flow(rxskb, &fl); 328 security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
339 329
340 /* sk = NULL, but it is safe for now. RST socket required. */ 330 /* sk = NULL, but it is safe for now. RST socket required. */
341 if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) { 331 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
342 if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) { 332 if (!IS_ERR(dst)) {
343 skb_dst_set(skb, dst); 333 skb_dst_set(skb, dst);
344 ip6_xmit(ctl_sk, skb, &fl, NULL); 334 ip6_xmit(ctl_sk, skb, &fl6, NULL);
345 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); 335 DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
346 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); 336 DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
347 return; 337 return;
348 }
349 } 338 }
350 339
351 kfree_skb(skb); 340 kfree_skb(skb);
@@ -484,7 +473,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
484 struct inet6_request_sock *ireq6 = inet6_rsk(req); 473 struct inet6_request_sock *ireq6 = inet6_rsk(req);
485 struct ipv6_pinfo *newnp, *np = inet6_sk(sk); 474 struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
486 struct inet_sock *newinet; 475 struct inet_sock *newinet;
487 struct dccp_sock *newdp;
488 struct dccp6_sock *newdp6; 476 struct dccp6_sock *newdp6;
489 struct sock *newsk; 477 struct sock *newsk;
490 struct ipv6_txoptions *opt; 478 struct ipv6_txoptions *opt;
@@ -498,7 +486,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
498 return NULL; 486 return NULL;
499 487
500 newdp6 = (struct dccp6_sock *)newsk; 488 newdp6 = (struct dccp6_sock *)newsk;
501 newdp = dccp_sk(newsk);
502 newinet = inet_sk(newsk); 489 newinet = inet_sk(newsk);
503 newinet->pinet6 = &newdp6->inet6; 490 newinet->pinet6 = &newdp6->inet6;
504 newnp = inet6_sk(newsk); 491 newnp = inet6_sk(newsk);
@@ -540,31 +527,26 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
540 527
541 if (dst == NULL) { 528 if (dst == NULL) {
542 struct in6_addr *final_p, final; 529 struct in6_addr *final_p, final;
543 struct flowi fl; 530 struct flowi6 fl6;
544 531
545 memset(&fl, 0, sizeof(fl)); 532 memset(&fl6, 0, sizeof(fl6));
546 fl.proto = IPPROTO_DCCP; 533 fl6.flowi6_proto = IPPROTO_DCCP;
547 ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); 534 ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
548 final_p = fl6_update_dst(&fl, opt, &final); 535 final_p = fl6_update_dst(&fl6, opt, &final);
549 ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); 536 ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
550 fl.oif = sk->sk_bound_dev_if; 537 fl6.flowi6_oif = sk->sk_bound_dev_if;
551 fl.fl_ip_dport = inet_rsk(req)->rmt_port; 538 fl6.fl6_dport = inet_rsk(req)->rmt_port;
552 fl.fl_ip_sport = inet_rsk(req)->loc_port; 539 fl6.fl6_sport = inet_rsk(req)->loc_port;
553 security_sk_classify_flow(sk, &fl); 540 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
554 541
555 if (ip6_dst_lookup(sk, &dst, &fl)) 542 dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
556 goto out; 543 if (IS_ERR(dst))
557
558 if (final_p)
559 ipv6_addr_copy(&fl.fl6_dst, final_p);
560
561 if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
562 goto out; 544 goto out;
563 } 545 }
564 546
565 newsk = dccp_create_openreq_child(sk, req, skb); 547 newsk = dccp_create_openreq_child(sk, req, skb);
566 if (newsk == NULL) 548 if (newsk == NULL)
567 goto out; 549 goto out_nonewsk;
568 550
569 /* 551 /*
570 * No need to charge this sock to the relevant IPv6 refcnt debug socks 552 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -578,7 +560,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
578 newdp6 = (struct dccp6_sock *)newsk; 560 newdp6 = (struct dccp6_sock *)newsk;
579 newinet = inet_sk(newsk); 561 newinet = inet_sk(newsk);
580 newinet->pinet6 = &newdp6->inet6; 562 newinet->pinet6 = &newdp6->inet6;
581 newdp = dccp_sk(newsk);
582 newnp = inet6_sk(newsk); 563 newnp = inet6_sk(newsk);
583 564
584 memcpy(newnp, np, sizeof(struct ipv6_pinfo)); 565 memcpy(newnp, np, sizeof(struct ipv6_pinfo));
@@ -592,7 +573,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
592 573
593 First: no IPv4 options. 574 First: no IPv4 options.
594 */ 575 */
595 newinet->opt = NULL; 576 newinet->inet_opt = NULL;
596 577
597 /* Clone RX bits */ 578 /* Clone RX bits */
598 newnp->rxopt.all = np->rxopt.all; 579 newnp->rxopt.all = np->rxopt.all;
@@ -632,18 +613,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
632 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; 613 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
633 newinet->inet_rcv_saddr = LOOPBACK4_IPV6; 614 newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
634 615
616 if (__inet_inherit_port(sk, newsk) < 0) {
617 sock_put(newsk);
618 goto out;
619 }
635 __inet6_hash(newsk, NULL); 620 __inet6_hash(newsk, NULL);
636 __inet_inherit_port(sk, newsk);
637 621
638 return newsk; 622 return newsk;
639 623
640out_overflow: 624out_overflow:
641 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 625 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
626out_nonewsk:
627 dst_release(dst);
642out: 628out:
643 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 629 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
644 if (opt != NULL && opt != np->opt) 630 if (opt != NULL && opt != np->opt)
645 sock_kfree_s(sk, opt, opt->tot_len); 631 sock_kfree_s(sk, opt, opt->tot_len);
646 dst_release(dst);
647 return NULL; 632 return NULL;
648} 633}
649 634
@@ -874,7 +859,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
874 struct ipv6_pinfo *np = inet6_sk(sk); 859 struct ipv6_pinfo *np = inet6_sk(sk);
875 struct dccp_sock *dp = dccp_sk(sk); 860 struct dccp_sock *dp = dccp_sk(sk);
876 struct in6_addr *saddr = NULL, *final_p, final; 861 struct in6_addr *saddr = NULL, *final_p, final;
877 struct flowi fl; 862 struct flowi6 fl6;
878 struct dst_entry *dst; 863 struct dst_entry *dst;
879 int addr_type; 864 int addr_type;
880 int err; 865 int err;
@@ -887,14 +872,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
887 if (usin->sin6_family != AF_INET6) 872 if (usin->sin6_family != AF_INET6)
888 return -EAFNOSUPPORT; 873 return -EAFNOSUPPORT;
889 874
890 memset(&fl, 0, sizeof(fl)); 875 memset(&fl6, 0, sizeof(fl6));
891 876
892 if (np->sndflow) { 877 if (np->sndflow) {
893 fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; 878 fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
894 IP6_ECN_flow_init(fl.fl6_flowlabel); 879 IP6_ECN_flow_init(fl6.flowlabel);
895 if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) { 880 if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
896 struct ip6_flowlabel *flowlabel; 881 struct ip6_flowlabel *flowlabel;
897 flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); 882 flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
898 if (flowlabel == NULL) 883 if (flowlabel == NULL)
899 return -EINVAL; 884 return -EINVAL;
900 ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); 885 ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
@@ -931,7 +916,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
931 } 916 }
932 917
933 ipv6_addr_copy(&np->daddr, &usin->sin6_addr); 918 ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
934 np->flow_label = fl.fl6_flowlabel; 919 np->flow_label = fl6.flowlabel;
935 920
936 /* 921 /*
937 * DCCP over IPv4 922 * DCCP over IPv4
@@ -968,33 +953,24 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
968 if (!ipv6_addr_any(&np->rcv_saddr)) 953 if (!ipv6_addr_any(&np->rcv_saddr))
969 saddr = &np->rcv_saddr; 954 saddr = &np->rcv_saddr;
970 955
971 fl.proto = IPPROTO_DCCP; 956 fl6.flowi6_proto = IPPROTO_DCCP;
972 ipv6_addr_copy(&fl.fl6_dst, &np->daddr); 957 ipv6_addr_copy(&fl6.daddr, &np->daddr);
973 ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr); 958 ipv6_addr_copy(&fl6.saddr, saddr ? saddr : &np->saddr);
974 fl.oif = sk->sk_bound_dev_if; 959 fl6.flowi6_oif = sk->sk_bound_dev_if;
975 fl.fl_ip_dport = usin->sin6_port; 960 fl6.fl6_dport = usin->sin6_port;
976 fl.fl_ip_sport = inet->inet_sport; 961 fl6.fl6_sport = inet->inet_sport;
977 security_sk_classify_flow(sk, &fl); 962 security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
978 963
979 final_p = fl6_update_dst(&fl, np->opt, &final); 964 final_p = fl6_update_dst(&fl6, np->opt, &final);
980 965
981 err = ip6_dst_lookup(sk, &dst, &fl); 966 dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
982 if (err) 967 if (IS_ERR(dst)) {
968 err = PTR_ERR(dst);
983 goto failure; 969 goto failure;
984
985 if (final_p)
986 ipv6_addr_copy(&fl.fl6_dst, final_p);
987
988 err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
989 if (err < 0) {
990 if (err == -EREMOTE)
991 err = ip6_dst_blackhole(sk, &dst, &fl);
992 if (err < 0)
993 goto failure;
994 } 970 }
995 971
996 if (saddr == NULL) { 972 if (saddr == NULL) {
997 saddr = &fl.fl6_src; 973 saddr = &fl6.saddr;
998 ipv6_addr_copy(&np->rcv_saddr, saddr); 974 ipv6_addr_copy(&np->rcv_saddr, saddr);
999 } 975 }
1000 976
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 128b089d3aef..d7041a0963af 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
121 * 121 *
122 * Choose S.ISS (initial seqno) or set from Init Cookies 122 * Choose S.ISS (initial seqno) or set from Init Cookies
123 * Initialize S.GAR := S.ISS 123 * Initialize S.GAR := S.ISS
124 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies 124 * Set S.ISR, S.GSR from packet (or Init Cookies)
125 */ 125 *
126 newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; 126 * Setting AWL/AWH and SWL/SWH happens as part of the feature
127 dccp_update_gss(newsk, dreq->dreq_iss); 127 * activation below, as these windows all depend on the local
128 128 * and remote Sequence Window feature values (7.5.2).
129 newdp->dccps_isr = dreq->dreq_isr;
130 dccp_update_gsr(newsk, dreq->dreq_isr);
131
132 /*
133 * SWL and AWL are initially adjusted so that they are not less than
134 * the initial Sequence Numbers received and sent, respectively:
135 * SWL := max(GSR + 1 - floor(W/4), ISR),
136 * AWL := max(GSS - W' + 1, ISS).
137 * These adjustments MUST be applied only at the beginning of the
138 * connection.
139 */ 129 */
140 dccp_set_seqno(&newdp->dccps_swl, 130 newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
141 max48(newdp->dccps_swl, newdp->dccps_isr)); 131 newdp->dccps_gar = newdp->dccps_iss;
142 dccp_set_seqno(&newdp->dccps_awl, 132 newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
143 max48(newdp->dccps_awl, newdp->dccps_iss));
144 133
145 /* 134 /*
146 * Activate features after initialising the sequence numbers, 135 * Activate features: initialise CCIDs, sequence windows etc.
147 * since CCID initialisation may depend on GSS, ISR, ISS etc.
148 */ 136 */
149 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 137 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
150 /* It is still raw copy of parent, so invalidate 138 /* It is still raw copy of parent, so invalidate
diff --git a/net/dccp/options.c b/net/dccp/options.c
index bfda087bd90d..4b2ab657ac8e 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
54 struct dccp_sock *dp = dccp_sk(sk); 54 struct dccp_sock *dp = dccp_sk(sk);
55 const struct dccp_hdr *dh = dccp_hdr(skb); 55 const struct dccp_hdr *dh = dccp_hdr(skb);
56 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; 56 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
57 u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
58 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 57 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
59 unsigned char *opt_ptr = options; 58 unsigned char *opt_ptr = options;
60 const unsigned char *opt_end = (unsigned char *)dh + 59 const unsigned char *opt_end = (unsigned char *)dh +
@@ -96,18 +95,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
96 } 95 }
97 96
98 /* 97 /*
99 * CCID-Specific Options (from RFC 4340, sec. 10.3):
100 *
101 * Option numbers 128 through 191 are for options sent from the
102 * HC-Sender to the HC-Receiver; option numbers 192 through 255
103 * are for options sent from the HC-Receiver to the HC-Sender.
104 *
105 * CCID-specific options are ignored during connection setup, as 98 * CCID-specific options are ignored during connection setup, as
106 * negotiation may still be in progress (see RFC 4340, 10.3). 99 * negotiation may still be in progress (see RFC 4340, 10.3).
107 * The same applies to Ack Vectors, as these depend on the CCID. 100 * The same applies to Ack Vectors, as these depend on the CCID.
108 *
109 */ 101 */
110 if (dreq != NULL && (opt >= 128 || 102 if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
111 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 103 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
112 goto ignore_option; 104 goto ignore_option;
113 105
@@ -131,19 +123,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
131 case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: 123 case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
132 if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ 124 if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
133 break; 125 break;
126 if (len == 0)
127 goto out_invalid_option;
134 rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, 128 rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
135 *value, value + 1, len - 1); 129 *value, value + 1, len - 1);
136 if (rc) 130 if (rc)
137 goto out_featneg_failed; 131 goto out_featneg_failed;
138 break; 132 break;
139 case DCCPO_ACK_VECTOR_0:
140 case DCCPO_ACK_VECTOR_1:
141 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
142 break;
143 if (dp->dccps_hc_rx_ackvec != NULL &&
144 dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
145 goto out_invalid_option;
146 break;
147 case DCCPO_TIMESTAMP: 133 case DCCPO_TIMESTAMP:
148 if (len != 4) 134 if (len != 4)
149 goto out_invalid_option; 135 goto out_invalid_option;
@@ -170,6 +156,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
170 dccp_role(sk), ntohl(opt_val), 156 dccp_role(sk), ntohl(opt_val),
171 (unsigned long long) 157 (unsigned long long)
172 DCCP_SKB_CB(skb)->dccpd_ack_seq); 158 DCCP_SKB_CB(skb)->dccpd_ack_seq);
159 /* schedule an Ack in case this sender is quiescent */
160 inet_csk_schedule_ack(sk);
173 break; 161 break;
174 case DCCPO_TIMESTAMP_ECHO: 162 case DCCPO_TIMESTAMP_ECHO:
175 if (len != 4 && len != 6 && len != 8) 163 if (len != 4 && len != 6 && len != 8)
@@ -226,23 +214,25 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
226 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 214 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
227 dccp_role(sk), elapsed_time); 215 dccp_role(sk), elapsed_time);
228 break; 216 break;
229 case 128 ... 191: { 217 case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
230 const u16 idx = value - options;
231
232 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 218 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
233 opt, len, idx, 219 pkt_type, opt, value, len))
234 value) != 0)
235 goto out_invalid_option; 220 goto out_invalid_option;
236 }
237 break; 221 break;
238 case 192 ... 255: { 222 case DCCPO_ACK_VECTOR_0:
239 const u16 idx = value - options; 223 case DCCPO_ACK_VECTOR_1:
240 224 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
225 break;
226 /*
227 * Ack vectors are processed by the TX CCID if it is
228 * interested. The RX CCID need not parse Ack Vectors,
229 * since it is only interested in clearing old state.
230 * Fall through.
231 */
232 case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
241 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 233 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
242 opt, len, idx, 234 pkt_type, opt, value, len))
243 value) != 0)
244 goto out_invalid_option; 235 goto out_invalid_option;
245 }
246 break; 236 break;
247 default: 237 default:
248 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " 238 DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -353,6 +343,7 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
353 return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; 343 return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
354} 344}
355 345
346/* FIXME: This function is currently not used anywhere */
356int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time) 347int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
357{ 348{
358 const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); 349 const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
@@ -384,7 +375,7 @@ int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
384 375
385EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); 376EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
386 377
387int dccp_insert_option_timestamp(struct sk_buff *skb) 378static int dccp_insert_option_timestamp(struct sk_buff *skb)
388{ 379{
389 __be32 now = htonl(dccp_timestamp()); 380 __be32 now = htonl(dccp_timestamp());
390 /* yes this will overflow but that is the point as we want a 381 /* yes this will overflow but that is the point as we want a
@@ -393,8 +384,6 @@ int dccp_insert_option_timestamp(struct sk_buff *skb)
393 return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); 384 return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
394} 385}
395 386
396EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
397
398static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, 387static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
399 struct dccp_request_sock *dreq, 388 struct dccp_request_sock *dreq,
400 struct sk_buff *skb) 389 struct sk_buff *skb)
@@ -439,6 +428,83 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
439 return 0; 428 return 0;
440} 429}
441 430
431static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
432{
433 struct dccp_sock *dp = dccp_sk(sk);
434 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
435 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
436 const u16 buflen = dccp_ackvec_buflen(av);
437 /* Figure out how many options do we need to represent the ackvec */
438 const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
439 u16 len = buflen + 2 * nr_opts;
440 u8 i, nonce = 0;
441 const unsigned char *tail, *from;
442 unsigned char *to;
443
444 if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
445 DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
446 dccp_packet_name(dcb->dccpd_type));
447 return -1;
448 }
449 /*
450 * Since Ack Vectors are variable-length, we cannot always predict
451 * their size. To catch exceptional cases where the space is running out
452 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
453 */
454 if (len > DCCPAV_MIN_OPTLEN &&
455 len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
456 DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
457 "MPS=%u ==> reduce payload size?\n", len, skb->len,
458 dcb->dccpd_opt_len, dp->dccps_mss_cache);
459 dp->dccps_sync_scheduled = 1;
460 return 0;
461 }
462 dcb->dccpd_opt_len += len;
463
464 to = skb_push(skb, len);
465 len = buflen;
466 from = av->av_buf + av->av_buf_head;
467 tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
468
469 for (i = 0; i < nr_opts; ++i) {
470 int copylen = len;
471
472 if (len > DCCP_SINGLE_OPT_MAXLEN)
473 copylen = DCCP_SINGLE_OPT_MAXLEN;
474
475 /*
476 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
477 * its type; ack_nonce is the XOR (one-bit sum) of all individual buf_nonce's.
478 */
479 nonce ^= av->av_buf_nonce[i];
480
481 *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
482 *to++ = copylen + 2;
483
484 /* Check if buf_head wraps */
485 if (from + copylen > tail) {
486 const u16 tailsize = tail - from;
487
488 memcpy(to, from, tailsize);
489 to += tailsize;
490 len -= tailsize;
491 copylen -= tailsize;
492 from = av->av_buf;
493 }
494
495 memcpy(to, from, copylen);
496 from += copylen;
497 to += copylen;
498 len -= copylen;
499 }
500 /*
501 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
502 */
503 if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
504 return -ENOBUFS;
505 return 0;
506}
507
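The sizing arithmetic at the top of dccp_insert_option_ackvec() deserves a worked example. Each option carries at most DCCP_SINGLE_OPT_MAXLEN payload bytes plus a 2-byte type/length header (253 is assumed below, so that the one-byte length field of 255 is not exceeded); a 300-byte Ack Vector therefore needs two options and 304 bytes of option space:

#include <stdio.h>

#define DCCP_SINGLE_OPT_MAXLEN 253		/* assumed per-option payload cap */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int buflen = 300;	/* live Ack Vector state in the ring buffer */
	unsigned int nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
	unsigned int len = buflen + 2 * nr_opts;

	printf("%u bytes -> %u options, %u bytes on the wire\n",
	       buflen, nr_opts, len);	/* 300 bytes -> 2 options, 304 bytes */
	return 0;
}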
442/** 508/**
443 * dccp_insert_option_mandatory - Mandatory option (5.8.2) 509 * dccp_insert_option_mandatory - Mandatory option (5.8.2)
444 * Note that since we are using skb_push, this function needs to be called 510 * Note that since we are using skb_push, this function needs to be called
@@ -534,8 +600,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
534 if (dccp_insert_option_timestamp(skb)) 600 if (dccp_insert_option_timestamp(skb))
535 return -1; 601 return -1;
536 602
537 } else if (dp->dccps_hc_rx_ackvec != NULL && 603 } else if (dccp_ackvec_pending(sk) &&
538 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
539 dccp_insert_option_ackvec(sk, skb)) { 604 dccp_insert_option_ackvec(sk, skb)) {
540 return -1; 605 return -1;
541 } 606 }
diff --git a/net/dccp/output.c b/net/dccp/output.c
index aadbdb58758b..fab108e51e5a 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -43,7 +43,7 @@ static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
43static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) 43static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
44{ 44{
45 if (likely(skb != NULL)) { 45 if (likely(skb != NULL)) {
46 const struct inet_sock *inet = inet_sk(sk); 46 struct inet_sock *inet = inet_sk(sk);
47 const struct inet_connection_sock *icsk = inet_csk(sk); 47 const struct inet_connection_sock *icsk = inet_csk(sk);
48 struct dccp_sock *dp = dccp_sk(sk); 48 struct dccp_sock *dp = dccp_sk(sk);
49 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 49 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -136,14 +136,14 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
136 136
137 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 137 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
138 138
139 err = icsk->icsk_af_ops->queue_xmit(skb); 139 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
140 return net_xmit_eval(err); 140 return net_xmit_eval(err);
141 } 141 }
142 return -ENOBUFS; 142 return -ENOBUFS;
143} 143}
144 144
145/** 145/**
146 * dccp_determine_ccmps - Find out about CCID-specfic packet-size limits 146 * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
147 * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), 147 * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
148 * since the RX CCID is restricted to feedback packets (Acks), which are small 148 * since the RX CCID is restricted to feedback packets (Acks), which are small
149 * in comparison with the data traffic. A value of 0 means "no current CCMPS". 149 * in comparison with the data traffic. A value of 0 means "no current CCMPS".
@@ -209,108 +209,158 @@ void dccp_write_space(struct sock *sk)
209} 209}
210 210
211/** 211/**
212 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet 212 * dccp_wait_for_ccid - Await CCID send permission
213 * @sk: socket to wait for 213 * @sk: socket to wait for
214 * @skb: current skb to pass on for waiting 214 * @delay: timeout in jiffies
215 * @delay: sleep timeout in milliseconds (> 0) 215 * This is used by CCIDs which need to delay the send time in process context.
216 * This function is called by default when the socket is closed, and
217 * when a non-zero linger time is set on the socket. For consistency
218 */ 216 */
219static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) 217static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
220{ 218{
221 struct dccp_sock *dp = dccp_sk(sk);
222 DEFINE_WAIT(wait); 219 DEFINE_WAIT(wait);
223 unsigned long jiffdelay; 220 long remaining;
224 int rc;
225 221
226 do { 222 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
227 dccp_pr_debug("delayed send by %d msec\n", delay); 223 sk->sk_write_pending++;
228 jiffdelay = msecs_to_jiffies(delay); 224 release_sock(sk);
229 225
230 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 226 remaining = schedule_timeout(delay);
231 227
232 sk->sk_write_pending++; 228 lock_sock(sk);
233 release_sock(sk); 229 sk->sk_write_pending--;
234 schedule_timeout(jiffdelay); 230 finish_wait(sk_sleep(sk), &wait);
235 lock_sock(sk);
236 sk->sk_write_pending--;
237 231
238 if (sk->sk_err) 232 if (signal_pending(current) || sk->sk_err)
239 goto do_error; 233 return -1;
240 if (signal_pending(current)) 234 return remaining;
241 goto do_interrupted; 235}
242 236
243 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 237/**
244 } while ((delay = rc) > 0); 238 * dccp_xmit_packet - Send data packet under control of CCID
245out: 239 * Transmits next-queued payload and informs CCID to account for the packet.
246 finish_wait(sk_sleep(sk), &wait); 240 */
247 return rc; 241static void dccp_xmit_packet(struct sock *sk)
248 242{
249do_error: 243 int err, len;
250 rc = -EPIPE; 244 struct dccp_sock *dp = dccp_sk(sk);
251 goto out; 245 struct sk_buff *skb = dccp_qpolicy_pop(sk);
252do_interrupted: 246
253 rc = -EINTR; 247 if (unlikely(skb == NULL))
254 goto out; 248 return;
249 len = skb->len;
250
251 if (sk->sk_state == DCCP_PARTOPEN) {
252 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
253 /*
254 * See 8.1.5 - Handshake Completion.
255 *
256 * For robustness we resend Confirm options until the client has
257 * entered OPEN. During the initial feature negotiation, the MPS
258 * is smaller than usual, reduced by the Change/Confirm options.
259 */
260 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
261 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
262 dccp_send_ack(sk);
263 dccp_feat_list_purge(&dp->dccps_featneg);
264 }
265
266 inet_csk_schedule_ack(sk);
267 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
268 inet_csk(sk)->icsk_rto,
269 DCCP_RTO_MAX);
270 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
271 } else if (dccp_ack_pending(sk)) {
272 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
273 } else {
274 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
275 }
276
277 err = dccp_transmit_skb(sk, skb);
278 if (err)
279 dccp_pr_debug("transmit_skb() returned err=%d\n", err);
280 /*
281 * Register this one as sent even if an error occurred. To the remote
282 * end a local packet drop is indistinguishable from network loss, i.e.
283 * any local drop will eventually be reported via receiver feedback.
284 */
285 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
286
287 /*
288 * If the CCID needs to transfer additional header options out-of-band
289 * (e.g. Ack Vectors or feature-negotiation options), it activates this
290 * flag to schedule a Sync. The Sync will automatically incorporate all
291 * currently pending header options, thus clearing the backlog.
292 */
293 if (dp->dccps_sync_scheduled)
294 dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
255} 295}
256 296
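The PARTOPEN branch in dccp_xmit_packet() above compares the queued payload against an MPS reduced by the space reserved for Change/Confirm options. With illustrative numbers (the value of DCCP_FEATNEG_OVERHEAD below is an assumption, not the kernel's constant):

#include <stdio.h>

#define DCCP_FEATNEG_OVERHEAD 128	/* assumed Change/Confirm reservation */

int main(void)
{
	unsigned int mss_cache = 1424;	/* assumed current MPS */
	unsigned int cur_mps = mss_cache - DCCP_FEATNEG_OVERHEAD;
	unsigned int len = 1400;	/* queued payload */

	if (len > cur_mps)	/* would trigger the "Payload too large" warning */
		printf("payload %u exceeds PARTOPEN limit %u\n", len, cur_mps);
	return 0;
}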
257void dccp_write_xmit(struct sock *sk, int block) 297/**
298 * dccp_flush_write_queue - Drain queue at end of connection
299 * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
300 * happen that the TX queue is not empty at the end of a connection. We give the
301 * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
302 * returns with a non-empty write queue, it will be purged later.
303 */
304void dccp_flush_write_queue(struct sock *sk, long *time_budget)
258{ 305{
259 struct dccp_sock *dp = dccp_sk(sk); 306 struct dccp_sock *dp = dccp_sk(sk);
260 struct sk_buff *skb; 307 struct sk_buff *skb;
308 long delay, rc;
261 309
262 while ((skb = skb_peek(&sk->sk_write_queue))) { 310 while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
263 int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 311 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
264
265 if (err > 0) {
266 if (!block) {
267 sk_reset_timer(sk, &dp->dccps_xmit_timer,
268 msecs_to_jiffies(err)+jiffies);
269 break;
270 } else
271 err = dccp_wait_for_ccid(sk, skb, err);
272 if (err && err != -EINTR)
273 DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
274 }
275 312
276 skb_dequeue(&sk->sk_write_queue); 313 switch (ccid_packet_dequeue_eval(rc)) {
277 if (err == 0) { 314 case CCID_PACKET_WILL_DEQUEUE_LATER:
278 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 315 /*
279 const int len = skb->len; 316 * If the CCID determines when to send, the next sending
280 317 * time is unknown or the CCID may not even send again
281 if (sk->sk_state == DCCP_PARTOPEN) { 318 * (e.g. remote host crashes or lost Ack packets).
282 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; 319 * (e.g. the remote host crashes or Ack packets are lost).
283 /* 320 DCCP_WARN("CCID did not manage to send all packets\n");
284 * See 8.1.5 - Handshake Completion. 321 return;
285 * 322 case CCID_PACKET_DELAY:
286 * For robustness we resend Confirm options until the client has 323 delay = msecs_to_jiffies(rc);
287 * entered OPEN. During the initial feature negotiation, the MPS 324 if (delay > *time_budget)
288 * is smaller than usual, reduced by the Change/Confirm options. 325 return;
289 */ 326 rc = dccp_wait_for_ccid(sk, delay);
290 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { 327 if (rc < 0)
291 DCCP_WARN("Payload too large (%d) for featneg.\n", len); 328 return;
292 dccp_send_ack(sk); 329 *time_budget -= (delay - rc);
293 dccp_feat_list_purge(&dp->dccps_featneg); 330 /* check again if we can send now */
294 } 331 break;
295 332 case CCID_PACKET_SEND_AT_ONCE:
296 inet_csk_schedule_ack(sk); 333 dccp_xmit_packet(sk);
297 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 334 break;
298 inet_csk(sk)->icsk_rto, 335 case CCID_PACKET_ERR:
299 DCCP_RTO_MAX); 336 skb_dequeue(&sk->sk_write_queue);
300 dcb->dccpd_type = DCCP_PKT_DATAACK;
301 } else if (dccp_ack_pending(sk))
302 dcb->dccpd_type = DCCP_PKT_DATAACK;
303 else
304 dcb->dccpd_type = DCCP_PKT_DATA;
305
306 err = dccp_transmit_skb(sk, skb);
307 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
308 if (err)
309 DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
310 err);
311 } else {
312 dccp_pr_debug("packet discarded due to err=%d\n", err);
313 kfree_skb(skb); 337 kfree_skb(skb);
338 dccp_pr_debug("packet discarded due to err=%ld\n", rc);
339 }
340 }
341}
342
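The budget bookkeeping in dccp_flush_write_queue() relies on schedule_timeout() semantics: dccp_wait_for_ccid() returns the jiffies still remaining when the sleeper is woken early, so the budget is charged only for the time actually slept, i.e. delay minus the return value. A standalone model of that accounting with made-up numbers:

#include <stdio.h>

/* stand-in for dccp_wait_for_ccid(): returns ticks remaining on early wakeup */
static long wait_for_ccid(long delay, long remaining_on_wakeup)
{
	(void)delay;
	return remaining_on_wakeup;
}

int main(void)
{
	long time_budget = 100;			/* grace period from the caller */
	long delay = 60;			/* delay requested by the CCID */
	long rc = wait_for_ccid(delay, 10);	/* woken 10 ticks early */

	if (rc >= 0)
		time_budget -= (delay - rc);	/* charge the 50 ticks slept */
	printf("budget left: %ld\n", time_budget);	/* prints 50 */
	return 0;
}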
343void dccp_write_xmit(struct sock *sk)
344{
345 struct dccp_sock *dp = dccp_sk(sk);
346 struct sk_buff *skb;
347
348 while ((skb = dccp_qpolicy_top(sk))) {
349 int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
350
351 switch (ccid_packet_dequeue_eval(rc)) {
352 case CCID_PACKET_WILL_DEQUEUE_LATER:
353 return;
354 case CCID_PACKET_DELAY:
355 sk_reset_timer(sk, &dp->dccps_xmit_timer,
356 jiffies + msecs_to_jiffies(rc));
357 return;
358 case CCID_PACKET_SEND_AT_ONCE:
359 dccp_xmit_packet(sk);
360 break;
361 case CCID_PACKET_ERR:
362 dccp_qpolicy_drop(sk, skb);
363 dccp_pr_debug("packet discarded due to err=%d\n", rc);
314 } 364 }
315 } 365 }
316} 366}
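Both TX loops above funnel the CCID's raw return code through ccid_packet_dequeue_eval(), whose definition lies outside this diff. The call sites constrain its shape: negative codes are errors, zero means send at once, and a small positive value doubles as the delay in milliseconds (hence msecs_to_jiffies(rc) above), leaving larger sentinels for "will dequeue later". A plausible standalone decoder under those assumptions (the sentinel values below are guesses, not the kernel's):

#include <stdio.h>

enum ccid_dequeueing_decision {			/* assumed sentinel values */
	CCID_PACKET_SEND_AT_ONCE       = 0x00000,
	CCID_PACKET_DELAY              = 0x10000,
	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,
	CCID_PACKET_ERR                = 0x30000,
};

static int ccid_packet_dequeue_eval(int rc)
{
	if (rc < 0)
		return CCID_PACKET_ERR;
	if (rc == 0)
		return CCID_PACKET_SEND_AT_ONCE;
	if (rc <= 0xFFFF)
		return CCID_PACKET_DELAY;	/* rc itself is the delay in msecs */
	return rc;				/* already a sentinel from the CCID */
}

int main(void)
{
	int samples[] = { -22, 0, 250, CCID_PACKET_WILL_DEQUEUE_LATER };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("rc=%d -> 0x%x\n", samples[i],
		       (unsigned int)ccid_packet_dequeue_eval(samples[i]));
	return 0;
}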
@@ -474,8 +524,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
474/* 524/*
475 * Do all connect socket setups that can be done AF independent. 525 * Do all connect socket setups that can be done AF independent.
476 */ 526 */
477static inline void dccp_connect_init(struct sock *sk) 527int dccp_connect(struct sock *sk)
478{ 528{
529 struct sk_buff *skb;
479 struct dccp_sock *dp = dccp_sk(sk); 530 struct dccp_sock *dp = dccp_sk(sk);
480 struct dst_entry *dst = __sk_dst_get(sk); 531 struct dst_entry *dst = __sk_dst_get(sk);
481 struct inet_connection_sock *icsk = inet_csk(sk); 532 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -485,22 +536,12 @@ static inline void dccp_connect_init(struct sock *sk)
485 536
486 dccp_sync_mss(sk, dst_mtu(dst)); 537 dccp_sync_mss(sk, dst_mtu(dst));
487 538
488 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
489 dp->dccps_gar = dp->dccps_iss;
490
491 icsk->icsk_retransmits = 0;
492}
493
494int dccp_connect(struct sock *sk)
495{
496 struct sk_buff *skb;
497 struct inet_connection_sock *icsk = inet_csk(sk);
498
499 /* do not connect if feature negotiation setup fails */ 539 /* do not connect if feature negotiation setup fails */
500 if (dccp_feat_finalise_settings(dccp_sk(sk))) 540 if (dccp_feat_finalise_settings(dccp_sk(sk)))
501 return -EPROTO; 541 return -EPROTO;
502 542
503 dccp_connect_init(sk); 543 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
544 dp->dccps_gar = dp->dccps_iss;
504 545
505 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 546 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
506 if (unlikely(skb == NULL)) 547 if (unlikely(skb == NULL))
@@ -516,6 +557,7 @@ int dccp_connect(struct sock *sk)
516 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 557 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
517 558
518 /* Timer for repeating the REQUEST until an answer. */ 559 /* Timer for repeating the REQUEST until an answer. */
560 icsk->icsk_retransmits = 0;
519 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 561 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
520 icsk->icsk_rto, DCCP_RTO_MAX); 562 icsk->icsk_rto, DCCP_RTO_MAX);
521 return 0; 563 return 0;
@@ -602,6 +644,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
602 DCCP_SKB_CB(skb)->dccpd_type = pkt_type; 644 DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
603 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; 645 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
604 646
647 /*
648 * Clear the flag in case the Sync was scheduled for out-of-band data,
649 * such as carrying a long Ack Vector.
650 */
651 dccp_sk(sk)->dccps_sync_scheduled = 0;
652
605 dccp_transmit_skb(sk, skb); 653 dccp_transmit_skb(sk, skb);
606} 654}
607 655
@@ -630,7 +678,6 @@ void dccp_send_close(struct sock *sk, const int active)
630 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 678 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
631 679
632 if (active) { 680 if (active) {
633 dccp_write_xmit(sk, 1);
634 dccp_skb_entail(sk, skb); 681 dccp_skb_entail(sk, skb);
635 dccp_transmit_skb(sk, skb_clone(skb, prio)); 682 dccp_transmit_skb(sk, skb_clone(skb, prio));
636 /* 683 /*
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442fd..33d0e6297c21 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = dccpprobe_open, 150 .open = dccpprobe_open,
151 .read = dccpprobe_read, 151 .read = dccpprobe_read,
152 .llseek = noop_llseek,
152}; 153};
153 154
154static __init int dccpprobe_init(void) 155static __init int dccpprobe_init(void)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 096250d1323b..152975d942d9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
50/* the maximum queue length for tx in packets. 0 is no limit */ 50/* the maximum queue length for tx in packets. 0 is no limit */
51int sysctl_dccp_tx_qlen __read_mostly = 5; 51int sysctl_dccp_tx_qlen __read_mostly = 5;
52 52
53#ifdef CONFIG_IP_DCCP_DEBUG
54static const char *dccp_state_name(const int state)
55{
56 static const char *const dccp_state_names[] = {
57 [DCCP_OPEN] = "OPEN",
58 [DCCP_REQUESTING] = "REQUESTING",
59 [DCCP_PARTOPEN] = "PARTOPEN",
60 [DCCP_LISTEN] = "LISTEN",
61 [DCCP_RESPOND] = "RESPOND",
62 [DCCP_CLOSING] = "CLOSING",
63 [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
64 [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
65 [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
66 [DCCP_TIME_WAIT] = "TIME_WAIT",
67 [DCCP_CLOSED] = "CLOSED",
68 };
69
70 if (state >= DCCP_MAX_STATES)
71 return "INVALID STATE!";
72 else
73 return dccp_state_names[state];
74}
75#endif
76
53void dccp_set_state(struct sock *sk, const int state) 77void dccp_set_state(struct sock *sk, const int state)
54{ 78{
55 const int oldstate = sk->sk_state; 79 const int oldstate = sk->sk_state;
@@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type)
146 170
147EXPORT_SYMBOL_GPL(dccp_packet_name); 171EXPORT_SYMBOL_GPL(dccp_packet_name);
148 172
149const char *dccp_state_name(const int state)
150{
151 static const char *const dccp_state_names[] = {
152 [DCCP_OPEN] = "OPEN",
153 [DCCP_REQUESTING] = "REQUESTING",
154 [DCCP_PARTOPEN] = "PARTOPEN",
155 [DCCP_LISTEN] = "LISTEN",
156 [DCCP_RESPOND] = "RESPOND",
157 [DCCP_CLOSING] = "CLOSING",
158 [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
159 [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
160 [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
161 [DCCP_TIME_WAIT] = "TIME_WAIT",
162 [DCCP_CLOSED] = "CLOSED",
163 };
164
165 if (state >= DCCP_MAX_STATES)
166 return "INVALID STATE!";
167 else
168 return dccp_state_names[state];
169}
170
171EXPORT_SYMBOL_GPL(dccp_state_name);
172
173int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 173int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
174{ 174{
175 struct dccp_sock *dp = dccp_sk(sk); 175 struct dccp_sock *dp = dccp_sk(sk);
@@ -185,6 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
185 dp->dccps_role = DCCP_ROLE_UNDEFINED; 185 dp->dccps_role = DCCP_ROLE_UNDEFINED;
186 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; 186 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
187 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; 187 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
188 dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
188 189
189 dccp_init_xmit_timers(sk); 190 dccp_init_xmit_timers(sk);
190 191
@@ -532,6 +533,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
532 case DCCP_SOCKOPT_RECV_CSCOV: 533 case DCCP_SOCKOPT_RECV_CSCOV:
533 err = dccp_setsockopt_cscov(sk, val, true); 534 err = dccp_setsockopt_cscov(sk, val, true);
534 break; 535 break;
536 case DCCP_SOCKOPT_QPOLICY_ID:
537 if (sk->sk_state != DCCP_CLOSED)
538 err = -EISCONN;
539 else if (val < 0 || val >= DCCPQ_POLICY_MAX)
540 err = -EINVAL;
541 else
542 dp->dccps_qpolicy = val;
543 break;
544 case DCCP_SOCKOPT_QPOLICY_TXQLEN:
545 if (val < 0)
546 err = -EINVAL;
547 else
548 dp->dccps_tx_qlen = val;
549 break;
535 default: 550 default:
536 err = -ENOPROTOOPT; 551 err = -ENOPROTOOPT;
537 break; 552 break;
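
For illustration, a userspace fragment exercising the two new socket options could look as follows. This is a sketch only: the option and policy names are taken from this patch, and SOL_DCCP is defined locally in case the libc headers lack it.

    #include <sys/socket.h>
    #include <linux/dccp.h>

    #ifndef SOL_DCCP
    #define SOL_DCCP 269                    /* value from <linux/socket.h> */
    #endif

    /* Select the priority-based dequeueing policy on a not-yet-connected
     * DCCP socket and cap the TX queue at 64 packets. */
    static int set_prio_qpolicy(int fd)
    {
            int policy = DCCPQ_POLICY_PRIO;
            int qlen   = 64;

            /* Fails with EISCONN unless the socket is still in DCCP_CLOSED. */
            if (setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_ID,
                           &policy, sizeof(policy)) < 0)
                    return -1;
            return setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_TXQLEN,
                              &qlen, sizeof(qlen));
    }

Both values can be read back via the matching getsockopt cases in the next hunk.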
@@ -639,6 +654,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
639 case DCCP_SOCKOPT_RECV_CSCOV: 654 case DCCP_SOCKOPT_RECV_CSCOV:
640 val = dp->dccps_pcrlen; 655 val = dp->dccps_pcrlen;
641 break; 656 break;
657 case DCCP_SOCKOPT_QPOLICY_ID:
658 val = dp->dccps_qpolicy;
659 break;
660 case DCCP_SOCKOPT_QPOLICY_TXQLEN:
661 val = dp->dccps_tx_qlen;
662 break;
642 case 128 ... 191: 663 case 128 ... 191:
643 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, 664 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
644 len, (u32 __user *)optval, optlen); 665 len, (u32 __user *)optval, optlen);
@@ -681,6 +702,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
681EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); 702EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
682#endif 703#endif
683 704
705static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
706{
707 struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
708
709 /*
710 * Assign an (opaque) qpolicy priority value to skb->priority.
711 *
712 * We are overloading this skb field for use with the qpolicy subsystem.
713 * The skb->priority is normally used for the SO_PRIORITY option, which
714 * is initialised from sk_priority. Since the assignment of sk_priority
715 * to skb->priority happens later (on layer 3), we overload this field
716 * for use with queueing priorities as long as the skb is on layer 4.
717 * The default priority value (if nothing is set) is 0.
718 */
719 skb->priority = 0;
720
721 for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
722
723 if (!CMSG_OK(msg, cmsg))
724 return -EINVAL;
725
726 if (cmsg->cmsg_level != SOL_DCCP)
727 continue;
728
729 if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
730 !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
731 return -EINVAL;
732
733 switch (cmsg->cmsg_type) {
734 case DCCP_SCM_PRIORITY:
735 if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
736 return -EINVAL;
737 skb->priority = *(__u32 *)CMSG_DATA(cmsg);
738 break;
739 default:
740 return -EINVAL;
741 }
742 }
743 return 0;
744}
745
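Seen from userspace, dccp_msghdr_parse() means a per-packet priority is supplied as a DCCP_SCM_PRIORITY control message; with the simple policy selected, such a cmsg is rejected with -EINVAL by the dccp_qpolicy_param_ok() check. A hedged sketch for a connected socket (as in the previous sketch, SOL_DCCP may need defining locally; the helper name is ours):

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/dccp.h>

    /* Send len bytes from data with the given qpolicy priority attached. */
    static ssize_t dccp_send_prio(int fd, const void *data, size_t len, __u32 prio)
    {
            char cbuf[CMSG_SPACE(sizeof(prio))];
            struct iovec iov = { .iov_base = (void *)data, .iov_len = len };
            struct msghdr msg = { 0 };
            struct cmsghdr *cmsg;

            msg.msg_iov        = &iov;
            msg.msg_iovlen     = 1;
            msg.msg_control    = cbuf;
            msg.msg_controllen = sizeof(cbuf);

            cmsg             = CMSG_FIRSTHDR(&msg);
            cmsg->cmsg_level = SOL_DCCP;
            cmsg->cmsg_type  = DCCP_SCM_PRIORITY;
            cmsg->cmsg_len   = CMSG_LEN(sizeof(prio));  /* exact length checked by the parser */
            memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

            return sendmsg(fd, &msg, 0);
    }
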
684int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 746int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
685 size_t len) 747 size_t len)
686{ 748{
@@ -696,8 +758,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
696 758
697 lock_sock(sk); 759 lock_sock(sk);
698 760
699 if (sysctl_dccp_tx_qlen && 761 if (dccp_qpolicy_full(sk)) {
700 (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
701 rc = -EAGAIN; 762 rc = -EAGAIN;
702 goto out_release; 763 goto out_release;
703 } 764 }
@@ -725,8 +786,18 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
725 if (rc != 0) 786 if (rc != 0)
726 goto out_discard; 787 goto out_discard;
727 788
728 skb_queue_tail(&sk->sk_write_queue, skb); 789 rc = dccp_msghdr_parse(msg, skb);
729 dccp_write_xmit(sk,0); 790 if (rc != 0)
791 goto out_discard;
792
793 dccp_qpolicy_push(sk, skb);
794 /*
795 * The xmit_timer is set if the TX CCID is rate-based and will expire
796 * when congestion control permits releasing further packets into the
797 * network. Window-based CCIDs do not use this timer.
798 */
799 if (!timer_pending(&dp->dccps_xmit_timer))
800 dccp_write_xmit(sk);
730out_release: 801out_release:
731 release_sock(sk); 802 release_sock(sk);
732 return rc ? : len; 803 return rc ? : len;
@@ -944,16 +1015,29 @@ void dccp_close(struct sock *sk, long timeout)
944 1015
945 if (data_was_unread) { 1016 if (data_was_unread) {
946 /* Unread data was tossed, send an appropriate Reset Code */ 1017 /* Unread data was tossed, send an appropriate Reset Code */
947 DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread); 1018 DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
948 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); 1019 dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
949 dccp_set_state(sk, DCCP_CLOSED); 1020 dccp_set_state(sk, DCCP_CLOSED);
950 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 1021 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
951 /* Check zero linger _after_ checking for unread data. */ 1022 /* Check zero linger _after_ checking for unread data. */
952 sk->sk_prot->disconnect(sk, 0); 1023 sk->sk_prot->disconnect(sk, 0);
953 } else if (sk->sk_state != DCCP_CLOSED) { 1024 } else if (sk->sk_state != DCCP_CLOSED) {
1025 /*
1026 * Normal connection termination. May need to wait if there are
1027 * still packets in the TX queue that are delayed by the CCID.
1028 */
1029 dccp_flush_write_queue(sk, &timeout);
954 dccp_terminate_connection(sk); 1030 dccp_terminate_connection(sk);
955 } 1031 }
956 1032
1033 /*
1034 * Flush write queue. This may be necessary in several cases:
1035 * - we have been closed by the peer but still have application data;
1036 * - abortive termination (unread data or zero linger time);
1037 * - normal termination, but the queue could not be flushed within the time limit.
1038 */
1039 __skb_queue_purge(&sk->sk_write_queue);
1040
957 sk_stream_wait_close(sk, timeout); 1041 sk_stream_wait_close(sk, timeout);
958 1042
959adjudge_to_death: 1043adjudge_to_death:
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 000000000000..63c30bfa4703
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
1/*
2 * net/dccp/qpolicy.c
3 *
4 * Policy-based packet dequeueing interface for DCCP.
5 *
6 * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License v2
10 * as published by the Free Software Foundation.
11 */
12#include "dccp.h"
13
14/*
15 * Simple Dequeueing Policy:
16 * If tx_qlen is non-zero, enqueue at most tx_qlen elements.
17 */
18static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
19{
20 skb_queue_tail(&sk->sk_write_queue, skb);
21}
22
23static bool qpolicy_simple_full(struct sock *sk)
24{
25 return dccp_sk(sk)->dccps_tx_qlen &&
26 sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
27}
28
29static struct sk_buff *qpolicy_simple_top(struct sock *sk)
30{
31 return skb_peek(&sk->sk_write_queue);
32}
33
34/*
35 * Priority-based Dequeueing Policy:
36 * If tx_qlen is non-zero and the queue has reached its upper bound of
37 * tx_qlen elements, evict queued packets lowest-priority-first to make room.
38 */
39static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
40{
41 struct sk_buff *skb, *best = NULL;
42
43 skb_queue_walk(&sk->sk_write_queue, skb)
44 if (best == NULL || skb->priority > best->priority)
45 best = skb;
46 return best;
47}
48
49static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
50{
51 struct sk_buff *skb, *worst = NULL;
52
53 skb_queue_walk(&sk->sk_write_queue, skb)
54 if (worst == NULL || skb->priority < worst->priority)
55 worst = skb;
56 return worst;
57}
58
59static bool qpolicy_prio_full(struct sock *sk)
60{
61 if (qpolicy_simple_full(sk))
62 dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
63 return false;
64}
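
Note the inversion here: the priority policy never reports the queue as full. Instead, full() makes room by unlinking the currently lowest-priority packet, so dccp_sendmsg() can always admit the new skb. A worked scenario (assuming dccps_tx_qlen = 3):

    /* TX queue holds skbs with priorities 7, 5, 2  ->  queue is at tx_qlen.
     * Application sends a packet with priority 9:
     *   qpolicy_prio_full()   drops the prio-2 skb and returns false;
     *   qpolicy_simple_push() then appends the prio-9 skb.
     * The new skb is admitted even when its priority is the lowest of all;
     * it then simply becomes the first candidate for the next eviction.
     */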
65
66/**
67 * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
68 * @push: add a new @skb to the write queue
69 * @full: indicates that no more packets will be admitted
70 * @top: peeks at whatever the queueing policy defines as its `top'
71 */
72static struct dccp_qpolicy_operations {
73 void (*push) (struct sock *sk, struct sk_buff *skb);
74 bool (*full) (struct sock *sk);
75 struct sk_buff* (*top) (struct sock *sk);
76 __be32 params;
77
78} qpol_table[DCCPQ_POLICY_MAX] = {
79 [DCCPQ_POLICY_SIMPLE] = {
80 .push = qpolicy_simple_push,
81 .full = qpolicy_simple_full,
82 .top = qpolicy_simple_top,
83 .params = 0,
84 },
85 [DCCPQ_POLICY_PRIO] = {
86 .push = qpolicy_simple_push,
87 .full = qpolicy_prio_full,
88 .top = qpolicy_prio_best_skb,
89 .params = DCCP_SCM_PRIORITY,
90 },
91};
92
93/*
94 * Externally visible interface
95 */
96void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
97{
98 qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
99}
100
101bool dccp_qpolicy_full(struct sock *sk)
102{
103 return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
104}
105
106void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
107{
108 if (skb != NULL) {
109 skb_unlink(skb, &sk->sk_write_queue);
110 kfree_skb(skb);
111 }
112}
113
114struct sk_buff *dccp_qpolicy_top(struct sock *sk)
115{
116 return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
117}
118
119struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
120{
121 struct sk_buff *skb = dccp_qpolicy_top(sk);
122
123 if (skb != NULL) {
124 /* Clear any skb fields that we used internally */
125 skb->priority = 0;
126 skb_unlink(skb, &sk->sk_write_queue);
127 }
128 return skb;
129}
130
131bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
132{
133 /* check if exactly one bit is set */
134 if (!param || (param & (param - 1)))
135 return false;
136 return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
137}
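
The single-bit test in dccp_qpolicy_param_ok() relies on the usual idiom that param & (param - 1) clears the lowest set bit, so it yields zero exactly when at most one bit is set; the !param test screens out zero. Worked examples:

    /* param = 0x4: 0x4 & 0x3 == 0x0  -> exactly one bit set, accepted
     * param = 0x6: 0x6 & 0x5 == 0x4  -> two bits set,        rejected
     * param = 0x0:                      rejected up front by !param
     */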
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 563943822e58..42348824ee31 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -21,7 +21,8 @@
21/* Boundary values */ 21/* Boundary values */
22static int zero = 0, 22static int zero = 0,
23 u8_max = 0xFF; 23 u8_max = 0xFF;
24static unsigned long seqw_min = 32; 24static unsigned long seqw_min = DCCPF_SEQ_WMIN,
25 seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
25 26
26static struct ctl_table dccp_default_table[] = { 27static struct ctl_table dccp_default_table[] = {
27 { 28 {
@@ -31,6 +32,7 @@ static struct ctl_table dccp_default_table[] = {
31 .mode = 0644, 32 .mode = 0644,
32 .proc_handler = proc_doulongvec_minmax, 33 .proc_handler = proc_doulongvec_minmax,
33 .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ 34 .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
35 .extra2 = &seqw_max,
34 }, 36 },
35 { 37 {
36 .procname = "rx_ccid", 38 .procname = "rx_ccid",
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 1a9aa05d4dc4..7587870b7040 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -237,32 +237,35 @@ out:
237 sock_put(sk); 237 sock_put(sk);
238} 238}
239 239
240/* Transmit-delay timer: used by the CCIDs to delay actual send time */ 240/**
241static void dccp_write_xmit_timer(unsigned long data) 241 * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
242 * See the comments above %ccid_dequeueing_decision for supported modes.
243 */
244static void dccp_write_xmitlet(unsigned long data)
242{ 245{
243 struct sock *sk = (struct sock *)data; 246 struct sock *sk = (struct sock *)data;
244 struct dccp_sock *dp = dccp_sk(sk);
245 247
246 bh_lock_sock(sk); 248 bh_lock_sock(sk);
247 if (sock_owned_by_user(sk)) 249 if (sock_owned_by_user(sk))
248 sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); 250 sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
249 else 251 else
250 dccp_write_xmit(sk, 0); 252 dccp_write_xmit(sk);
251 bh_unlock_sock(sk); 253 bh_unlock_sock(sk);
252 sock_put(sk);
253} 254}
254 255
255static void dccp_init_write_xmit_timer(struct sock *sk) 256static void dccp_write_xmit_timer(unsigned long data)
256{ 257{
257 struct dccp_sock *dp = dccp_sk(sk); 258 dccp_write_xmitlet(data);
258 259 sock_put((struct sock *)data);
259 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
260 (unsigned long)sk);
261} 260}
262 261
263void dccp_init_xmit_timers(struct sock *sk) 262void dccp_init_xmit_timers(struct sock *sk)
264{ 263{
265 dccp_init_write_xmit_timer(sk); 264 struct dccp_sock *dp = dccp_sk(sk);
265
266 tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
267 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
268 (unsigned long)sk);
266 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 269 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
267 &dccp_keepalive_timer); 270 &dccp_keepalive_timer);
268} 271}
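
Splitting the old timer into a tasklet plus a thin timer wrapper lets a CCID restart dequeueing directly from its softirq receive path, without arming a timer or taking the timer's extra socket reference. A sketch of such a kick (dccps_xmitlet is from this patch; the surrounding handler and the sender_was_blocked bookkeeping are assumptions modelled on the CCID-2 changes in this merge):

    /* e.g. at the end of a CCID's hc_tx_packet_recv() callback: */
    if (sender_was_blocked)
            /* tx send_packet() had returned CCID_PACKET_WILL_DEQUEUE_LATER */
            tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);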