author     David S. Miller <davem@davemloft.net>   2008-09-08 20:28:59 -0400
committer  David S. Miller <davem@davemloft.net>   2008-09-08 20:28:59 -0400
commit     0a68a20cc3eafa73bb54097c28b921147d7d3685 (patch)
tree       8e5f315226b618cb8e050a0c7653c8ec134501e3
parent     17dce5dfe38ae2fb359b61e855f5d8a3a8b7892b (diff)
parent     a3cbdde8e9c38b66b4f13ac5d6ff1939ded0ff20 (diff)
Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp
Conflicts:
	net/dccp/input.c
	net/dccp/options.c
-rw-r--r--  Documentation/networking/dccp.txt    |   54
-rw-r--r--  include/linux/dccp.h                 |  122
-rw-r--r--  include/net/tcp.h                    |   15
-rw-r--r--  net/dccp/Kconfig                     |    3
-rw-r--r--  net/dccp/Makefile                    |    5
-rw-r--r--  net/dccp/ackvec.c                    |  619
-rw-r--r--  net/dccp/ackvec.h                    |  204
-rw-r--r--  net/dccp/ccid.c                      |  101
-rw-r--r--  net/dccp/ccid.h                      |  113
-rw-r--r--  net/dccp/ccids/Kconfig               |   30
-rw-r--r--  net/dccp/ccids/ccid2.c               |  622
-rw-r--r--  net/dccp/ccids/ccid2.h               |   63
-rw-r--r--  net/dccp/ccids/ccid3.c               |  762
-rw-r--r--  net/dccp/ccids/ccid3.h               |  153
-rw-r--r--  net/dccp/ccids/lib/loss_interval.c   |   30
-rw-r--r--  net/dccp/ccids/lib/loss_interval.h   |    4
-rw-r--r--  net/dccp/ccids/lib/packet_history.c  |  282
-rw-r--r--  net/dccp/ccids/lib/packet_history.h  |   78
-rw-r--r--  net/dccp/ccids/lib/tfrc.h            |   16
-rw-r--r--  net/dccp/ccids/lib/tfrc_equation.c   |   29
-rw-r--r--  net/dccp/dccp.h                      |  104
-rw-r--r--  net/dccp/diag.c                      |    2
-rw-r--r--  net/dccp/feat.c                      | 1805
-rw-r--r--  net/dccp/feat.h                      |  144
-rw-r--r--  net/dccp/input.c                     |  164
-rw-r--r--  net/dccp/ipv4.c                      |    4
-rw-r--r--  net/dccp/ipv6.c                      |    4
-rw-r--r--  net/dccp/minisocks.c                 |   87
-rw-r--r--  net/dccp/options.c                   |  341
-rw-r--r--  net/dccp/output.c                    |  279
-rw-r--r--  net/dccp/probe.c                     |   75
-rw-r--r--  net/dccp/proto.c                     |  281
-rw-r--r--  net/dccp/qpolicy.c                   |  137
-rw-r--r--  net/dccp/sysctl.c                    |   64
-rw-r--r--  net/dccp/timer.c                     |   42
-rw-r--r--  net/ipv4/tcp_input.c                 |   17
36 files changed, 3971 insertions, 2884 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 39131a3c78f8..fcfc12534428 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree
45 45
46Socket options 46Socket options
47============== 47==============
48DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes
 49a policy ID as argument and can only be set before the connection is established
 50(changes during an established connection are not supported). Currently, two policies are
51defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special,
 52and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows passing a
53u32 priority value as ancillary data to sendmsg(), where higher numbers indicate
54a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to
55be formatted using a cmsg(3) message header filled in as follows:
56 cmsg->cmsg_level = SOL_DCCP;
57 cmsg->cmsg_type = DCCP_SCM_PRIORITY;
58 cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */
59
60DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero
61value is always interpreted as unbounded queue length. If different from zero,
62the interpretation of this parameter depends on the current dequeuing policy
63(see above): the "simple" policy will enforce a fixed queue size by returning
64EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the
65lowest-priority packet first. The default value for this parameter is
66initialised from /proc/sys/net/dccp/default/tx_qlen.
48 67
49DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of 68DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
50service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, 69service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
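A minimal userspace sketch of the queueing-policy interface documented above
(assumptions: SOL_DCCP and the DCCP_* constants come from the patched
<linux/dccp.h>; the helper names and the abbreviated error handling are
illustrative only, not part of this patch):

    #include <string.h>
    #include <stdint.h>
    #include <sys/uio.h>
    #include <sys/socket.h>
    #include <linux/dccp.h>

    /* Select the priority-based dequeueing policy (before connecting). */
    static int use_prio_policy(int sk)
    {
            int policy = DCCPQ_POLICY_PRIO;

            return setsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_ID,
                              &policy, sizeof(policy));
    }

    /* Send one packet, passing a u32 priority as ancillary data. */
    static ssize_t send_with_prio(int sk, void *data, size_t len, uint32_t prio)
    {
            char cbuf[CMSG_SPACE(sizeof(prio))];
            struct iovec iov = { .iov_base = data, .iov_len = len };
            struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                                  .msg_control = cbuf,
                                  .msg_controllen = sizeof(cbuf) };
            struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

            cmsg->cmsg_level = SOL_DCCP;
            cmsg->cmsg_type  = DCCP_SCM_PRIORITY;
            cmsg->cmsg_len   = CMSG_LEN(sizeof(prio));
            memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));
            return sendmsg(sk, &msg, 0);
    }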
@@ -57,6 +76,24 @@ can be set before calling bind().
57DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet 76DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
58size (application payload size) in bytes, see RFC 4340, section 14. 77size (application payload size) in bytes, see RFC 4340, section 14.
59 78
79DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
80supported by the endpoint (see include/linux/dccp.h for symbolic constants).
81The caller needs to provide a sufficiently large (> 2) array of type uint8_t.
82
83DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same
84time, combining the operation of the next two socket options. This option is
 85preferable to the latter two, since applications will often use the same
 86type of CCID for both directions, and mixed use of CCIDs is not currently well
87understood. This socket option takes as argument at least one uint8_t value, or
 88an array of uint8_t values, which must match available CCIDs (see above). CCIDs
89must be registered on the socket before calling connect() or listen().
90
91DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets
92the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID.
93Please note that the getsockopt argument type here is `int', not uint8_t.
94
95DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID.
96
60DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold 97DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
61timewait state when closing the connection (RFC 4340, 8.3). The usual case is 98timewait state when closing the connection (RFC 4340, 8.3). The usual case is
62that the closing server sends a CloseReq, whereupon the client holds timewait 99that the closing server sends a CloseReq, whereupon the client holds timewait
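Taken together, a client might drive the new CCID socket options roughly as
follows (illustrative sketch; error handling elided, constants from the
patched headers):

    uint8_t prefs[] = { DCCPC_CCID3, DCCPC_CCID2 };   /* preference order */
    int tx_ccid;
    socklen_t olen = sizeof(tx_ccid);

    /* Register both half-connection preference lists before connect(). */
    setsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_CCID, prefs, sizeof(prefs));

    /* After the handshake, read back the negotiated TX CCID; note the
     * `int' result type, as pointed out above. */
    getsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_TX_CCID, &tx_ccid, &olen);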
@@ -115,23 +152,16 @@ retries2
115 importance for retransmitted acknowledgments and feature negotiation, 152 importance for retransmitted acknowledgments and feature negotiation,
116 data packets are never retransmitted. Analogue of tcp_retries2. 153 data packets are never retransmitted. Analogue of tcp_retries2.
117 154
118send_ndp = 1
119 Whether or not to send NDP count options (sec. 7.7.2).
120
121send_ackvec = 1
122 Whether or not to send Ack Vector options (sec. 11.5).
123
124ack_ratio = 2
125 The default Ack Ratio (sec. 11.3) to use.
126
127tx_ccid = 2 155tx_ccid = 2
128 Default CCID for the sender-receiver half-connection. 156 Default CCID for the sender-receiver half-connection. Depending on the
157 choice of CCID, the Send Ack Vector feature is enabled automatically.
129 158
130rx_ccid = 2 159rx_ccid = 2
131 Default CCID for the receiver-sender half-connection. 160 Default CCID for the receiver-sender half-connection; see tx_ccid.
132 161
133seq_window = 100 162seq_window = 100
134 The initial sequence window (sec. 7.5.2). 163 The initial sequence window (sec. 7.5.2) of the sender. This influences
164 the local ackno validity and the remote seqno validity windows (7.5.1).
135 165
136tx_qlen = 5 166tx_qlen = 5
137 The size of the transmit buffer in packets. A value of 0 corresponds 167 The size of the transmit buffer in packets. A value of 0 corresponds
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 6080449fbec9..010e2d87ed75 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -165,9 +165,13 @@ enum {
165 DCCPO_TIMESTAMP_ECHO = 42, 165 DCCPO_TIMESTAMP_ECHO = 42,
166 DCCPO_ELAPSED_TIME = 43, 166 DCCPO_ELAPSED_TIME = 43,
167 DCCPO_MAX = 45, 167 DCCPO_MAX = 45,
168 DCCPO_MIN_CCID_SPECIFIC = 128, 168 DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */
169 DCCPO_MAX_CCID_SPECIFIC = 255, 169 DCCPO_MAX_RX_CCID_SPECIFIC = 191,
170 DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */
171 DCCPO_MAX_TX_CCID_SPECIFIC = 255,
170}; 172};
173/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */
174#define DCCP_SINGLE_OPT_MAXLEN 253
171 175
172/* DCCP CCIDS */ 176/* DCCP CCIDS */
173enum { 177enum {
@@ -176,27 +180,36 @@ enum {
176}; 180};
177 181
178/* DCCP features (RFC 4340 section 6.4) */ 182/* DCCP features (RFC 4340 section 6.4) */
179enum { 183enum dccp_feature_numbers {
180 DCCPF_RESERVED = 0, 184 DCCPF_RESERVED = 0,
181 DCCPF_CCID = 1, 185 DCCPF_CCID = 1,
182 DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ 186 DCCPF_SHORT_SEQNOS = 2,
183 DCCPF_SEQUENCE_WINDOW = 3, 187 DCCPF_SEQUENCE_WINDOW = 3,
184 DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ 188 DCCPF_ECN_INCAPABLE = 4,
185 DCCPF_ACK_RATIO = 5, 189 DCCPF_ACK_RATIO = 5,
186 DCCPF_SEND_ACK_VECTOR = 6, 190 DCCPF_SEND_ACK_VECTOR = 6,
187 DCCPF_SEND_NDP_COUNT = 7, 191 DCCPF_SEND_NDP_COUNT = 7,
188 DCCPF_MIN_CSUM_COVER = 8, 192 DCCPF_MIN_CSUM_COVER = 8,
189 DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ 193 DCCPF_DATA_CHECKSUM = 9,
190 /* 10-127 reserved */ 194 /* 10-127 reserved */
191 DCCPF_MIN_CCID_SPECIFIC = 128, 195 DCCPF_MIN_CCID_SPECIFIC = 128,
196 DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */
192 DCCPF_MAX_CCID_SPECIFIC = 255, 197 DCCPF_MAX_CCID_SPECIFIC = 255,
193}; 198};
194 199
195/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ 200/* DCCP socket control message types for cmsg */
196struct dccp_so_feat { 201enum dccp_cmsg_type {
197 __u8 dccpsf_feat; 202 DCCP_SCM_PRIORITY = 1,
198 __u8 __user *dccpsf_val; 203 DCCP_SCM_QPOLICY_MAX = 0xFFFF,
199 __u8 dccpsf_len; 204 /* ^-- Up to here reserved exclusively for qpolicy parameters */
205 DCCP_SCM_MAX
206};
207
208/* DCCP priorities for outgoing/queued packets */
209enum dccp_packet_dequeueing_policy {
210 DCCPQ_POLICY_SIMPLE,
211 DCCPQ_POLICY_PRIO,
212 DCCPQ_POLICY_MAX
200}; 213};
201 214
202/* DCCP socket options */ 215/* DCCP socket options */
@@ -208,6 +221,12 @@ struct dccp_so_feat {
208#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 221#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6
209#define DCCP_SOCKOPT_SEND_CSCOV 10 222#define DCCP_SOCKOPT_SEND_CSCOV 10
210#define DCCP_SOCKOPT_RECV_CSCOV 11 223#define DCCP_SOCKOPT_RECV_CSCOV 11
224#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12
225#define DCCP_SOCKOPT_CCID 13
226#define DCCP_SOCKOPT_TX_CCID 14
227#define DCCP_SOCKOPT_RX_CCID 15
228#define DCCP_SOCKOPT_QPOLICY_ID 16
229#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17
211#define DCCP_SOCKOPT_CCID_RX_INFO 128 230#define DCCP_SOCKOPT_CCID_RX_INFO 128
212#define DCCP_SOCKOPT_CCID_TX_INFO 192 231#define DCCP_SOCKOPT_CCID_TX_INFO 192
213 232
@@ -355,62 +374,13 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
355 return __dccp_hdr_len(dccp_hdr(skb)); 374 return __dccp_hdr_len(dccp_hdr(skb));
356} 375}
357 376
358
359/* initial values for each feature */
360#define DCCPF_INITIAL_SEQUENCE_WINDOW 100
361#define DCCPF_INITIAL_ACK_RATIO 2
362#define DCCPF_INITIAL_CCID DCCPC_CCID2
363#define DCCPF_INITIAL_SEND_ACK_VECTOR 1
364/* FIXME: for now we're default to 1 but it should really be 0 */
365#define DCCPF_INITIAL_SEND_NDP_COUNT 1
366
367/**
368 * struct dccp_minisock - Minimal DCCP connection representation
369 *
370 * Will be used to pass the state from dccp_request_sock to dccp_sock.
371 *
372 * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
373 * @dccpms_ccid - Congestion Control Id (CCID) (section 10)
374 * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
375 * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
376 * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3)
377 * @dccpms_pending - List of features being negotiated
378 * @dccpms_conf -
379 */
380struct dccp_minisock {
381 __u64 dccpms_sequence_window;
382 __u8 dccpms_rx_ccid;
383 __u8 dccpms_tx_ccid;
384 __u8 dccpms_send_ack_vector;
385 __u8 dccpms_send_ndp_count;
386 __u8 dccpms_ack_ratio;
387 struct list_head dccpms_pending;
388 struct list_head dccpms_conf;
389};
390
391struct dccp_opt_conf {
392 __u8 *dccpoc_val;
393 __u8 dccpoc_len;
394};
395
396struct dccp_opt_pend {
397 struct list_head dccpop_node;
398 __u8 dccpop_type;
399 __u8 dccpop_feat;
400 __u8 *dccpop_val;
401 __u8 dccpop_len;
402 int dccpop_conf;
403 struct dccp_opt_conf *dccpop_sc;
404};
405
406extern void dccp_minisock_init(struct dccp_minisock *dmsk);
407
408/** 377/**
409 * struct dccp_request_sock - represent DCCP-specific connection request 378 * struct dccp_request_sock - represent DCCP-specific connection request
410 * @dreq_inet_rsk: structure inherited from 379 * @dreq_inet_rsk: structure inherited from
411 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) 380 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1)
412 * @dreq_isr: initial sequence number received on the Request 381 * @dreq_isr: initial sequence number received on the Request
413 * @dreq_service: service code present on the Request (there is just one) 382 * @dreq_service: service code present on the Request (there is just one)
383 * @dreq_featneg: feature negotiation options for this connection
414 * The following two fields are analogous to the ones in dccp_sock: 384 * The following two fields are analogous to the ones in dccp_sock:
415 * @dreq_timestamp_echo: last received timestamp to echo (13.1) 385 * @dreq_timestamp_echo: last received timestamp to echo (13.1)
416 * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo 386 * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo
@@ -420,6 +390,7 @@ struct dccp_request_sock {
420 __u64 dreq_iss; 390 __u64 dreq_iss;
421 __u64 dreq_isr; 391 __u64 dreq_isr;
422 __be32 dreq_service; 392 __be32 dreq_service;
393 struct list_head dreq_featneg;
423 __u32 dreq_timestamp_echo; 394 __u32 dreq_timestamp_echo;
424 __u32 dreq_timestamp_time; 395 __u32 dreq_timestamp_time;
425}; 396};
@@ -491,21 +462,28 @@ struct dccp_ackvec;
491 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo 462 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo
492 * @dccps_l_ack_ratio - feature-local Ack Ratio 463 * @dccps_l_ack_ratio - feature-local Ack Ratio
493 * @dccps_r_ack_ratio - feature-remote Ack Ratio 464 * @dccps_r_ack_ratio - feature-remote Ack Ratio
465 * @dccps_l_seq_win - local Sequence Window (influences ack number validity)
466 * @dccps_r_seq_win - remote Sequence Window (influences seq number validity)
494 * @dccps_pcslen - sender partial checksum coverage (via sockopt) 467 * @dccps_pcslen - sender partial checksum coverage (via sockopt)
495 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) 468 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt)
469 * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2)
496 * @dccps_ndp_count - number of Non Data Packets since last data packet 470 * @dccps_ndp_count - number of Non Data Packets since last data packet
497 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) 471 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
498 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) 472 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
499 * @dccps_minisock - associated minisock (accessed via dccp_msk) 473 * @dccps_featneg - tracks feature-negotiation state (mostly during handshake)
500 * @dccps_hc_rx_ackvec - rx half connection ack vector 474 * @dccps_hc_rx_ackvec - rx half connection ack vector
501 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) 475 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection)
502 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) 476 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection)
503 * @dccps_options_received - parsed set of retrieved options 477 * @dccps_options_received - parsed set of retrieved options
478 * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy
479 * @dccps_tx_qlen - maximum length of the TX queue
504 * @dccps_role - role of this sock, one of %dccp_role 480 * @dccps_role - role of this sock, one of %dccp_role
505 * @dccps_hc_rx_insert_options - receiver wants to add options when acking 481 * @dccps_hc_rx_insert_options - receiver wants to add options when acking
506 * @dccps_hc_tx_insert_options - sender wants to add options when sending 482 * @dccps_hc_tx_insert_options - sender wants to add options when sending
507 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) 483 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3)
508 * @dccps_xmit_timer - timer for when CCID is not ready to send 484 * @dccps_sync_scheduled - flag which signals "send out-of-band message soon"
485 * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets
486 * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing)
509 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) 487 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs)
510 */ 488 */
511struct dccp_sock { 489struct dccp_sock {
@@ -529,19 +507,26 @@ struct dccp_sock {
529 __u32 dccps_timestamp_time; 507 __u32 dccps_timestamp_time;
530 __u16 dccps_l_ack_ratio; 508 __u16 dccps_l_ack_ratio;
531 __u16 dccps_r_ack_ratio; 509 __u16 dccps_r_ack_ratio;
532 __u16 dccps_pcslen; 510 __u64 dccps_l_seq_win:48;
533 __u16 dccps_pcrlen; 511 __u64 dccps_r_seq_win:48;
512 __u8 dccps_pcslen:4;
513 __u8 dccps_pcrlen:4;
514 __u8 dccps_send_ndp_count:1;
534 __u64 dccps_ndp_count:48; 515 __u64 dccps_ndp_count:48;
535 unsigned long dccps_rate_last; 516 unsigned long dccps_rate_last;
536 struct dccp_minisock dccps_minisock; 517 struct list_head dccps_featneg;
537 struct dccp_ackvec *dccps_hc_rx_ackvec; 518 struct dccp_ackvec *dccps_hc_rx_ackvec;
538 struct ccid *dccps_hc_rx_ccid; 519 struct ccid *dccps_hc_rx_ccid;
539 struct ccid *dccps_hc_tx_ccid; 520 struct ccid *dccps_hc_tx_ccid;
540 struct dccp_options_received dccps_options_received; 521 struct dccp_options_received dccps_options_received;
522 __u8 dccps_qpolicy;
523 __u32 dccps_tx_qlen;
541 enum dccp_role dccps_role:2; 524 enum dccp_role dccps_role:2;
542 __u8 dccps_hc_rx_insert_options:1; 525 __u8 dccps_hc_rx_insert_options:1;
543 __u8 dccps_hc_tx_insert_options:1; 526 __u8 dccps_hc_tx_insert_options:1;
544 __u8 dccps_server_timewait:1; 527 __u8 dccps_server_timewait:1;
528 __u8 dccps_sync_scheduled:1;
529 struct tasklet_struct dccps_xmitlet;
545 struct timer_list dccps_xmit_timer; 530 struct timer_list dccps_xmit_timer;
546}; 531};
547 532
@@ -550,11 +535,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk)
550 return (struct dccp_sock *)sk; 535 return (struct dccp_sock *)sk;
551} 536}
552 537
553static inline struct dccp_minisock *dccp_msk(const struct sock *sk)
554{
555 return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock;
556}
557
558static inline const char *dccp_role(const struct sock *sk) 538static inline const char *dccp_role(const struct sock *sk)
559{ 539{
560 switch (dccp_sk(sk)->dccps_role) { 540 switch (dccp_sk(sk)->dccps_role) {
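The new 48-bit fields above (sequence windows, NDP count, and later the ack
numbers in the Ack Vector records) rely on circular sequence arithmetic
modulo 2^48. For reference, the helpers this patch leans on (ADD48,
before48/after48, dccp_delta_seqno) live in net/dccp/dccp.h and behave
essentially as in the following paraphrased sketch (not part of this diff):

    #define UINT48_MAX	((1ULL << 48) - 1)

    static inline u64 ADD48(const u64 a, const u64 b)
    {
            return (a + b) & UINT48_MAX;            /* wrap at 2^48 */
    }

    /* Signed distance from seqno1 to seqno2: shifting left by 16 puts the
     * 48-bit values into the top of a 64-bit word, so the sign bit of the
     * difference resolves wrap-around correctly. */
    static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
    {
            return ((s64)((seqno2 << 16) - (seqno1 << 16))) >> 16;
    }

    static inline int before48(const u64 seq1, const u64 seq2)
    {
            return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
    }
    #define after48(seq1, seq2)	before48(seq2, seq1)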
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8983386356a5..6bc4b8148ca0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
782/* Use define here intentionally to get WARN_ON location shown at the caller */ 782/* Use define here intentionally to get WARN_ON location shown at the caller */
783#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) 783#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
784 784
785/*
786 * Convert RFC3390 larger initial windows into an equivalent number of packets.
787 *
788 * John Heffner states:
789 *
790 * The RFC specifies a window of no more than 4380 bytes
791 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
792 * is a bit misleading because they use a clamp at 4380 bytes
793 * rather than a multiplier in the relevant range.
794 */
795static inline u32 rfc3390_bytes_to_packets(const u32 bytes)
796{
797 return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3);
798}
799
785extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); 800extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
786extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); 801extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
787 802
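rfc3390_bytes_to_packets() encodes min(4*MSS, max(2*MSS, 4380))/MSS packets;
the two thresholds in the helper fall out of 4380/4 = 1095 and 4380/3 = 1460.
A quick illustrative sanity check (assert-style, not part of this diff):

    assert(rfc3390_bytes_to_packets(536)  == 4); /* 4 * 536  = 2144 <= 4380    */
    assert(rfc3390_bytes_to_packets(1095) == 4); /* 4 * 1095 = 4380 exactly    */
    assert(rfc3390_bytes_to_packets(1096) == 3); /* 3 * 1096 <= 4380 < 4 * 1096 */
    assert(rfc3390_bytes_to_packets(1460) == 3); /* 3 * 1460 = 4380 exactly    */
    assert(rfc3390_bytes_to_packets(1500) == 2); /* 4380 / 1500 truncates to 2 */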
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 7aa2a7acc7ec..206c16ad9c3c 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -25,9 +25,6 @@ config INET_DCCP_DIAG
25 def_tristate y if (IP_DCCP = y && INET_DIAG = y) 25 def_tristate y if (IP_DCCP = y && INET_DIAG = y)
26 def_tristate m 26 def_tristate m
27 27
28config IP_DCCP_ACKVEC
29 bool
30
31source "net/dccp/ccids/Kconfig" 28source "net/dccp/ccids/Kconfig"
32 29
33menu "DCCP Kernel Hacking" 30menu "DCCP Kernel Hacking"
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index f4f8793aafff..0c1c9af2bf7e 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,6 +1,7 @@
1obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o 1obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
2 2
3dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o 3dccp-y := ccid.o feat.o input.o minisocks.o options.o \
4 qpolicy.o output.o proto.o timer.o ackvec.o
4 5
5dccp_ipv4-y := ipv4.o 6dccp_ipv4-y := ipv4.o
6 7
@@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o
8obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o 9obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
9dccp_ipv6-y := ipv6.o 10dccp_ipv6-y := ipv6.o
10 11
11dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
12
13obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o 12obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
14obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o 13obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
15 14
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 1e8be246ad15..41819848bdda 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,445 +1,375 @@
1/* 1/*
2 * net/dccp/ackvec.c 2 * net/dccp/ackvec.c
3 * 3 *
4 * An implementation of the DCCP protocol 4 * An implementation of Ack Vectors for the DCCP protocol
5 * Copyright (c) 2007 University of Aberdeen, Scotland, UK
5 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> 6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
6 * 7 *
7 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the 9 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; version 2 of the License; 10 * Free Software Foundation; version 2 of the License;
10 */ 11 */
11
12#include "ackvec.h"
13#include "dccp.h" 12#include "dccp.h"
14
15#include <linux/dccp.h>
16#include <linux/init.h>
17#include <linux/errno.h>
18#include <linux/kernel.h> 13#include <linux/kernel.h>
19#include <linux/skbuff.h>
20#include <linux/slab.h> 14#include <linux/slab.h>
21 15
22#include <net/sock.h>
23
24static struct kmem_cache *dccp_ackvec_slab; 16static struct kmem_cache *dccp_ackvec_slab;
25static struct kmem_cache *dccp_ackvec_record_slab; 17static struct kmem_cache *dccp_ackvec_record_slab;
26 18
27static struct dccp_ackvec_record *dccp_ackvec_record_new(void) 19struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
28{ 20{
29 struct dccp_ackvec_record *avr = 21 struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
30 kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
31 22
32 if (avr != NULL) 23 if (av != NULL) {
33 INIT_LIST_HEAD(&avr->avr_node); 24 av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
34 25 INIT_LIST_HEAD(&av->av_records);
35 return avr; 26 }
27 return av;
36} 28}
37 29
38static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) 30static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
39{ 31{
40 if (unlikely(avr == NULL)) 32 struct dccp_ackvec_record *cur, *next;
41 return; 33
42 /* Check if deleting a linked record */ 34 list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
43 WARN_ON(!list_empty(&avr->avr_node)); 35 kmem_cache_free(dccp_ackvec_record_slab, cur);
44 kmem_cache_free(dccp_ackvec_record_slab, avr); 36 INIT_LIST_HEAD(&av->av_records);
45} 37}
46 38
47static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, 39void dccp_ackvec_free(struct dccp_ackvec *av)
48 struct dccp_ackvec_record *avr)
49{ 40{
50 /* 41 if (likely(av != NULL)) {
51 * AVRs are sorted by seqno. Since we are sending them in order, we 42 dccp_ackvec_purge_records(av);
52 * just add the AVR at the head of the list. 43 kmem_cache_free(dccp_ackvec_slab, av);
53 * -sorbo.
54 */
55 if (!list_empty(&av->av_records)) {
56 const struct dccp_ackvec_record *head =
57 list_entry(av->av_records.next,
58 struct dccp_ackvec_record,
59 avr_node);
60 BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
61 } 44 }
62
63 list_add(&avr->avr_node, &av->av_records);
64} 45}
65 46
66int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 47/**
48 * dccp_ackvec_update_records - Record information about sent Ack Vectors
49 * @av: Ack Vector records to update
50 * @seqno: Sequence number of the packet carrying the Ack Vector just sent
51 * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
52 */
53int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
67{ 54{
68 struct dccp_sock *dp = dccp_sk(sk);
69 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
70 /* Figure out how many options do we need to represent the ackvec */
71 const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
72 u16 len = av->av_vec_len + 2 * nr_opts, i;
73 u32 elapsed_time;
74 const unsigned char *tail, *from;
75 unsigned char *to;
76 struct dccp_ackvec_record *avr; 55 struct dccp_ackvec_record *avr;
77 suseconds_t delta;
78
79 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
80 return -1;
81
82 delta = ktime_us_delta(ktime_get_real(), av->av_time);
83 elapsed_time = delta / 10;
84 56
85 if (elapsed_time != 0 && 57 avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
86 dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
87 return -1;
88
89 avr = dccp_ackvec_record_new();
90 if (avr == NULL) 58 if (avr == NULL)
91 return -1; 59 return -ENOBUFS;
92
93 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
94
95 to = skb_push(skb, len);
96 len = av->av_vec_len;
97 from = av->av_buf + av->av_buf_head;
98 tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
99
100 for (i = 0; i < nr_opts; ++i) {
101 int copylen = len;
102
103 if (len > DCCP_MAX_ACKVEC_OPT_LEN)
104 copylen = DCCP_MAX_ACKVEC_OPT_LEN;
105
106 *to++ = DCCPO_ACK_VECTOR_0;
107 *to++ = copylen + 2;
108
109 /* Check if buf_head wraps */
110 if (from + copylen > tail) {
111 const u16 tailsize = tail - from;
112
113 memcpy(to, from, tailsize);
114 to += tailsize;
115 len -= tailsize;
116 copylen -= tailsize;
117 from = av->av_buf;
118 }
119
120 memcpy(to, from, copylen);
121 from += copylen;
122 to += copylen;
123 len -= copylen;
124 }
125 60
61 avr->avr_ack_seqno = seqno;
62 avr->avr_ack_ptr = av->av_buf_head;
63 avr->avr_ack_ackno = av->av_buf_ackno;
64 avr->avr_ack_nonce = nonce_sum;
65 avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
126 /* 66 /*
127 * From RFC 4340, A.2: 67 * When the buffer overflows, we keep no more than one record. This is
128 * 68 * the simplest way of disambiguating sender-Acks dating from before the
129 * For each acknowledgement it sends, the HC-Receiver will add an 69 * overflow from sender-Acks which refer to after the overflow; a simple
130 * acknowledgement record. ack_seqno will equal the HC-Receiver 70 * solution is preferable here since we are handling an exception.
131 * sequence number it used for the ack packet; ack_ptr will equal
132 * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
133 * equal buf_nonce.
134 */ 71 */
135 avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; 72 if (av->av_overflow)
136 avr->avr_ack_ptr = av->av_buf_head; 73 dccp_ackvec_purge_records(av);
137 avr->avr_ack_ackno = av->av_buf_ackno; 74 /*
138 avr->avr_ack_nonce = av->av_buf_nonce; 75 * Since GSS is incremented for each packet, the list is automatically
139 avr->avr_sent_len = av->av_vec_len; 76 * arranged in descending order of @ack_seqno.
140 77 */
141 dccp_ackvec_insert_avr(av, avr); 78 list_add(&avr->avr_node, &av->av_records);
142 79
143 dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " 80 dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
144 "ack_ackno=%llu\n",
145 dccp_role(sk), avr->avr_sent_len,
146 (unsigned long long)avr->avr_ack_seqno, 81 (unsigned long long)avr->avr_ack_seqno,
147 (unsigned long long)avr->avr_ack_ackno); 82 (unsigned long long)avr->avr_ack_ackno,
83 avr->avr_ack_runlen);
148 return 0; 84 return 0;
149} 85}
150 86
151struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 87static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
88 const u64 ackno)
152{ 89{
153 struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); 90 struct dccp_ackvec_record *avr;
154 91 /*
155 if (av != NULL) { 92 * Exploit that records are inserted in descending order of sequence
156 av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; 93 * number, start with the oldest record first. If @ackno is `before'
157 av->av_buf_ackno = UINT48_MAX + 1; 94 * the earliest ack_ackno, the packet is too old to be considered.
158 av->av_buf_nonce = 0; 95 */
159 av->av_time = ktime_set(0, 0); 96 list_for_each_entry_reverse(avr, av_list, avr_node) {
160 av->av_vec_len = 0; 97 if (avr->avr_ack_seqno == ackno)
161 INIT_LIST_HEAD(&av->av_records); 98 return avr;
99 if (before48(ackno, avr->avr_ack_seqno))
100 break;
162 } 101 }
163 102 return NULL;
164 return av;
165} 103}
166 104
167void dccp_ackvec_free(struct dccp_ackvec *av) 105/*
106 * Buffer index and length computation using modulo-buffersize arithmetic.
107 * Note that, as pointers move from right to left, head is `before' tail.
108 */
109static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
168{ 110{
169 if (unlikely(av == NULL)) 111 return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
170 return;
171
172 if (!list_empty(&av->av_records)) {
173 struct dccp_ackvec_record *avr, *next;
174
175 list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
176 list_del_init(&avr->avr_node);
177 dccp_ackvec_record_delete(avr);
178 }
179 }
180
181 kmem_cache_free(dccp_ackvec_slab, av);
182} 112}
183 113
184static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, 114static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
185 const u32 index)
186{ 115{
187 return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; 116 return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
188} 117}
189 118
190static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, 119u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
191 const u32 index)
192{ 120{
193 return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; 121 if (unlikely(av->av_overflow))
122 return DCCPAV_MAX_ACKVEC_LEN;
123 return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
194} 124}
195 125
196/* 126/**
197 * If several packets are missing, the HC-Receiver may prefer to enter multiple 127 * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
198 * bytes with run length 0, rather than a single byte with a larger run length; 128 * @av: non-empty buffer to update
199 * this simplifies table updates if one of the missing packets arrives. 129 * @distance: negative or zero distance of @seqno from buf_ackno downward
130 * @seqno: the (old) sequence number whose record is to be updated
131 * @state: state in which packet carrying @seqno was received
200 */ 132 */
201static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, 133static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
202 const unsigned int packets, 134 u64 seqno, enum dccp_ackvec_states state)
203 const unsigned char state)
204{ 135{
205 unsigned int gap; 136 u16 ptr = av->av_buf_head;
206 long new_head;
207 137
208 if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) 138 BUG_ON(distance > 0);
209 return -ENOBUFS; 139 if (unlikely(dccp_ackvec_is_empty(av)))
140 return;
210 141
211 gap = packets - 1; 142 do {
212 new_head = av->av_buf_head - packets; 143 u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
213 144
214 if (new_head < 0) { 145 if (distance + runlen >= 0) {
215 if (gap > 0) { 146 /*
216 memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, 147 * Only update the state if packet has not been received
217 gap + new_head + 1); 148 * yet. This is OK as per the second table in RFC 4340,
218 gap = -new_head; 149 * 11.4.1; i.e. here we are using the following table:
150 * RECEIVED
151 * 0 1 3
152 * S +---+---+---+
153 * T 0 | 0 | 0 | 0 |
154 * O +---+---+---+
155 * R 1 | 1 | 1 | 1 |
156 * E +---+---+---+
157 * D 3 | 0 | 1 | 3 |
158 * +---+---+---+
159 * The "Not Received" state was set by reserve_seats().
160 */
161 if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
162 av->av_buf[ptr] = state;
163 else
164 dccp_pr_debug("Not changing %llu state to %u\n",
165 (unsigned long long)seqno, state);
166 break;
219 } 167 }
220 new_head += DCCP_MAX_ACKVEC_LEN;
221 }
222 168
223 av->av_buf_head = new_head; 169 distance += runlen + 1;
170 ptr = __ackvec_idx_add(ptr, 1);
224 171
225 if (gap > 0) 172 } while (ptr != av->av_buf_tail);
226 memset(av->av_buf + av->av_buf_head + 1, 173}
227 DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
228 174
229 av->av_buf[av->av_buf_head] = state; 175/* Mark @num entries after buf_head as "Not yet received". */
230 av->av_vec_len += packets; 176static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
231 return 0; 177{
178 u16 start = __ackvec_idx_add(av->av_buf_head, 1),
179 len = DCCPAV_MAX_ACKVEC_LEN - start;
180
181 /* check for buffer wrap-around */
182 if (num > len) {
183 memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
184 start = 0;
185 num -= len;
186 }
187 if (num)
188 memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
232} 189}
233 190
234/* 191/**
235 * Implements the RFC 4340, Appendix A 192 * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
193 * @av: container of buffer to update (can be empty or non-empty)
194 * @num_packets: number of packets to register (must be >= 1)
195 * @seqno: sequence number of the first packet in @num_packets
196 * @state: state in which packet carrying @seqno was received
236 */ 197 */
237int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 198static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
238 const u64 ackno, const u8 state) 199 u64 seqno, enum dccp_ackvec_states state)
239{ 200{
240 /* 201 u32 num_cells = num_packets;
241 * Check at the right places if the buffer is full, if it is, tell the
242 * caller to start dropping packets till the HC-Sender acks our ACK
243 * vectors, when we will free up space in av_buf.
244 *
245 * We may well decide to do buffer compression, etc, but for now lets
246 * just drop.
247 *
248 * From Appendix A.1.1 (`New Packets'):
249 *
250 * Of course, the circular buffer may overflow, either when the
251 * HC-Sender is sending data at a very high rate, when the
252 * HC-Receiver's acknowledgements are not reaching the HC-Sender,
253 * or when the HC-Sender is forgetting to acknowledge those acks
254 * (so the HC-Receiver is unable to clean up old state). In this
255 * case, the HC-Receiver should either compress the buffer (by
256 * increasing run lengths when possible), transfer its state to
257 * a larger buffer, or, as a last resort, drop all received
258 * packets, without processing them whatsoever, until its buffer
259 * shrinks again.
260 */
261 202
262 /* See if this is the first ackno being inserted */ 203 if (num_packets > DCCPAV_BURST_THRESH) {
263 if (av->av_vec_len == 0) { 204 u32 lost_packets = num_packets - 1;
264 av->av_buf[av->av_buf_head] = state;
265 av->av_vec_len = 1;
266 } else if (after48(ackno, av->av_buf_ackno)) {
267 const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
268 205
206 DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
269 /* 207 /*
270 * Look if the state of this packet is the same as the 208 * We received 1 packet and have a loss of size "num_packets-1"
271 * previous ackno and if so if we can bump the head len. 209 * which we squeeze into num_cells-1 rather than reserving an
210 * entire byte for each lost packet.
211 * The reason is that the vector grows in O(burst_length); when
212 * it grows too large there will no room left for the payload.
213 * This is a trade-off: if a few packets out of the burst show
214 * up later, their state will not be changed; it is simply too
215 * costly to reshuffle/reallocate/copy the buffer each time.
216 * Should such problems persist, we will need to switch to a
217 * different underlying data structure.
272 */ 218 */
273 if (delta == 1 && 219 for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
274 dccp_ackvec_state(av, av->av_buf_head) == state && 220 u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
275 dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
276 av->av_buf[av->av_buf_head]++;
277 else if (dccp_ackvec_set_buf_head_state(av, delta, state))
278 return -ENOBUFS;
279 } else {
280 /*
281 * A.1.2. Old Packets
282 *
283 * When a packet with Sequence Number S <= buf_ackno
284 * arrives, the HC-Receiver will scan the table for
285 * the byte corresponding to S. (Indexing structures
286 * could reduce the complexity of this scan.)
287 */
288 u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
289 u32 index = av->av_buf_head;
290 221
291 while (1) { 222 av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
292 const u8 len = dccp_ackvec_len(av, index); 223 av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
293 const u8 av_state = dccp_ackvec_state(av, index); 224
294 /* 225 lost_packets -= len;
295 * valid packets not yet in av_buf have a reserved
296 * entry, with a len equal to 0.
297 */
298 if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
299 len == 0 && delta == 0) { /* Found our
300 reserved seat! */
301 dccp_pr_debug("Found %llu reserved seat!\n",
302 (unsigned long long)ackno);
303 av->av_buf[index] = state;
304 goto out;
305 }
306 /* len == 0 means one packet */
307 if (delta < len + 1)
308 goto out_duplicate;
309
310 delta -= len + 1;
311 if (++index == DCCP_MAX_ACKVEC_LEN)
312 index = 0;
313 } 226 }
314 } 227 }
315 228
316 av->av_buf_ackno = ackno; 229 if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
317 av->av_time = ktime_get_real(); 230 DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
318out: 231 av->av_overflow = true;
319 return 0; 232 }
233
234 av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
235 if (av->av_overflow)
236 av->av_buf_tail = av->av_buf_head;
320 237
321out_duplicate: 238 av->av_buf[av->av_buf_head] = state;
322 /* Duplicate packet */ 239 av->av_buf_ackno = seqno;
323 dccp_pr_debug("Received a dup or already considered lost " 240
324 "packet: %llu\n", (unsigned long long)ackno); 241 if (num_packets > 1)
325 return -EILSEQ; 242 dccp_ackvec_reserve_seats(av, num_packets - 1);
326} 243}
327 244
328static void dccp_ackvec_throw_record(struct dccp_ackvec *av, 245/**
329 struct dccp_ackvec_record *avr) 246 * dccp_ackvec_input - Register incoming packet in the buffer
247 */
248void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
330{ 249{
331 struct dccp_ackvec_record *next; 250 u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
251 enum dccp_ackvec_states state = DCCPAV_RECEIVED;
332 252
333 /* sort out vector length */ 253 if (dccp_ackvec_is_empty(av)) {
334 if (av->av_buf_head <= avr->avr_ack_ptr) 254 dccp_ackvec_add_new(av, 1, seqno, state);
335 av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; 255 av->av_tail_ackno = seqno;
336 else
337 av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
338 av->av_buf_head + avr->avr_ack_ptr;
339 256
340 /* free records */ 257 } else {
341 list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 258 s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
342 list_del_init(&avr->avr_node); 259 u8 *current_head = av->av_buf + av->av_buf_head;
343 dccp_ackvec_record_delete(avr);
344 }
345}
346 260
347void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, 261 if (num_packets == 1 &&
348 const u64 ackno) 262 dccp_ackvec_state(current_head) == state &&
349{ 263 dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
350 struct dccp_ackvec_record *avr;
351 264
352 /* 265 *current_head += 1;
353 * If we traverse backwards, it should be faster when we have large 266 av->av_buf_ackno = seqno;
354 * windows. We will be receiving ACKs for stuff we sent a while back 267
355 * -sorbo. 268 } else if (num_packets > 0) {
356 */ 269 dccp_ackvec_add_new(av, num_packets, seqno, state);
357 list_for_each_entry_reverse(avr, &av->av_records, avr_node) { 270 } else {
358 if (ackno == avr->avr_ack_seqno) { 271 dccp_ackvec_update_old(av, num_packets, seqno, state);
359 dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " 272 }
360 "ack_ackno=%llu, ACKED!\n",
361 dccp_role(sk), 1,
362 (unsigned long long)avr->avr_ack_seqno,
363 (unsigned long long)avr->avr_ack_ackno);
364 dccp_ackvec_throw_record(av, avr);
365 break;
366 } else if (avr->avr_ack_seqno > ackno)
367 break; /* old news */
368 } 273 }
369} 274}
370 275
371static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, 276/**
372 struct sock *sk, u64 *ackno, 277 * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
373 const unsigned char len, 278 * This routine is called when the peer acknowledges the receipt of Ack Vectors
 374 const unsigned char *vector) 279 * up to and including @ackno. While based on section A.3 of RFC 4340, here
375{ 280 * are additional precautions to prevent corrupted buffer state. In particular,
376 unsigned char i; 281 * we use tail_ackno to identify outdated records; it always marks the earliest
377 struct dccp_ackvec_record *avr; 282 * packet of group (2) in 11.4.2.
283 */
284void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
 285{
286 struct dccp_ackvec_record *avr, *next;
287 u8 runlen_now, eff_runlen;
288 s64 delta;
378 289
379 /* Check if we actually sent an ACK vector */ 290 avr = dccp_ackvec_lookup(&av->av_records, ackno);
380 if (list_empty(&av->av_records)) 291 if (avr == NULL)
381 return; 292 return;
293 /*
294 * Deal with outdated acknowledgments: this arises when e.g. there are
295 * several old records and the acks from the peer come in slowly. In
296 * that case we may still have records that pre-date tail_ackno.
297 */
298 delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
299 if (delta < 0)
300 goto free_records;
301 /*
302 * Deal with overlapping Ack Vectors: don't subtract more than the
303 * number of packets between tail_ackno and ack_ackno.
304 */
305 eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
382 306
383 i = len; 307 runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
384 /* 308 /*
385 * XXX 309 * The run length of Ack Vector cells does not decrease over time. If
386 * I think it might be more efficient to work backwards. See comment on 310 * the run length is the same as at the time the Ack Vector was sent, we
387 * rcv_ackno. -sorbo. 311 * free the ack_ptr cell. That cell can however not be freed if the run
312 * length has increased: in this case we need to move the tail pointer
313 * backwards (towards higher indices), to its next-oldest neighbour.
388 */ 314 */
389 avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); 315 if (runlen_now > eff_runlen) {
390 while (i--) {
391 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
392 u64 ackno_end_rl;
393 316
394 dccp_set_seqno(&ackno_end_rl, *ackno - rl); 317 av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
318 av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
395 319
320 /* This move may not have cleared the overflow flag. */
321 if (av->av_overflow)
322 av->av_overflow = (av->av_buf_head == av->av_buf_tail);
323 } else {
324 av->av_buf_tail = avr->avr_ack_ptr;
396 /* 325 /*
397 * If our AVR sequence number is greater than the ack, go 326 * We have made sure that avr points to a valid cell within the
398 * forward in the AVR list until it is not so. 327 * buffer. This cell is either older than head, or equals head
328 * (empty buffer): in both cases we no longer have any overflow.
399 */ 329 */
400 list_for_each_entry_from(avr, &av->av_records, avr_node) { 330 av->av_overflow = 0;
401 if (!after48(avr->avr_ack_seqno, *ackno)) 331 }
402 goto found;
403 }
404 /* End of the av_records list, not found, exit */
405 break;
406found:
407 if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
408 const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
409 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
410 dccp_pr_debug("%s ACK vector 0, len=%d, "
411 "ack_seqno=%llu, ack_ackno=%llu, "
412 "ACKED!\n",
413 dccp_role(sk), len,
414 (unsigned long long)
415 avr->avr_ack_seqno,
416 (unsigned long long)
417 avr->avr_ack_ackno);
418 dccp_ackvec_throw_record(av, avr);
419 break;
420 }
421 /*
422 * If it wasn't received, continue scanning... we might
423 * find another one.
424 */
425 }
426 332
427 dccp_set_seqno(ackno, ackno_end_rl - 1); 333 /*
428 ++vector; 334 * The peer has acknowledged up to and including ack_ackno. Hence the
335 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
336 */
337 av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
338
339free_records:
340 list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
341 list_del(&avr->avr_node);
342 kmem_cache_free(dccp_ackvec_record_slab, avr);
429 } 343 }
430} 344}
431 345
432int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, 346/*
433 u64 *ackno, const u8 opt, const u8 *value, const u8 len) 347 * Routines to keep track of Ack Vectors received in an skb
348 */
349int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
434{ 350{
435 if (len > DCCP_MAX_ACKVEC_OPT_LEN) 351 struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
436 return -1; 352
353 if (new == NULL)
354 return -ENOBUFS;
355 new->vec = vec;
356 new->len = len;
357 new->nonce = nonce;
437 358
438 /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ 359 list_add_tail(&new->node, head);
439 dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
440 ackno, len, value);
441 return 0; 360 return 0;
442} 361}
362EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
363
364void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
365{
366 struct dccp_ackvec_parsed *cur, *next;
367
368 list_for_each_entry_safe(cur, next, parsed_chunks, node)
369 kfree(cur);
370 INIT_LIST_HEAD(parsed_chunks);
371}
372EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
443 373
444int __init dccp_ackvec_init(void) 374int __init dccp_ackvec_init(void)
445{ 375{
@@ -449,10 +379,9 @@ int __init dccp_ackvec_init(void)
449 if (dccp_ackvec_slab == NULL) 379 if (dccp_ackvec_slab == NULL)
450 goto out_err; 380 goto out_err;
451 381
452 dccp_ackvec_record_slab = 382 dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
453 kmem_cache_create("dccp_ackvec_record", 383 sizeof(struct dccp_ackvec_record),
454 sizeof(struct dccp_ackvec_record), 384 0, SLAB_HWCACHE_ALIGN, NULL);
455 0, SLAB_HWCACHE_ALIGN, NULL);
456 if (dccp_ackvec_record_slab == NULL) 385 if (dccp_ackvec_record_slab == NULL)
457 goto out_destroy_slab; 386 goto out_destroy_slab;
458 387
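To make the modulo-buffersize index arithmetic above concrete: with
DCCPAV_NUM_ACKVECS = 2 the buffer holds DCCPAV_MAX_ACKVEC_LEN = 2 * 253 = 506
cells, and av_buf_head moves towards lower indices as packets arrive. A worked
example with assumed index values:

    u16 head = 500, tail = 10;                 /* live portion wraps around   */
    u16 len  = __ackvec_idx_sub(tail, head);   /* (10 + 506 - 500) % 506 = 16 */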
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index bcb64fb4acef..6cdca79a99f7 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,156 +3,134 @@
3/* 3/*
4 * net/dccp/ackvec.h 4 * net/dccp/ackvec.h
5 * 5 *
6 * An implementation of the DCCP protocol 6 * An implementation of Ack Vectors for the DCCP protocol
7 * Copyright (c) 2007 University of Aberdeen, Scotland, UK
7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> 8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
8 *
9 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as 10 * under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation. 11 * published by the Free Software Foundation.
12 */ 12 */
13 13
14#include <linux/dccp.h>
14#include <linux/compiler.h> 15#include <linux/compiler.h>
15#include <linux/ktime.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/types.h> 17#include <linux/types.h>
18 18
19/* Read about the ECN nonce to see why it is 253 */ 19/*
20#define DCCP_MAX_ACKVEC_OPT_LEN 253 20 * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
21/* We can spread an ack vector across multiple options */ 21 * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
22#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) 22 * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
23 * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
24 * The maximum value is bounded by the u16 types for indices and functions.
25 */
26#define DCCPAV_NUM_ACKVECS 2
27#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
23 28
24#define DCCP_ACKVEC_STATE_RECEIVED 0 29/* Estimated minimum average Ack Vector length - used for updating MPS */
25#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) 30#define DCCPAV_MIN_OPTLEN 16
26#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
27 31
28#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ 32/* Threshold for coping with large bursts of losses */
29#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ 33#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
30 34
31/** struct dccp_ackvec - ack vector 35enum dccp_ackvec_states {
32 * 36 DCCPAV_RECEIVED = 0x00,
33 * This data structure is the one defined in RFC 4340, Appendix A. 37 DCCPAV_ECN_MARKED = 0x40,
34 * 38 DCCPAV_RESERVED = 0x80,
35 * @av_buf_head - circular buffer head 39 DCCPAV_NOT_RECEIVED = 0xC0
36 * @av_buf_tail - circular buffer tail 40};
37 * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the 41#define DCCPAV_MAX_RUNLEN 0x3F
38 * buffer (i.e. %av_buf_head) 42
39 * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked 43static inline u8 dccp_ackvec_runlen(const u8 *cell)
40 * by the buffer with State 0 44{
41 * 45 return *cell & DCCPAV_MAX_RUNLEN;
42 * Additionally, the HC-Receiver must keep some information about the 46}
43 * Ack Vectors it has recently sent. For each packet sent carrying an 47
44 * Ack Vector, it remembers four variables: 48static inline u8 dccp_ackvec_state(const u8 *cell)
49{
50 return *cell & ~DCCPAV_MAX_RUNLEN;
51}
52
53/** struct dccp_ackvec - Ack Vector main data structure
45 * 54 *
46 * @av_records - list of dccp_ackvec_record 55 * This implements a fixed-size circular buffer within an array and is largely
47 * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. 56 * based on Appendix A of RFC 4340.
48 * 57 *
49 * @av_time - the time in usecs 58 * @av_buf: circular buffer storage area
50 * @av_buf - circular buffer of acknowledgeable packets 59 * @av_buf_head: head index; begin of live portion in @av_buf
60 * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
61 * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
62 * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
63 * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
64 * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
65 * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
66 * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
51 */ 67 */
52struct dccp_ackvec { 68struct dccp_ackvec {
53 u64 av_buf_ackno; 69 u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
54 struct list_head av_records;
55 ktime_t av_time;
56 u16 av_buf_head; 70 u16 av_buf_head;
57 u16 av_vec_len; 71 u16 av_buf_tail;
58 u8 av_buf_nonce; 72 u64 av_buf_ackno:48;
59 u8 av_ack_nonce; 73 u64 av_tail_ackno:48;
60 u8 av_buf[DCCP_MAX_ACKVEC_LEN]; 74 bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
75 u8 av_overflow:1;
76 struct list_head av_records;
61}; 77};
62 78
63/** struct dccp_ackvec_record - ack vector record 79/** struct dccp_ackvec_record - Records information about sent Ack Vectors
64 * 80 *
65 * ACK vector record as defined in Appendix A of spec. 81 * These list entries define the additional information which the HC-Receiver
82 * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
66 * 83 *
67 * The list is sorted by avr_ack_seqno 84 * @avr_node: the list node in @av_records
85 * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
86 * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
87 * @avr_ack_ptr: pointer into @av_buf where this record starts
88 * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
89 * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
68 * 90 *
69 * @avr_node - node in av_records 91 * The list as a whole is sorted in descending order by @avr_ack_seqno.
70 * @avr_ack_seqno - sequence number of the packet this record was sent on
71 * @avr_ack_ackno - sequence number being acknowledged
72 * @avr_ack_ptr - pointer into av_buf where this record starts
73 * @avr_ack_nonce - av_ack_nonce at the time this record was sent
74 * @avr_sent_len - lenght of the record in av_buf
75 */ 92 */
76struct dccp_ackvec_record { 93struct dccp_ackvec_record {
77 struct list_head avr_node; 94 struct list_head avr_node;
78 u64 avr_ack_seqno; 95 u64 avr_ack_seqno:48;
79 u64 avr_ack_ackno; 96 u64 avr_ack_ackno:48;
80 u16 avr_ack_ptr; 97 u16 avr_ack_ptr;
81 u16 avr_sent_len; 98 u8 avr_ack_runlen;
82 u8 avr_ack_nonce; 99 u8 avr_ack_nonce:1;
83}; 100};
84 101
85struct sock; 102extern int dccp_ackvec_init(void);
86struct sk_buff;
87
88#ifdef CONFIG_IP_DCCP_ACKVEC
89extern int dccp_ackvec_init(void);
90extern void dccp_ackvec_exit(void); 103extern void dccp_ackvec_exit(void);
91 104
92extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); 105extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
93extern void dccp_ackvec_free(struct dccp_ackvec *av); 106extern void dccp_ackvec_free(struct dccp_ackvec *av);
94 107
95extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 108extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
96 const u64 ackno, const u8 state); 109extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
97 110extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
98extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, 111extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
99 struct sock *sk, const u64 ackno);
100extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
101 u64 *ackno, const u8 opt,
102 const u8 *value, const u8 len);
103 112
104extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); 113static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
105
106static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
107{
108 return av->av_vec_len;
109}
110#else /* CONFIG_IP_DCCP_ACKVEC */
111static inline int dccp_ackvec_init(void)
112{ 114{
113 return 0; 115 return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
114} 116}
115 117
116static inline void dccp_ackvec_exit(void) 118/**
117{ 119 * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
118} 120 * @vec: start of vector (offset into skb)
119 121 * @len: length of @vec
120static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 122 * @nonce: whether @vec had an ECN nonce of 0 or 1
121{ 123 * @node: FIFO - arranged in descending order of ack_ackno
122 return NULL; 124 * This structure is used by CCIDs to access Ack Vectors in a received skb.
123} 125 */
124 126struct dccp_ackvec_parsed {
125static inline void dccp_ackvec_free(struct dccp_ackvec *av) 127 u8 *vec,
126{ 128 len,
127} 129 nonce:1;
128 130 struct list_head node;
129static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, 131};
130 const u64 ackno, const u8 state)
131{
132 return -1;
133}
134
135static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
136 struct sock *sk, const u64 ackno)
137{
138}
139
140static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
141 const u64 *ackno, const u8 opt,
142 const u8 *value, const u8 len)
143{
144 return -1;
145}
146
147static inline int dccp_insert_option_ackvec(const struct sock *sk,
148 const struct sk_buff *skb)
149{
150 return -1;
151}
152 132
153static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) 133extern int dccp_ackvec_parsed_add(struct list_head *head,
154{ 134 u8 *vec, u8 len, u8 nonce);
155 return 0; 135extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
156}
157#endif /* CONFIG_IP_DCCP_ACKVEC */
158#endif /* _ACKVEC_H */ 136#endif /* _ACKVEC_H */
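The CCID2 code further below consumes these parsed chunks through
dccp_ackvec_runlen() and dccp_ackvec_state(). A minimal sketch of the
per-cell Ack Vector encoding those helpers are assumed to wrap (RFC 4340,
11.4: state in the top two bits, run length in the low six; the mask values
here are illustrative, the real accessors live in ackvec.h):

	/* Sketch only - assumed layout of one Ack Vector cell */
	static inline u8 ex_av_state(u8 cell)	/* DCCPAV_* state, unshifted */
	{
		return cell & 0xC0;		/* top two bits */
	}

	static inline u8 ex_av_runlen(u8 cell)	/* further packets covered */
	{
		return cell & 0x3F;		/* low six bits */
	}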
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 4809753d12ae..e3fb52b4f5c6 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,6 +13,13 @@
13 13
14#include "ccid.h" 14#include "ccid.h"
15 15
16static u8 builtin_ccids[] = {
17 DCCPC_CCID2, /* CCID2 is supported by default */
18#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
19 DCCPC_CCID3,
20#endif
21};
22
16static struct ccid_operations *ccids[CCID_MAX]; 23static struct ccid_operations *ccids[CCID_MAX];
17#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 24#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
18static atomic_t ccids_lockct = ATOMIC_INIT(0); 25static atomic_t ccids_lockct = ATOMIC_INIT(0);
@@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
86 } 93 }
87} 94}
88 95
 96/* check whether the first @array_len members of @ccid_array are supported */
97bool ccid_support_check(u8 const *ccid_array, u8 array_len)
98{
99 u8 i, j, found;
100
101 for (i = 0, found = 0; i < array_len; i++, found = 0) {
102 for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
103 found = (ccid_array[i] == builtin_ccids[j]);
104 if (!found)
105 return false;
106 }
107 return true;
108}
109
110/**
111 * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array
112 * @ccid_array: pointer to copy into
 113 * @array_len: pointer to return the length of the copy into
 114 * This function allocates memory - the caller is responsible for freeing it.
115 */
116int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
117{
118 *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
119 if (*ccid_array == NULL)
120 return -ENOBUFS;
121 *array_len = ARRAY_SIZE(builtin_ccids);
122 return 0;
123}
124
125int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
126 char __user *optval, int __user *optlen)
127{
128 if (len < sizeof(builtin_ccids))
129 return -EINVAL;
130
131 if (put_user(sizeof(builtin_ccids), optlen) ||
132 copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
133 return -EFAULT;
134 return 0;
135}
136
89int ccid_register(struct ccid_operations *ccid_ops) 137int ccid_register(struct ccid_operations *ccid_ops)
90{ 138{
91 int err = -ENOBUFS; 139 int err = -ENOBUFS;
@@ -148,22 +196,41 @@ int ccid_unregister(struct ccid_operations *ccid_ops)
148 196
149EXPORT_SYMBOL_GPL(ccid_unregister); 197EXPORT_SYMBOL_GPL(ccid_unregister);
150 198
199/**
200 * ccid_request_module - Pre-load CCID module for later use
 201 * This should be called only from process context (e.g. during connection
 202 * setup) and is necessary for later calls to ccid_new() (typically in
 203 * software interrupt), so that the modules are available when they are needed.
204 */
205static int ccid_request_module(u8 id)
206{
207 if (!in_atomic()) {
208 ccids_read_lock();
209 if (ccids[id] == NULL) {
210 ccids_read_unlock();
211 return request_module("net-dccp-ccid-%d", id);
212 }
213 ccids_read_unlock();
214 }
215 return 0;
216}
217
218int ccid_request_modules(u8 const *ccid_array, u8 array_len)
219{
220#ifdef CONFIG_KMOD
221 while (array_len--)
222 if (ccid_request_module(ccid_array[array_len]))
223 return -1;
224#endif
225 return 0;
226}
227
151struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) 228struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
152{ 229{
153 struct ccid_operations *ccid_ops; 230 struct ccid_operations *ccid_ops;
154 struct ccid *ccid = NULL; 231 struct ccid *ccid = NULL;
155 232
156 ccids_read_lock(); 233 ccids_read_lock();
157#ifdef CONFIG_KMOD
158 if (ccids[id] == NULL) {
159 /* We only try to load if in process context */
160 ccids_read_unlock();
161 if (gfp & GFP_ATOMIC)
162 goto out;
163 request_module("net-dccp-ccid-%d", id);
164 ccids_read_lock();
165 }
166#endif
167 ccid_ops = ccids[id]; 234 ccid_ops = ccids[id];
168 if (ccid_ops == NULL) 235 if (ccid_ops == NULL)
169 goto out_unlock; 236 goto out_unlock;
@@ -205,20 +272,6 @@ out_module_put:
205 272
206EXPORT_SYMBOL_GPL(ccid_new); 273EXPORT_SYMBOL_GPL(ccid_new);
207 274
208struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
209{
210 return ccid_new(id, sk, 1, gfp);
211}
212
213EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
214
215struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp)
216{
217 return ccid_new(id, sk, 0, gfp);
218}
219
220EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
221
222static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) 275static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
223{ 276{
224 struct ccid_operations *ccid_ops; 277 struct ccid_operations *ccid_ops;
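A hypothetical caller of the negotiation helpers added above (the real
consumer is presumably the feature-negotiation code in feat.c); note that
ccid_get_builtin_ccids() kmemdup()s the array, so the caller must free it:

	u8 *ccids, nr_ccids;

	if (ccid_get_builtin_ccids(&ccids, &nr_ccids) == 0) {
		if (ccid_support_check(ccids, nr_ccids))
			;	/* safe to advertise this CCID list */
		kfree(ccids);
	}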
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index fdeae7b57319..d27054ba2159 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -60,22 +60,18 @@ struct ccid_operations {
60 void (*ccid_hc_tx_exit)(struct sock *sk); 60 void (*ccid_hc_tx_exit)(struct sock *sk);
61 void (*ccid_hc_rx_packet_recv)(struct sock *sk, 61 void (*ccid_hc_rx_packet_recv)(struct sock *sk,
62 struct sk_buff *skb); 62 struct sk_buff *skb);
63 int (*ccid_hc_rx_parse_options)(struct sock *sk, 63 int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
64 unsigned char option, 64 u8 opt, u8 *val, u8 len);
65 unsigned char len, u16 idx,
66 unsigned char* value);
67 int (*ccid_hc_rx_insert_options)(struct sock *sk, 65 int (*ccid_hc_rx_insert_options)(struct sock *sk,
68 struct sk_buff *skb); 66 struct sk_buff *skb);
69 void (*ccid_hc_tx_packet_recv)(struct sock *sk, 67 void (*ccid_hc_tx_packet_recv)(struct sock *sk,
70 struct sk_buff *skb); 68 struct sk_buff *skb);
71 int (*ccid_hc_tx_parse_options)(struct sock *sk, 69 int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
72 unsigned char option, 70 u8 opt, u8 *val, u8 len);
73 unsigned char len, u16 idx,
74 unsigned char* value);
75 int (*ccid_hc_tx_send_packet)(struct sock *sk, 71 int (*ccid_hc_tx_send_packet)(struct sock *sk,
76 struct sk_buff *skb); 72 struct sk_buff *skb);
77 void (*ccid_hc_tx_packet_sent)(struct sock *sk, 73 void (*ccid_hc_tx_packet_sent)(struct sock *sk,
78 int more, unsigned int len); 74 unsigned int len);
79 void (*ccid_hc_rx_get_info)(struct sock *sk, 75 void (*ccid_hc_rx_get_info)(struct sock *sk,
80 struct tcp_info *info); 76 struct tcp_info *info);
81 void (*ccid_hc_tx_get_info)(struct sock *sk, 77 void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -103,31 +99,78 @@ static inline void *ccid_priv(const struct ccid *ccid)
103 return (void *)ccid->ccid_priv; 99 return (void *)ccid->ccid_priv;
104} 100}
105 101
102extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
103extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
104extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
105 char __user *, int __user *);
106
107extern int ccid_request_modules(u8 const *ccid_array, u8 array_len);
106extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, 108extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
107 gfp_t gfp); 109 gfp_t gfp);
108 110
109extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, 111static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
110 gfp_t gfp); 112{
111extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, 113 struct ccid *ccid = dp->dccps_hc_rx_ccid;
112 gfp_t gfp); 114
115 if (ccid == NULL || ccid->ccid_ops == NULL)
116 return -1;
117 return ccid->ccid_ops->ccid_id;
118}
119
120static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
121{
122 struct ccid *ccid = dp->dccps_hc_tx_ccid;
123
124 if (ccid == NULL || ccid->ccid_ops == NULL)
125 return -1;
126 return ccid->ccid_ops->ccid_id;
127}
113 128
114extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); 129extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
115extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); 130extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
116 131
132/*
133 * Congestion control of queued data packets via CCID decision.
134 *
 135 * The TX CCID performs congestion control by indicating whether and when a
136 * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
137 * The following modes are supported via the symbolic constants below:
138 * - timer-based pacing (CCID returns a delay value in milliseconds);
139 * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
140 */
141
142enum ccid_dequeueing_decision {
143 CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
144 CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
145 CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
146 CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
147 CCID_PACKET_ERR = 0xF0000, /* error condition */
148};
149
150static inline int ccid_packet_dequeue_eval(const int return_code)
151{
152 if (return_code < 0)
153 return CCID_PACKET_ERR;
154 if (return_code == 0)
155 return CCID_PACKET_SEND_AT_ONCE;
156 if (return_code <= CCID_PACKET_DELAY_MAX)
157 return CCID_PACKET_DELAY;
158 return return_code;
159}
160
117static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, 161static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
118 struct sk_buff *skb) 162 struct sk_buff *skb)
119{ 163{
120 int rc = 0;
121 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) 164 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
122 rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 165 return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
123 return rc; 166 return CCID_PACKET_SEND_AT_ONCE;
124} 167}
125 168
126static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, 169static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
127 int more, unsigned int len) 170 unsigned int len)
128{ 171{
129 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) 172 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
130 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); 173 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
131} 174}
132 175
133static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, 176static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -144,27 +187,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
144 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); 187 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
145} 188}
146 189
190/**
191 * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
192 * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
193 * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
194 * @val: value of @opt
195 * @len: length of @val in bytes
196 */
147static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, 197static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
148 unsigned char option, 198 u8 pkt, u8 opt, u8 *val, u8 len)
149 unsigned char len, u16 idx,
150 unsigned char* value)
151{ 199{
152 int rc = 0; 200 if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
153 if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) 201 return 0;
154 rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, 202 return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
155 value);
156 return rc;
157} 203}
158 204
205/**
206 * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
207 * Arguments are analogous to ccid_hc_tx_parse_options()
208 */
159static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, 209static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
160 unsigned char option, 210 u8 pkt, u8 opt, u8 *val, u8 len)
161 unsigned char len, u16 idx,
162 unsigned char* value)
163{ 211{
164 int rc = 0; 212 if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
165 if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) 213 return 0;
166 rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); 214 return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
167 return rc;
168} 215}
169 216
170static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, 217static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
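A sketch of how a dequeueing loop might act on the return-code convention
above (the actual logic lives in the output.c part of this patch; the
surrounding context here is illustrative only):

	int rc = ccid_hc_tx_send_packet(ccid, sk, skb);

	switch (ccid_packet_dequeue_eval(rc)) {
	case CCID_PACKET_SEND_AT_ONCE:
		/* transmit skb immediately */
		break;
	case CCID_PACKET_DELAY:
		/* rc is the delay in milliseconds - arm a timer */
		break;
	case CCID_PACKET_WILL_DEQUEUE_LATER:
		/* the CCID will schedule dccps_xmitlet itself */
		break;
	default:		/* CCID_PACKET_ERR */
		/* drop or requeue the packet */
		break;
	}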
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 12275943eab8..fb168be2cb43 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,10 +1,8 @@
1menu "DCCP CCIDs Configuration (EXPERIMENTAL)" 1menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
2 depends on EXPERIMENTAL
3 2
4config IP_DCCP_CCID2 3config IP_DCCP_CCID2
5 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" 4 tristate "CCID2 (TCP-Like)"
6 def_tristate IP_DCCP 5 def_tristate IP_DCCP
7 select IP_DCCP_ACKVEC
8 ---help--- 6 ---help---
9 CCID 2, TCP-like Congestion Control, denotes Additive Increase, 7 CCID 2, TCP-like Congestion Control, denotes Additive Increase,
10 Multiplicative Decrease (AIMD) congestion control with behavior 8 Multiplicative Decrease (AIMD) congestion control with behavior
@@ -36,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG
36 If in doubt, say N. 34 If in doubt, say N.
37 35
38config IP_DCCP_CCID3 36config IP_DCCP_CCID3
39 tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" 37 tristate "CCID3 (TCP-Friendly)"
40 def_tristate IP_DCCP 38 def_tristate IP_DCCP
41 select IP_DCCP_TFRC_LIB 39 select IP_DCCP_TFRC_LIB
42 ---help--- 40 ---help---
@@ -64,9 +62,9 @@ config IP_DCCP_CCID3
64 62
65 If in doubt, say M. 63 If in doubt, say M.
66 64
65if IP_DCCP_CCID3
67config IP_DCCP_CCID3_DEBUG 66config IP_DCCP_CCID3_DEBUG
68 bool "CCID3 debugging messages" 67 bool "CCID3 debugging messages"
69 depends on IP_DCCP_CCID3
70 ---help--- 68 ---help---
71 Enable CCID3-specific debugging messages. 69 Enable CCID3-specific debugging messages.
72 70
@@ -76,10 +74,29 @@ config IP_DCCP_CCID3_DEBUG
76 74
77 If in doubt, say N. 75 If in doubt, say N.
78 76
77choice
78 prompt "Select method for measuring the packet size s"
79 default IP_DCCP_CCID3_MEASURE_S_AS_MPS
80
81config IP_DCCP_CCID3_MEASURE_S_AS_MPS
82 bool "Always use MPS in place of s"
83 ---help---
 84 This is the recommended setting: it is consistent with the initialisation
 85 of X and is suggested when s varies (rfc3448bis, (1) in section 4.1).
86config IP_DCCP_CCID3_MEASURE_S_AS_AVG
87 bool "Use moving average"
88 ---help---
89 An alternative way of tracking s, also supported by rfc3448bis.
90 This used to be the default for CCID-3 in previous kernels.
91config IP_DCCP_CCID3_MEASURE_S_AS_MAX
92 bool "Track the maximum payload length"
93 ---help---
94 An experimental method based on tracking the maximum packet size.
95endchoice
96
79config IP_DCCP_CCID3_RTO 97config IP_DCCP_CCID3_RTO
80 int "Use higher bound for nofeedback timer" 98 int "Use higher bound for nofeedback timer"
81 default 100 99 default 100
82 depends on IP_DCCP_CCID3 && EXPERIMENTAL
83 ---help--- 100 ---help---
 84 Use a higher lower bound for nofeedback timer expiration. 101
85 102
@@ -106,6 +123,7 @@ config IP_DCCP_CCID3_RTO
106 The purpose of the nofeedback timer is to slow DCCP down when there 123 The purpose of the nofeedback timer is to slow DCCP down when there
107 is serious network congestion: experimenting with larger values should 124 is serious network congestion: experimenting with larger values should
108 therefore not be performed on WANs. 125 therefore not be performed on WANs.
126endif # IP_DCCP_CCID3
109 127
110config IP_DCCP_TFRC_LIB 128config IP_DCCP_TFRC_LIB
111 tristate 129 tristate
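For illustration, the three mutually exclusive choices above could select
the packet-size estimator roughly as follows (the helper name and the
averaging weights are assumptions, not part of this patch):

	static inline u32 ccid3_measure_s(u32 cur_s, u32 mps, u32 paylen)
	{
	#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS)
		return mps;			/* use MPS in place of s */
	#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
		/* weighted moving average, 0.9 old / 0.1 new (weights assumed) */
		return cur_s ? (9 * cur_s + paylen) / 10 : paylen;
	#else					/* ..._S_AS_MAX */
		return max(cur_s, paylen);	/* maximum payload length */
	#endif
	}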
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9a430734530c..fa713227c66f 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
25/* 25/*
26 * This implementation should follow RFC 4341 26 * This implementation should follow RFC 4341
27 */ 27 */
28 28#include "../feat.h"
29#include "../ccid.h" 29#include "../ccid.h"
30#include "../dccp.h" 30#include "../dccp.h"
31#include "ccid2.h" 31#include "ccid2.h"
@@ -34,51 +34,8 @@
34#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 34#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
35static int ccid2_debug; 35static int ccid2_debug;
36#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) 36#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
37
38static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
39{
40 int len = 0;
41 int pipe = 0;
42 struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
43
44 /* there is data in the chain */
45 if (seqp != hctx->ccid2hctx_seqt) {
46 seqp = seqp->ccid2s_prev;
47 len++;
48 if (!seqp->ccid2s_acked)
49 pipe++;
50
51 while (seqp != hctx->ccid2hctx_seqt) {
52 struct ccid2_seq *prev = seqp->ccid2s_prev;
53
54 len++;
55 if (!prev->ccid2s_acked)
56 pipe++;
57
58 /* packets are sent sequentially */
59 BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
60 prev->ccid2s_seq ) >= 0);
61 BUG_ON(time_before(seqp->ccid2s_sent,
62 prev->ccid2s_sent));
63
64 seqp = prev;
65 }
66 }
67
68 BUG_ON(pipe != hctx->ccid2hctx_pipe);
69 ccid2_pr_debug("len of chain=%d\n", len);
70
71 do {
72 seqp = seqp->ccid2s_prev;
73 len++;
74 } while (seqp != hctx->ccid2hctx_seqh);
75
76 ccid2_pr_debug("total len=%d\n", len);
77 BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
78}
79#else 37#else
80#define ccid2_pr_debug(format, a...) 38#define ccid2_pr_debug(format, a...)
81#define ccid2_hc_tx_check_sanity(hctx)
82#endif 39#endif
83 40
84static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) 41static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -87,8 +44,7 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
87 int i; 44 int i;
88 45
89 /* check if we have space to preserve the pointer to the buffer */ 46 /* check if we have space to preserve the pointer to the buffer */
90 if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / 47 if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *))
91 sizeof(struct ccid2_seq*)))
92 return -ENOMEM; 48 return -ENOMEM;
93 49
94 /* allocate buffer and initialize linked list */ 50 /* allocate buffer and initialize linked list */
@@ -104,38 +60,35 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
104 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 60 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
105 61
106 /* This is the first allocation. Initiate the head and tail. */ 62 /* This is the first allocation. Initiate the head and tail. */
107 if (hctx->ccid2hctx_seqbufc == 0) 63 if (hctx->seqbufc == 0)
108 hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; 64 hctx->seqh = hctx->seqt = seqp;
109 else { 65 else {
110 /* link the existing list with the one we just created */ 66 /* link the existing list with the one we just created */
111 hctx->ccid2hctx_seqh->ccid2s_next = seqp; 67 hctx->seqh->ccid2s_next = seqp;
112 seqp->ccid2s_prev = hctx->ccid2hctx_seqh; 68 seqp->ccid2s_prev = hctx->seqh;
113 69
114 hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 70 hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
115 seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; 71 seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt;
116 } 72 }
117 73
118 /* store the original pointer to the buffer so we can free it */ 74 /* store the original pointer to the buffer so we can free it */
119 hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; 75 hctx->seqbuf[hctx->seqbufc] = seqp;
120 hctx->ccid2hctx_seqbufc++; 76 hctx->seqbufc++;
121 77
122 return 0; 78 return 0;
123} 79}
124 80
125static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 81static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
126{ 82{
127 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 83 if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
128 84 return CCID_PACKET_WILL_DEQUEUE_LATER;
129 if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) 85 return CCID_PACKET_SEND_AT_ONCE;
130 return 0;
131
132 return 1; /* XXX CCID should dequeue when ready instead of polling */
133} 86}
134 87
135static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) 88static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
136{ 89{
137 struct dccp_sock *dp = dccp_sk(sk); 90 struct dccp_sock *dp = dccp_sk(sk);
138 u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); 91 u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2);
139 92
140 /* 93 /*
141 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from 94 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -147,8 +100,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
147 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); 100 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
148 val = max_ratio; 101 val = max_ratio;
149 } 102 }
150 if (val > 0xFFFF) /* RFC 4340, 11.3 */ 103 if (val > DCCPF_ACK_RATIO_MAX)
151 val = 0xFFFF; 104 val = DCCPF_ACK_RATIO_MAX;
152 105
153 if (val == dp->dccps_l_ack_ratio) 106 if (val == dp->dccps_l_ack_ratio)
154 return; 107 return;
@@ -157,99 +110,77 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
157 dp->dccps_l_ack_ratio = val; 110 dp->dccps_l_ack_ratio = val;
158} 111}
159 112
160static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
161{
162 ccid2_pr_debug("change SRTT to %ld\n", val);
163 hctx->ccid2hctx_srtt = val;
164}
165
166static void ccid2_start_rto_timer(struct sock *sk);
167
168static void ccid2_hc_tx_rto_expire(unsigned long data) 113static void ccid2_hc_tx_rto_expire(unsigned long data)
169{ 114{
170 struct sock *sk = (struct sock *)data; 115 struct sock *sk = (struct sock *)data;
171 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 116 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
172 long s; 117 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
173 118
174 bh_lock_sock(sk); 119 bh_lock_sock(sk);
175 if (sock_owned_by_user(sk)) { 120 if (sock_owned_by_user(sk)) {
176 sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, 121 sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5);
177 jiffies + HZ / 5);
178 goto out; 122 goto out;
179 } 123 }
180 124
181 ccid2_pr_debug("RTO_EXPIRE\n"); 125 ccid2_pr_debug("RTO_EXPIRE\n");
182 126
183 ccid2_hc_tx_check_sanity(hctx);
184
185 /* back-off timer */ 127 /* back-off timer */
186 hctx->ccid2hctx_rto <<= 1; 128 hctx->rto <<= 1;
187 129 if (hctx->rto > DCCP_RTO_MAX)
188 s = hctx->ccid2hctx_rto / HZ; 130 hctx->rto = DCCP_RTO_MAX;
189 if (s > 60)
190 hctx->ccid2hctx_rto = 60 * HZ;
191
192 ccid2_start_rto_timer(sk);
193 131
194 /* adjust pipe, cwnd etc */ 132 /* adjust pipe, cwnd etc */
195 hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; 133 hctx->ssthresh = hctx->cwnd / 2;
196 if (hctx->ccid2hctx_ssthresh < 2) 134 if (hctx->ssthresh < 2)
197 hctx->ccid2hctx_ssthresh = 2; 135 hctx->ssthresh = 2;
198 hctx->ccid2hctx_cwnd = 1; 136 hctx->cwnd = 1;
199 hctx->ccid2hctx_pipe = 0; 137 hctx->pipe = 0;
200 138
201 /* clear state about stuff we sent */ 139 /* clear state about stuff we sent */
202 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; 140 hctx->seqt = hctx->seqh;
203 hctx->ccid2hctx_packets_acked = 0; 141 hctx->packets_acked = 0;
204 142
205 /* clear ack ratio state. */ 143 /* clear ack ratio state. */
206 hctx->ccid2hctx_rpseq = 0; 144 hctx->rpseq = 0;
207 hctx->ccid2hctx_rpdupack = -1; 145 hctx->rpdupack = -1;
208 ccid2_change_l_ack_ratio(sk, 1); 146 ccid2_change_l_ack_ratio(sk, 1);
209 ccid2_hc_tx_check_sanity(hctx); 147
 148 /* if we were blocked before, we may now send a cwnd=1 packet */
149 if (sender_was_blocked)
150 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
151 /* restart backed-off timer */
152 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
210out: 153out:
211 bh_unlock_sock(sk); 154 bh_unlock_sock(sk);
212 sock_put(sk); 155 sock_put(sk);
213} 156}
214 157
215static void ccid2_start_rto_timer(struct sock *sk) 158static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
216{
217 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
218
219 ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
220
221 BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
222 sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
223 jiffies + hctx->ccid2hctx_rto);
224}
225
226static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
227{ 159{
228 struct dccp_sock *dp = dccp_sk(sk); 160 struct dccp_sock *dp = dccp_sk(sk);
229 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 161 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
230 struct ccid2_seq *next; 162 struct ccid2_seq *next;
231 163
232 hctx->ccid2hctx_pipe++; 164 hctx->pipe++;
233 165
234 hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; 166 hctx->seqh->ccid2s_seq = dp->dccps_gss;
235 hctx->ccid2hctx_seqh->ccid2s_acked = 0; 167 hctx->seqh->ccid2s_acked = 0;
236 hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; 168 hctx->seqh->ccid2s_sent = jiffies;
237 169
238 next = hctx->ccid2hctx_seqh->ccid2s_next; 170 next = hctx->seqh->ccid2s_next;
239 /* check if we need to alloc more space */ 171 /* check if we need to alloc more space */
240 if (next == hctx->ccid2hctx_seqt) { 172 if (next == hctx->seqt) {
241 if (ccid2_hc_tx_alloc_seq(hctx)) { 173 if (ccid2_hc_tx_alloc_seq(hctx)) {
242 DCCP_CRIT("packet history - out of memory!"); 174 DCCP_CRIT("packet history - out of memory!");
243 /* FIXME: find a more graceful way to bail out */ 175 /* FIXME: find a more graceful way to bail out */
244 return; 176 return;
245 } 177 }
246 next = hctx->ccid2hctx_seqh->ccid2s_next; 178 next = hctx->seqh->ccid2s_next;
247 BUG_ON(next == hctx->ccid2hctx_seqt); 179 BUG_ON(next == hctx->seqt);
248 } 180 }
249 hctx->ccid2hctx_seqh = next; 181 hctx->seqh = next;
250 182
251 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, 183 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe);
252 hctx->ccid2hctx_pipe);
253 184
254 /* 185 /*
255 * FIXME: The code below is broken and the variables have been removed 186 * FIXME: The code below is broken and the variables have been removed
@@ -272,12 +203,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
272 */ 203 */
273#if 0 204#if 0
274 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ 205 /* Ack Ratio. Need to maintain a concept of how many windows we sent */
275 hctx->ccid2hctx_arsent++; 206 hctx->arsent++;
276 /* We had an ack loss in this window... */ 207 /* We had an ack loss in this window... */
277 if (hctx->ccid2hctx_ackloss) { 208 if (hctx->ackloss) {
278 if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { 209 if (hctx->arsent >= hctx->cwnd) {
279 hctx->ccid2hctx_arsent = 0; 210 hctx->arsent = 0;
280 hctx->ccid2hctx_ackloss = 0; 211 hctx->ackloss = 0;
281 } 212 }
282 } else { 213 } else {
283 /* No acks lost up to now... */ 214 /* No acks lost up to now... */
@@ -287,28 +218,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
287 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - 218 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
288 dp->dccps_l_ack_ratio; 219 dp->dccps_l_ack_ratio;
289 220
290 denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; 221 denom = hctx->cwnd * hctx->cwnd / denom;
291 222
292 if (hctx->ccid2hctx_arsent >= denom) { 223 if (hctx->arsent >= denom) {
293 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); 224 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
294 hctx->ccid2hctx_arsent = 0; 225 hctx->arsent = 0;
295 } 226 }
296 } else { 227 } else {
297 /* we can't increase ack ratio further [1] */ 228 /* we can't increase ack ratio further [1] */
 298 hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd */ 229 hctx->arsent = 0; /* or maybe set it to cwnd */
299 } 230 }
300 } 231 }
301#endif 232#endif
302 233
303 /* setup RTO timer */ 234 /* setup RTO timer */
304 if (!timer_pending(&hctx->ccid2hctx_rtotimer)) 235 if (!timer_pending(&hctx->rtotimer))
305 ccid2_start_rto_timer(sk); 236 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
306 237
307#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 238#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
308 do { 239 do {
309 struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; 240 struct ccid2_seq *seqp = hctx->seqt;
310 241
311 while (seqp != hctx->ccid2hctx_seqh) { 242 while (seqp != hctx->seqh) {
312 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 243 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
313 (unsigned long long)seqp->ccid2s_seq, 244 (unsigned long long)seqp->ccid2s_seq,
314 seqp->ccid2s_acked, seqp->ccid2s_sent); 245 seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -316,205 +247,158 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
316 } 247 }
317 } while (0); 248 } while (0);
318 ccid2_pr_debug("=========\n"); 249 ccid2_pr_debug("=========\n");
319 ccid2_hc_tx_check_sanity(hctx);
320#endif 250#endif
321} 251}
322 252
323/* XXX Lame code duplication! 253/**
324 * returns -1 if none was found. 254 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
 325 * else returns the next offset to use in the function call. 255 * This code is almost identical to TCP's tcp_rtt_estimator(), since
256 * - it has a higher sampling frequency (recommended by RFC 1323),
257 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
258 * - it is simple (cf. more complex proposals such as Eifel timer or research
259 * which suggests that the gain should be set according to window size),
260 * - in tests it was found to work well with CCID2 [gerrit].
326 */ 261 */
327static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, 262static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
328 unsigned char **vec, unsigned char *veclen)
329{ 263{
330 const struct dccp_hdr *dh = dccp_hdr(skb); 264 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
331 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 265 long m = mrtt ? : 1;
332 unsigned char *opt_ptr; 266
333 const unsigned char *opt_end = (unsigned char *)dh + 267 if (hctx->srtt == 0) {
334 (dh->dccph_doff * 4); 268 /* First measurement m */
335 unsigned char opt, len; 269 hctx->srtt = m << 3;
336 unsigned char *value; 270 hctx->mdev = m << 1;
337 271
338 BUG_ON(offset < 0); 272 hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
339 options += offset; 273 hctx->rttvar = hctx->mdev_max;
340 opt_ptr = options; 274 hctx->rtt_seq = dccp_sk(sk)->dccps_gss;
341 if (opt_ptr >= opt_end) 275 } else {
342 return -1; 276 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
343 277 m -= (hctx->srtt >> 3);
344 while (opt_ptr != opt_end) { 278 hctx->srtt += m;
345 opt = *opt_ptr++; 279
346 len = 0; 280 /* Similarly, update scaled mdev with regard to |m| */
347 value = NULL; 281 if (m < 0) {
348 282 m = -m;
349 /* Check if this isn't a single byte option */ 283 m -= (hctx->mdev >> 2);
350 if (opt > DCCPO_MAX_RESERVED) {
351 if (opt_ptr == opt_end)
352 goto out_invalid_option;
353
354 len = *opt_ptr++;
355 if (len < 3)
356 goto out_invalid_option;
357 /* 284 /*
358 * Remove the type and len fields, leaving 285 * This neutralises RTO increase when RTT < SRTT - mdev
 359 * just the value size 286 * (see P. Sarolahti, A. Kuznetsov, "Congestion Control
287 * in Linux TCP", USENIX 2002, pp. 49-62).
360 */ 288 */
361 len -= 2; 289 if (m > 0)
362 value = opt_ptr; 290 m >>= 3;
363 opt_ptr += len; 291 } else {
292 m -= (hctx->mdev >> 2);
293 }
294 hctx->mdev += m;
364 295
365 if (opt_ptr > opt_end) 296 if (hctx->mdev > hctx->mdev_max) {
366 goto out_invalid_option; 297 hctx->mdev_max = hctx->mdev;
298 if (hctx->mdev_max > hctx->rttvar)
299 hctx->rttvar = hctx->mdev_max;
367 } 300 }
368 301
369 switch (opt) { 302 /*
370 case DCCPO_ACK_VECTOR_0: 303 * Decay RTTVAR at most once per flight, exploiting that
371 case DCCPO_ACK_VECTOR_1: 304 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
372 *vec = value; 305 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
373 *veclen = len; 306 * GAR is a useful bound for FlightSize = pipe, AWL is probably
374 return offset + (opt_ptr - options); 307 * too low as it over-estimates pipe.
308 */
309 if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
310 if (hctx->mdev_max < hctx->rttvar)
311 hctx->rttvar -= (hctx->rttvar -
312 hctx->mdev_max) >> 2;
313 hctx->rtt_seq = dccp_sk(sk)->dccps_gss;
314 hctx->mdev_max = TCP_RTO_MIN;
375 } 315 }
376 } 316 }
377 317
378 return -1; 318 /*
379 319 * Set RTO from SRTT and RTTVAR
380out_invalid_option: 320 * Clock granularity is ignored since the minimum error for RTTVAR is
381 DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); 321 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
382 return -1; 322 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
383} 323 * does not retransmit data, DCCP does not require TCP's recommended
384 324 * minimum timeout of one second".
385static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) 325 */
386{ 326 hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
387 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
388 327
389 sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); 328 if (hctx->rto > DCCP_RTO_MAX)
390 ccid2_pr_debug("deleted RTO timer\n"); 329 hctx->rto = DCCP_RTO_MAX;
391} 330}
392 331
393static inline void ccid2_new_ack(struct sock *sk, 332static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
394 struct ccid2_seq *seqp, 333 unsigned int *maxincr)
395 unsigned int *maxincr)
396{ 334{
397 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 335 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
398 336
399 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { 337 if (hctx->cwnd < hctx->ssthresh) {
400 if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { 338 if (*maxincr > 0 && ++hctx->packets_acked == 2) {
401 hctx->ccid2hctx_cwnd += 1; 339 hctx->cwnd += 1;
402 *maxincr -= 1; 340 *maxincr -= 1;
403 hctx->ccid2hctx_packets_acked = 0; 341 hctx->packets_acked = 0;
404 } 342 }
405 } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { 343 } else if (++hctx->packets_acked >= hctx->cwnd) {
406 hctx->ccid2hctx_cwnd += 1; 344 hctx->cwnd += 1;
407 hctx->ccid2hctx_packets_acked = 0; 345 hctx->packets_acked = 0;
408 } 346 }
409 347 /*
410 /* update RTO */ 348 * FIXME: RTT is sampled several times per acknowledgment (for each
411 if (hctx->ccid2hctx_srtt == -1 || 349 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
412 time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { 350 * This causes the RTT to be over-estimated, since the older entries
413 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; 351 * in the Ack Vector have earlier sending times.
414 int s; 352 * The cleanest solution is to not use the ccid2s_sent field at all
 415 353 * and instead use DCCP timestamps - this needs to be resolved at some time.
416 /* first measurement */ 354 */
417 if (hctx->ccid2hctx_srtt == -1) { 355 ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
418 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
419 r, jiffies,
420 (unsigned long long)seqp->ccid2s_seq);
421 ccid2_change_srtt(hctx, r);
422 hctx->ccid2hctx_rttvar = r >> 1;
423 } else {
424 /* RTTVAR */
425 long tmp = hctx->ccid2hctx_srtt - r;
426 long srtt;
427
428 if (tmp < 0)
429 tmp *= -1;
430
431 tmp >>= 2;
432 hctx->ccid2hctx_rttvar *= 3;
433 hctx->ccid2hctx_rttvar >>= 2;
434 hctx->ccid2hctx_rttvar += tmp;
435
436 /* SRTT */
437 srtt = hctx->ccid2hctx_srtt;
438 srtt *= 7;
439 srtt >>= 3;
440 tmp = r >> 3;
441 srtt += tmp;
442 ccid2_change_srtt(hctx, srtt);
443 }
444 s = hctx->ccid2hctx_rttvar << 2;
445 /* clock granularity is 1 when based on jiffies */
446 if (!s)
447 s = 1;
448 hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
449
450 /* must be at least a second */
451 s = hctx->ccid2hctx_rto / HZ;
452 /* DCCP doesn't require this [but I like it cuz my code sux] */
453#if 1
454 if (s < 1)
455 hctx->ccid2hctx_rto = HZ;
456#endif
457 /* max 60 seconds */
458 if (s > 60)
459 hctx->ccid2hctx_rto = HZ * 60;
460
461 hctx->ccid2hctx_lastrtt = jiffies;
462
463 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
464 hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
465 hctx->ccid2hctx_rto, HZ, r);
466 }
467
468 /* we got a new ack, so re-start RTO timer */
469 ccid2_hc_tx_kill_rto_timer(sk);
470 ccid2_start_rto_timer(sk);
471}
472
473static void ccid2_hc_tx_dec_pipe(struct sock *sk)
474{
475 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
476
477 if (hctx->ccid2hctx_pipe == 0)
478 DCCP_BUG("pipe == 0");
479 else
480 hctx->ccid2hctx_pipe--;
481
482 if (hctx->ccid2hctx_pipe == 0)
483 ccid2_hc_tx_kill_rto_timer(sk);
484} 356}
485 357
486static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 358static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
487{ 359{
488 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 360 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
489 361
490 if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { 362 if (time_before(seqp->ccid2s_sent, hctx->last_cong)) {
491 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 363 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
492 return; 364 return;
493 } 365 }
494 366
495 hctx->ccid2hctx_last_cong = jiffies; 367 hctx->last_cong = jiffies;
496 368
497 hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; 369 hctx->cwnd = hctx->cwnd / 2 ? : 1U;
498 hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); 370 hctx->ssthresh = max(hctx->cwnd, 2U);
499 371
500 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ 372 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
501 if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) 373 if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd)
502 ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); 374 ccid2_change_l_ack_ratio(sk, hctx->cwnd);
375}
376
377static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
378 u8 option, u8 *optval, u8 optlen)
379{
380 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
381
382 switch (option) {
383 case DCCPO_ACK_VECTOR_0:
384 case DCCPO_ACK_VECTOR_1:
385 return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen,
386 option - DCCPO_ACK_VECTOR_0);
387 }
388 return 0;
503} 389}
504 390
505static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 391static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
506{ 392{
507 struct dccp_sock *dp = dccp_sk(sk); 393 struct dccp_sock *dp = dccp_sk(sk);
508 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 394 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
395 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
396 struct dccp_ackvec_parsed *avp;
509 u64 ackno, seqno; 397 u64 ackno, seqno;
510 struct ccid2_seq *seqp; 398 struct ccid2_seq *seqp;
511 unsigned char *vector;
512 unsigned char veclen;
513 int offset = 0;
514 int done = 0; 399 int done = 0;
515 unsigned int maxincr = 0; 400 unsigned int maxincr = 0;
516 401
517 ccid2_hc_tx_check_sanity(hctx);
518 /* check reverse path congestion */ 402 /* check reverse path congestion */
519 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 403 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
520 404
@@ -523,21 +407,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
523 * -sorbo. 407 * -sorbo.
524 */ 408 */
525 /* need to bootstrap */ 409 /* need to bootstrap */
526 if (hctx->ccid2hctx_rpdupack == -1) { 410 if (hctx->rpdupack == -1) {
527 hctx->ccid2hctx_rpdupack = 0; 411 hctx->rpdupack = 0;
528 hctx->ccid2hctx_rpseq = seqno; 412 hctx->rpseq = seqno;
529 } else { 413 } else {
530 /* check if packet is consecutive */ 414 /* check if packet is consecutive */
531 if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) 415 if (dccp_delta_seqno(hctx->rpseq, seqno) == 1)
532 hctx->ccid2hctx_rpseq = seqno; 416 hctx->rpseq = seqno;
533 /* it's a later packet */ 417 /* it's a later packet */
534 else if (after48(seqno, hctx->ccid2hctx_rpseq)) { 418 else if (after48(seqno, hctx->rpseq)) {
535 hctx->ccid2hctx_rpdupack++; 419 hctx->rpdupack++;
536 420
537 /* check if we got enough dupacks */ 421 /* check if we got enough dupacks */
538 if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { 422 if (hctx->rpdupack >= NUMDUPACK) {
539 hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ 423 hctx->rpdupack = -1; /* XXX lame */
540 hctx->ccid2hctx_rpseq = 0; 424 hctx->rpseq = 0;
541 425
542 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); 426 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
543 } 427 }
@@ -545,27 +429,22 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
545 } 429 }
546 430
547 /* check forward path congestion */ 431 /* check forward path congestion */
548 /* still didn't send out new data packets */ 432 if (dccp_packet_without_ack(skb))
549 if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
550 return; 433 return;
551 434
552 switch (DCCP_SKB_CB(skb)->dccpd_type) { 435 /* still didn't send out new data packets */
553 case DCCP_PKT_ACK: 436 if (hctx->seqh == hctx->seqt)
554 case DCCP_PKT_DATAACK: 437 goto done;
555 break;
556 default:
557 return;
558 }
559 438
560 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 439 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
561 if (after48(ackno, hctx->ccid2hctx_high_ack)) 440 if (after48(ackno, hctx->high_ack))
562 hctx->ccid2hctx_high_ack = ackno; 441 hctx->high_ack = ackno;
563 442
564 seqp = hctx->ccid2hctx_seqt; 443 seqp = hctx->seqt;
565 while (before48(seqp->ccid2s_seq, ackno)) { 444 while (before48(seqp->ccid2s_seq, ackno)) {
566 seqp = seqp->ccid2s_next; 445 seqp = seqp->ccid2s_next;
567 if (seqp == hctx->ccid2hctx_seqh) { 446 if (seqp == hctx->seqh) {
568 seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 447 seqp = hctx->seqh->ccid2s_prev;
569 break; 448 break;
570 } 449 }
571 } 450 }
@@ -575,26 +454,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
575 * packets per acknowledgement. Rounding up avoids that cwnd is not 454 * packets per acknowledgement. Rounding up avoids that cwnd is not
576 * advanced when Ack Ratio is 1 and gives a slight edge otherwise. 455 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
577 */ 456 */
578 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) 457 if (hctx->cwnd < hctx->ssthresh)
579 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); 458 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
580 459
581 /* go through all ack vectors */ 460 /* go through all ack vectors */
582 while ((offset = ccid2_ackvector(sk, skb, offset, 461 list_for_each_entry(avp, &hctx->av_chunks, node) {
583 &vector, &veclen)) != -1) {
584 /* go through this ack vector */ 462 /* go through this ack vector */
585 while (veclen--) { 463 for (; avp->len--; avp->vec++) {
586 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; 464 u64 ackno_end_rl = SUB48(ackno,
587 u64 ackno_end_rl = SUB48(ackno, rl); 465 dccp_ackvec_runlen(avp->vec));
588 466
589 ccid2_pr_debug("ackvec start:%llu end:%llu\n", 467 ccid2_pr_debug("ackvec %llu |%u,%u|\n",
590 (unsigned long long)ackno, 468 (unsigned long long)ackno,
591 (unsigned long long)ackno_end_rl); 469 dccp_ackvec_state(avp->vec) >> 6,
470 dccp_ackvec_runlen(avp->vec));
592 /* if the seqno we are analyzing is larger than the 471 /* if the seqno we are analyzing is larger than the
593 * current ackno, then move towards the tail of our 472 * current ackno, then move towards the tail of our
594 * seqnos. 473 * seqnos.
595 */ 474 */
596 while (after48(seqp->ccid2s_seq, ackno)) { 475 while (after48(seqp->ccid2s_seq, ackno)) {
597 if (seqp == hctx->ccid2hctx_seqt) { 476 if (seqp == hctx->seqt) {
598 done = 1; 477 done = 1;
599 break; 478 break;
600 } 479 }
@@ -607,26 +486,24 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
607 * run length 486 * run length
608 */ 487 */
609 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { 488 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
610 const u8 state = *vector & 489 const u8 state = dccp_ackvec_state(avp->vec);
611 DCCP_ACKVEC_STATE_MASK;
612 490
613 /* new packet received or marked */ 491 /* new packet received or marked */
614 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && 492 if (state != DCCPAV_NOT_RECEIVED &&
615 !seqp->ccid2s_acked) { 493 !seqp->ccid2s_acked) {
616 if (state == 494 if (state == DCCPAV_ECN_MARKED)
617 DCCP_ACKVEC_STATE_ECN_MARKED) {
618 ccid2_congestion_event(sk, 495 ccid2_congestion_event(sk,
619 seqp); 496 seqp);
620 } else 497 else
621 ccid2_new_ack(sk, seqp, 498 ccid2_new_ack(sk, seqp,
622 &maxincr); 499 &maxincr);
623 500
624 seqp->ccid2s_acked = 1; 501 seqp->ccid2s_acked = 1;
625 ccid2_pr_debug("Got ack for %llu\n", 502 ccid2_pr_debug("Got ack for %llu\n",
626 (unsigned long long)seqp->ccid2s_seq); 503 (unsigned long long)seqp->ccid2s_seq);
627 ccid2_hc_tx_dec_pipe(sk); 504 hctx->pipe--;
628 } 505 }
629 if (seqp == hctx->ccid2hctx_seqt) { 506 if (seqp == hctx->seqt) {
630 done = 1; 507 done = 1;
631 break; 508 break;
632 } 509 }
@@ -636,7 +513,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
636 break; 513 break;
637 514
638 ackno = SUB48(ackno_end_rl, 1); 515 ackno = SUB48(ackno_end_rl, 1);
639 vector++;
640 } 516 }
641 if (done) 517 if (done)
642 break; 518 break;
@@ -645,11 +521,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
645 /* The state about what is acked should be correct now 521 /* The state about what is acked should be correct now
646 * Check for NUMDUPACK 522 * Check for NUMDUPACK
647 */ 523 */
648 seqp = hctx->ccid2hctx_seqt; 524 seqp = hctx->seqt;
649 while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { 525 while (before48(seqp->ccid2s_seq, hctx->high_ack)) {
650 seqp = seqp->ccid2s_next; 526 seqp = seqp->ccid2s_next;
651 if (seqp == hctx->ccid2hctx_seqh) { 527 if (seqp == hctx->seqh) {
652 seqp = hctx->ccid2hctx_seqh->ccid2s_prev; 528 seqp = hctx->seqh->ccid2s_prev;
653 break; 529 break;
654 } 530 }
655 } 531 }
@@ -660,7 +536,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
660 if (done == NUMDUPACK) 536 if (done == NUMDUPACK)
661 break; 537 break;
662 } 538 }
663 if (seqp == hctx->ccid2hctx_seqt) 539 if (seqp == hctx->seqt)
664 break; 540 break;
665 seqp = seqp->ccid2s_prev; 541 seqp = seqp->ccid2s_prev;
666 } 542 }
@@ -681,25 +557,34 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
681 * one ack vector. 557 * one ack vector.
682 */ 558 */
683 ccid2_congestion_event(sk, seqp); 559 ccid2_congestion_event(sk, seqp);
684 ccid2_hc_tx_dec_pipe(sk); 560 hctx->pipe--;
685 } 561 }
686 if (seqp == hctx->ccid2hctx_seqt) 562 if (seqp == hctx->seqt)
687 break; 563 break;
688 seqp = seqp->ccid2s_prev; 564 seqp = seqp->ccid2s_prev;
689 } 565 }
690 566
691 hctx->ccid2hctx_seqt = last_acked; 567 hctx->seqt = last_acked;
692 } 568 }
693 569
694 /* trim acked packets in tail */ 570 /* trim acked packets in tail */
695 while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { 571 while (hctx->seqt != hctx->seqh) {
696 if (!hctx->ccid2hctx_seqt->ccid2s_acked) 572 if (!hctx->seqt->ccid2s_acked)
697 break; 573 break;
698 574
699 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; 575 hctx->seqt = hctx->seqt->ccid2s_next;
700 } 576 }
701 577
702 ccid2_hc_tx_check_sanity(hctx); 578 /* restart RTO timer if not all outstanding data has been acked */
579 if (hctx->pipe == 0)
580 sk_stop_timer(sk, &hctx->rtotimer);
581 else
582 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
583done:
584 /* check if incoming Acks allow pending packets to be sent */
585 if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
586 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
587 dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
703} 588}
704 589
705static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) 590static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -709,17 +594,13 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
709 u32 max_ratio; 594 u32 max_ratio;
710 595
711 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 596 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
712 hctx->ccid2hctx_ssthresh = ~0U; 597 hctx->ssthresh = ~0U;
713 598
714 /* 599 /* Use larger initial windows (RFC 3390, rfc2581bis) */
715 * RFC 4341, 5: "The cwnd parameter is initialized to at most four 600 hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
716 * packets for new connections, following the rules from [RFC3390]".
717 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
718 */
719 hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
720 601
721 /* Make sure that Ack Ratio is enabled and within bounds. */ 602 /* Make sure that Ack Ratio is enabled and within bounds. */
722 max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); 603 max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
723 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) 604 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
724 dp->dccps_l_ack_ratio = max_ratio; 605 dp->dccps_l_ack_ratio = max_ratio;
725 606
@@ -727,15 +608,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
727 if (ccid2_hc_tx_alloc_seq(hctx)) 608 if (ccid2_hc_tx_alloc_seq(hctx))
728 return -ENOMEM; 609 return -ENOMEM;
729 610
730 hctx->ccid2hctx_rto = 3 * HZ; 611 hctx->rto = DCCP_TIMEOUT_INIT;
731 ccid2_change_srtt(hctx, -1); 612 hctx->rpdupack = -1;
732 hctx->ccid2hctx_rttvar = -1; 613 hctx->last_cong = jiffies;
733 hctx->ccid2hctx_rpdupack = -1; 614 setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
734 hctx->ccid2hctx_last_cong = jiffies; 615 INIT_LIST_HEAD(&hctx->av_chunks);
735 setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
736 (unsigned long)sk);
737
738 ccid2_hc_tx_check_sanity(hctx);
739 return 0; 616 return 0;
740} 617}
741 618
@@ -744,11 +621,11 @@ static void ccid2_hc_tx_exit(struct sock *sk)
744 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 621 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
745 int i; 622 int i;
746 623
747 ccid2_hc_tx_kill_rto_timer(sk); 624 sk_stop_timer(sk, &hctx->rtotimer);
748 625
749 for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) 626 for (i = 0; i < hctx->seqbufc; i++)
750 kfree(hctx->ccid2hctx_seqbuf[i]); 627 kfree(hctx->seqbuf[i]);
751 hctx->ccid2hctx_seqbufc = 0; 628 hctx->seqbufc = 0;
752} 629}
753 630
754static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 631static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -759,27 +636,28 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
759 switch (DCCP_SKB_CB(skb)->dccpd_type) { 636 switch (DCCP_SKB_CB(skb)->dccpd_type) {
760 case DCCP_PKT_DATA: 637 case DCCP_PKT_DATA:
761 case DCCP_PKT_DATAACK: 638 case DCCP_PKT_DATAACK:
762 hcrx->ccid2hcrx_data++; 639 hcrx->data++;
763 if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { 640 if (hcrx->data >= dp->dccps_r_ack_ratio) {
764 dccp_send_ack(sk); 641 dccp_send_ack(sk);
765 hcrx->ccid2hcrx_data = 0; 642 hcrx->data = 0;
766 } 643 }
767 break; 644 break;
768 } 645 }
769} 646}
770 647
771static struct ccid_operations ccid2 = { 648static struct ccid_operations ccid2 = {
772 .ccid_id = DCCPC_CCID2, 649 .ccid_id = DCCPC_CCID2,
773 .ccid_name = "TCP-like", 650 .ccid_name = "TCP-like",
774 .ccid_owner = THIS_MODULE, 651 .ccid_owner = THIS_MODULE,
775 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 652 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
776 .ccid_hc_tx_init = ccid2_hc_tx_init, 653 .ccid_hc_tx_init = ccid2_hc_tx_init,
777 .ccid_hc_tx_exit = ccid2_hc_tx_exit, 654 .ccid_hc_tx_exit = ccid2_hc_tx_exit,
778 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 655 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
779 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 656 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
780 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 657 .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
781 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 658 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
782 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, 659 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
660 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
783}; 661};
784 662
785#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 663#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
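In unscaled form, the update implemented by ccid2_rtt_estimator() above is
the standard RFC 2988 computation (the code keeps SRTT scaled by 2^3 and
mdev by 2^2 to avoid divisions):

	/*
	 * For each RTT sample m, with alpha = 1/8 and beta = 1/4:
	 *
	 *	SRTT   <- (1 - 1/8) * SRTT   + 1/8 * m
	 *	RTTVAR <- (1 - 1/4) * RTTVAR + 1/4 * |m - SRTT|
	 *	RTO    <- SRTT + 4 * RTTVAR, clamped to DCCP_RTO_MAX
	 */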
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 2c94ca029010..8b7a2dee2f6d 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,34 +42,49 @@ struct ccid2_seq {
42 42
43/** struct ccid2_hc_tx_sock - CCID2 TX half connection 43/** struct ccid2_hc_tx_sock - CCID2 TX half connection
44 * 44 *
45 * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 45 * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
46 * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) 46 * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
47 * @ccid2hctx_lastrtt -time RTT was last measured 47 * @srtt: smoothed RTT estimate, scaled by 2^3
48 * @ccid2hctx_rpseq - last consecutive seqno 48 * @mdev: smoothed RTT variation, scaled by 2^2
49 * @ccid2hctx_rpdupack - dupacks since rpseq 49 * @mdev_max: maximum of @mdev during one flight
50*/ 50 * @rttvar: moving average/maximum of @mdev_max
51 * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
52 * @rtt_seq: to decay RTTVAR at most once per flight
53 * @rpseq: last consecutive seqno
54 * @rpdupack: dupacks since rpseq
55 * @av_chunks: list of Ack Vectors received on current skb
56 */
51struct ccid2_hc_tx_sock { 57struct ccid2_hc_tx_sock {
52 u32 ccid2hctx_cwnd; 58 u32 cwnd;
53 u32 ccid2hctx_ssthresh; 59 u32 ssthresh;
54 u32 ccid2hctx_pipe; 60 u32 pipe;
55 u32 ccid2hctx_packets_acked; 61 u32 packets_acked;
56 struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; 62 struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX];
57 int ccid2hctx_seqbufc; 63 int seqbufc;
58 struct ccid2_seq *ccid2hctx_seqh; 64 struct ccid2_seq *seqh;
59 struct ccid2_seq *ccid2hctx_seqt; 65 struct ccid2_seq *seqt;
60 long ccid2hctx_rto; 66 /* RTT measurement: variables/principles are the same as in TCP */
61 long ccid2hctx_srtt; 67 u32 srtt,
62 long ccid2hctx_rttvar; 68 mdev,
63 unsigned long ccid2hctx_lastrtt; 69 mdev_max,
64 struct timer_list ccid2hctx_rtotimer; 70 rttvar,
65 u64 ccid2hctx_rpseq; 71 rto;
66 int ccid2hctx_rpdupack; 72 u64 rtt_seq:48;
67 unsigned long ccid2hctx_last_cong; 73 struct timer_list rtotimer;
68 u64 ccid2hctx_high_ack; 74 u64 rpseq;
75 int rpdupack;
76 unsigned long last_cong;
77 u64 high_ack;
78 struct list_head av_chunks;
69}; 79};
70 80
81static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
82{
83 return (hctx->pipe >= hctx->cwnd);
84}
85
71struct ccid2_hc_rx_sock { 86struct ccid2_hc_rx_sock {
72 int ccid2hcrx_data; 87 int data;
73}; 88};
74 89
75static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) 90static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
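The 48-bit fields used here (@rtt_seq above, @avr_ack_seqno earlier) rely on
modulo-2^48 sequence arithmetic. A minimal sketch of the semantics of the
helpers this patch calls - the real SUB48()/after48() are defined in
net/dccp/dccp.h, the bodies below are illustrative:

	#define EX_SEQ48_MASK	0xFFFFFFFFFFFFull		/* 2^48 - 1 */
	#define EX_SUB48(a, b)	(((a) - (b)) & EX_SEQ48_MASK)	/* mod 2^48 */

	static inline int ex_after48(u64 a, u64 b)	/* does a follow b? */
	{
		u64 d = EX_SUB48(a, b);

		return d != 0 && d < (1ull << 47);	/* within half the space */
	}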
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 3b8bd7ca6761..06cfdad84a6a 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,75 +49,41 @@ static int ccid3_debug;
49/* 49/*
50 * Transmitter Half-Connection Routines 50 * Transmitter Half-Connection Routines
51 */ 51 */
52#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 52/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */
53static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) 53static int do_osc_prev = true;
54{
55 static char *ccid3_state_names[] = {
56 [TFRC_SSTATE_NO_SENT] = "NO_SENT",
57 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
58 [TFRC_SSTATE_FBACK] = "FBACK",
59 [TFRC_SSTATE_TERM] = "TERM",
60 };
61
62 return ccid3_state_names[state];
63}
64#endif
65
66static void ccid3_hc_tx_set_state(struct sock *sk,
67 enum ccid3_hc_tx_states state)
68{
69 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
70 enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
71
72 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
73 dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
74 ccid3_tx_state_name(state));
75 WARN_ON(state == oldstate);
76 hctx->ccid3hctx_state = state;
77}
78 54
79/* 55/*
80 * Compute the initial sending rate X_init in the manner of RFC 3390: 56 * Compute the initial sending rate X_init in the manner of RFC 3390:
81 * 57 *
82 * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT 58 * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT
83 * 59 *
84 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
85 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
86 * For consistency with other parts of the code, X_init is scaled by 2^6. 60 * For consistency with other parts of the code, X_init is scaled by 2^6.
87 */ 61 */
88static inline u64 rfc3390_initial_rate(struct sock *sk) 62static inline u64 rfc3390_initial_rate(struct sock *sk)
89{ 63{
90 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 64 const u32 mps = dccp_sk(sk)->dccps_mss_cache,
91 const __u32 w_init = clamp_t(__u32, 4380U, 65 w_init = clamp(4380U, 2 * mps, 4 * mps);
92 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
93 66
94 return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); 67 return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt);
95} 68}
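A worked userspace example of the X_init computation above. The scaled_div() helper is assumed to behave like the one in the kernel's dccp.h, i.e. it multiplies the numerator by USEC_PER_SEC so that dividing by a microsecond RTT yields a per-second rate.

#include <stdint.h>

/* Assumed contract of scaled_div() (cf. dccp.h): a * 10^6 / b, so a byte
 * count divided by an RTT in microseconds gives bytes/second. */
static uint64_t scaled_div(uint64_t a, uint64_t b)
{
        return b ? (a * 1000000) / b : 0;
}

static uint32_t clamp_u32(uint32_t val, uint32_t lo, uint32_t hi)
{
        return val < lo ? lo : (val > hi ? hi : val);
}

/* X_init = min(4*MPS, max(2*MPS, 4380)) / RTT, scaled by 2^6.
 * Example: MPS = 1460 bytes, RTT = 100 ms -> w_init = 4380 bytes and
 * X_init = 4380*64*10^6/10^5 = 2803200, i.e. 43800 bytes/second. */
static uint64_t initial_rate(uint32_t mps, uint32_t rtt_us)
{
        uint32_t w_init = clamp_u32(4380U, 2 * mps, 4 * mps);

        return scaled_div((uint64_t)w_init << 6, rtt_us);
}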
96 69
97/* 70/**
98 * Recalculate t_ipi and delta (should be called whenever X changes) 71 * ccid3_update_send_interval - Calculate new t_ipi = s / X
72 * This respects the granularity of X (64 * bytes/second) and enforces the
73 * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
99 */ 74 */
100static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) 75static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
101{ 76{
102 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ 77 if (unlikely(hctx->x <= hctx->s))
103 hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, 78 hctx->x = hctx->s;
104 hctx->ccid3hctx_x); 79 hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x);
105
106 /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
107 hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
108 TFRC_OPSYS_HALF_TIME_GRAN);
109
110 ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
111 hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
112 hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
113
114} 80}
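The same scaling worked through for t_ipi. Continuing the example above (s = 1460 bytes, X = 2803200 in 2^6-scaled units, i.e. 43800 bytes/second), the sketch below yields t_ipi = 33333 usec, three packets per 100 ms RTT; at the enforced floor x = s the interval becomes 64 seconds, exactly t_mbi. The scaled_div32() stand-in is assumed to saturate at 32 bits like the kernel helper.

#include <stdint.h>

/* Assumed to mirror scaled_div32(): like scaled_div(), saturating at
 * 32 bits. */
static uint32_t scaled_div32(uint64_t a, uint64_t b)
{
        uint64_t d = b ? (a * 1000000) / b : 0;

        return d > 0xffffffffULL ? 0xffffffffU : (uint32_t)d;
}

/* t_ipi = s / X. Since x is scaled by 2^6, the value `s' read as a
 * scaled rate equals s/64 = s/t_mbi bytes/second -- the minimum that
 * the guard in the function above enforces. */
static uint32_t update_send_interval(uint16_t s, uint64_t *x)
{
        if (*x <= s)
                *x = s;
        return scaled_div32((uint64_t)s << 6, *x);
}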
115 81
116static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) 82static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
117{ 83{
118 u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); 84 u32 delta = ktime_us_delta(now, hctx->t_last_win_count);
119 85
120 return delta / hctx->ccid3hctx_rtt; 86 return delta / hctx->rtt;
121} 87}
122 88
123/** 89/**
@@ -133,8 +99,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
133static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) 99static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
134{ 100{
135 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 101 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
136 __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; 102 u64 min_rate = 2 * hctx->x_recv;
137 const __u64 old_x = hctx->ccid3hctx_x; 103 const u64 old_x = hctx->x;
138 ktime_t now = stamp ? *stamp : ktime_get_real(); 104 ktime_t now = stamp ? *stamp : ktime_get_real();
139 105
140 /* 106 /*
@@ -145,50 +111,44 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
145 */ 111 */
146 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { 112 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
147 min_rate = rfc3390_initial_rate(sk); 113 min_rate = rfc3390_initial_rate(sk);
148 min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); 114 min_rate = max(min_rate, 2 * hctx->x_recv);
149 } 115 }
150 116
151 if (hctx->ccid3hctx_p > 0) { 117 if (hctx->p > 0) {
152 118
153 hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, 119 hctx->x = min(((u64)hctx->x_calc) << 6, min_rate);
154 min_rate);
155 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
156 (((__u64)hctx->ccid3hctx_s) << 6) /
157 TFRC_T_MBI);
158 120
159 } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) 121 } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) {
160 - (s64)hctx->ccid3hctx_rtt >= 0) {
161 122
162 hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); 123 hctx->x = min(2 * hctx->x, min_rate);
163 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, 124 hctx->x = max(hctx->x,
164 scaled_div(((__u64)hctx->ccid3hctx_s) << 6, 125 scaled_div(((u64)hctx->s) << 6, hctx->rtt));
165 hctx->ccid3hctx_rtt)); 126 hctx->t_ld = now;
166 hctx->ccid3hctx_t_ld = now;
167 } 127 }
168 128
169 if (hctx->ccid3hctx_x != old_x) { 129 if (hctx->x != old_x) {
170 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " 130 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
171 "X_recv=%u\n", (unsigned)(old_x >> 6), 131 "X_recv=%u\n", (unsigned)(old_x >> 6),
172 (unsigned)(hctx->ccid3hctx_x >> 6), 132 (unsigned)(hctx->x >> 6), hctx->x_calc,
173 hctx->ccid3hctx_x_calc, 133 (unsigned)(hctx->x_recv >> 6));
174 (unsigned)(hctx->ccid3hctx_x_recv >> 6));
175 134
176 ccid3_update_send_interval(hctx); 135 ccid3_update_send_interval(hctx);
177 } 136 }
178} 137}
179 138
180/* 139/*
181 * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) 140 * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1)
182 * @len: DCCP packet payload size in bytes 141 * @new_len: DCCP payload size in bytes (not used by all methods)
183 */ 142 */
184static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) 143static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
185{ 144{
186 const u16 old_s = hctx->ccid3hctx_s; 145#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG)
187 146 return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9);
188 hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); 147#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX)
189 148 return max(ccid3_hc_tx_sk(sk)->s, new_len);
190 if (hctx->ccid3hctx_s != old_s) 149#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */
191 ccid3_update_send_interval(hctx); 150 return dccp_sk(sk)->dccps_mss_cache;
151#endif
192} 152}
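The three CONFIG_IP_DCCP_CCID3_MEASURE_S_* strategies compared side by side, assuming tfrc_ewma() weighs in tenths as in the TFRC library's tfrc.h (weight 9 gives avg' = 0.9*avg + 0.1*new, with the first sample seeding the average):

#include <stdint.h>

/* Assumed to match the TFRC library's tfrc_ewma(): weights are tenths,
 * and the first sample initialises the average. */
static uint32_t tfrc_ewma(uint32_t avg, uint32_t newval, uint8_t weight)
{
        return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}

/* E.g. after payloads of 1000, 200, 1400 bytes:
 *   AVG: 1000 -> 920 -> 968    (smooth, lags behind bursts)
 *   MAX: 1000 -> 1000 -> 1400  (monotone, never shrinks)
 *   MPS: always the MSS cache, independent of traffic. */
static uint32_t measure_s_avg(uint32_t s, uint16_t new_len)
{
        return tfrc_ewma(s, new_len, 9);
}

static uint32_t measure_s_max(uint32_t s, uint16_t new_len)
{
        return s > new_len ? s : new_len;
}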
193 153
194/* 154/*
@@ -198,13 +158,13 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
198static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, 158static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
199 ktime_t now) 159 ktime_t now)
200{ 160{
201 u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), 161 u32 delta = ktime_us_delta(now, hctx->t_last_win_count),
202 quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; 162 quarter_rtts = (4 * delta) / hctx->rtt;
203 163
204 if (quarter_rtts > 0) { 164 if (quarter_rtts > 0) {
205 hctx->ccid3hctx_t_last_win_count = now; 165 hctx->t_last_win_count = now;
206 hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); 166 hctx->last_win_count += min(quarter_rtts, 5U);
207 hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ 167 hctx->last_win_count &= 0xF; /* mod 16 */
208 } 168 }
209} 169}
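A compact sketch of the window-counter arithmetic above (RFC 4342, 8.1): the counter advances one step per quarter RTT, at most 5 steps at a time, modulo 16. For example, with rtt = 100000 usec and delta = 180000 usec, 7 quarter-RTTs have elapsed and the counter advances by min(7, 5) = 5.

#include <stdint.h>

/* delta_us: time since t_last_win_count; rtt_us must be non-zero. */
static void update_win_count(uint8_t *win_count, uint32_t delta_us,
                             uint32_t rtt_us)
{
        uint32_t quarter_rtts = (4 * delta_us) / rtt_us;

        if (quarter_rtts > 0) {
                *win_count += quarter_rtts < 5 ? quarter_rtts : 5;
                *win_count &= 0xF;              /* mod 16 */
        }
}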
210 170
@@ -221,25 +181,26 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
221 goto restart_timer; 181 goto restart_timer;
222 } 182 }
223 183
224 ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, 184 ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk,
225 ccid3_tx_state_name(hctx->ccid3hctx_state)); 185 hctx->feedback ? "" : "out");
226 186
227 if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) 187 /* Ignore and do not restart after leaving the established state */
228 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 188 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
229 else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
230 goto out; 189 goto out;
231 190
191 /* Reset feedback state to "no feedback received" */
192 hctx->feedback = false;
193
232 /* 194 /*
233 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 195 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
196 * RTO is 0 if and only if no feedback has been received yet.
234 */ 197 */
235 if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ 198 if (hctx->t_rto == 0 || hctx->p == 0) {
236 hctx->ccid3hctx_p == 0) {
237 199
238 /* halve send rate directly */ 200 /* halve send rate directly */
239 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, 201 hctx->x /= 2;
240 (((__u64)hctx->ccid3hctx_s) << 6) /
241 TFRC_T_MBI);
242 ccid3_update_send_interval(hctx); 202 ccid3_update_send_interval(hctx);
203
243 } else { 204 } else {
244 /* 205 /*
245 * Modify the cached value of X_recv 206 * Modify the cached value of X_recv
@@ -251,44 +212,41 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
251 * 212 *
252 * Note that X_recv is scaled by 2^6 while X_calc is not 213 * Note that X_recv is scaled by 2^6 while X_calc is not
253 */ 214 */
254 BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); 215 BUG_ON(hctx->p && !hctx->x_calc);
255 216
256 if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) 217 if (hctx->x_calc > (hctx->x_recv >> 5))
257 hctx->ccid3hctx_x_recv = 218 hctx->x_recv /= 2;
258 max(hctx->ccid3hctx_x_recv / 2,
259 (((__u64)hctx->ccid3hctx_s) << 6) /
260 (2 * TFRC_T_MBI));
261 else { 219 else {
262 hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; 220 hctx->x_recv = hctx->x_calc;
263 hctx->ccid3hctx_x_recv <<= 4; 221 hctx->x_recv <<= 4;
264 } 222 }
265 ccid3_hc_tx_update_x(sk, NULL); 223 ccid3_hc_tx_update_x(sk, NULL);
266 } 224 }
267 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", 225 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
268 (unsigned long long)hctx->ccid3hctx_x); 226 (unsigned long long)hctx->x);
269 227
270 /* 228 /*
271 * Set new timeout for the nofeedback timer. 229 * Set new timeout for the nofeedback timer.
272 * See comments in packet_recv() regarding the value of t_RTO. 230 * See comments in packet_recv() regarding the value of t_RTO.
273 */ 231 */
274 if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ 232 if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */
275 t_nfb = TFRC_INITIAL_TIMEOUT; 233 t_nfb = TFRC_INITIAL_TIMEOUT;
276 else 234 else
277 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 235 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
278 236
279restart_timer: 237restart_timer:
280 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 238 sk_reset_timer(sk, &hctx->no_feedback_timer,
281 jiffies + usecs_to_jiffies(t_nfb)); 239 jiffies + usecs_to_jiffies(t_nfb));
282out: 240out:
283 bh_unlock_sock(sk); 241 bh_unlock_sock(sk);
284 sock_put(sk); 242 sock_put(sk);
285} 243}
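The else-branch above adjusts the cached X_recv before X itself is recomputed. Expressed on its own, with x_recv scaled by 2^6 and x_calc unscaled, x_recv >> 5 is twice the real receive rate and x_calc << 4 is a quarter of X_calc in scaled units:

#include <stdint.h>

/* Nofeedback-timer adjustment of the cached X_recv (rfc3448bis-00, 4.4):
 *      if (X_calc > 2 * X_recv)  X_recv = X_recv / 2;
 *      else                      X_recv = X_calc / 4;
 * Afterwards X is recomputed via ccid3_hc_tx_update_x(). */
static void nofeedback_reduce_x_recv(uint64_t *x_recv, uint32_t x_calc)
{
        if ((uint64_t)x_calc > (*x_recv >> 5))          /* X_calc > 2*X_recv */
                *x_recv /= 2;
        else
                *x_recv = (uint64_t)x_calc << 4;        /* X_calc/4, scaled */
}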
286 244
287/* 245/**
288 * returns 246 * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
289 * > 0: delay (in msecs) that should pass before actually sending 247 * @skb: next packet candidate to send on @sk
290 * = 0: can send immediately 248 * This function uses the convention of ccid_packet_dequeue_eval() and
291 * < 0: error condition; do not send packet 249 * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
292 */ 250 */
293static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 251static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
294{ 252{
@@ -305,18 +263,14 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
305 if (unlikely(skb->len == 0)) 263 if (unlikely(skb->len == 0))
306 return -EBADMSG; 264 return -EBADMSG;
307 265
308 switch (hctx->ccid3hctx_state) { 266 if (hctx->s == 0) {
309 case TFRC_SSTATE_NO_SENT: 267 sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies +
310 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
311 (jiffies +
312 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 268 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
313 hctx->ccid3hctx_last_win_count = 0; 269 hctx->last_win_count = 0;
314 hctx->ccid3hctx_t_last_win_count = now; 270 hctx->t_last_win_count = now;
315 271
316 /* Set t_0 for initial packet */ 272 /* Set t_0 for initial packet */
317 hctx->ccid3hctx_t_nom = now; 273 hctx->t_nom = now;
318
319 hctx->ccid3hctx_s = skb->len;
320 274
321 /* 275 /*
322 * Use initial RTT sample when available: recommended by erratum 276 * Use initial RTT sample when available: recommended by erratum
@@ -325,9 +279,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
325 */ 279 */
326 if (dp->dccps_syn_rtt) { 280 if (dp->dccps_syn_rtt) {
327 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); 281 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
328 hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; 282 hctx->rtt = dp->dccps_syn_rtt;
329 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 283 hctx->x = rfc3390_initial_rate(sk);
330 hctx->ccid3hctx_t_ld = now; 284 hctx->t_ld = now;
331 } else { 285 } else {
332 /* 286 /*
333 * Sender does not have RTT sample: 287 * Sender does not have RTT sample:
@@ -335,17 +289,20 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
335 * is needed in several parts (e.g. window counter); 289 * is needed in several parts (e.g. window counter);
336 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. 290 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
337 */ 291 */
338 hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; 292 hctx->rtt = DCCP_FALLBACK_RTT;
339 hctx->ccid3hctx_x = hctx->ccid3hctx_s; 293 hctx->x = dp->dccps_mss_cache;
340 hctx->ccid3hctx_x <<= 6; 294 hctx->x <<= 6;
341 } 295 }
296
297 /* Compute t_ipi = s / X */
298 hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
342 ccid3_update_send_interval(hctx); 299 ccid3_update_send_interval(hctx);
343 300
344 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); 301 /* Seed value for Oscillation Prevention (sec. 4.5) */
345 break; 302 hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt);
346 case TFRC_SSTATE_NO_FBACK: 303
347 case TFRC_SSTATE_FBACK: 304 } else {
348 delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); 305 delay = ktime_us_delta(hctx->t_nom, now);
349 ccid3_pr_debug("delay=%ld\n", (long)delay); 306 ccid3_pr_debug("delay=%ld\n", (long)delay);
350 /* 307 /*
351 * Scheduling of packet transmissions [RFC 3448, 4.6] 308 * Scheduling of packet transmissions [RFC 3448, 4.6]
@@ -355,99 +312,80 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
355 * else 312 * else
356 * // send the packet in (t_nom - t_now) milliseconds. 313 * // send the packet in (t_nom - t_now) milliseconds.
357 */ 314 */
358 if (delay - (s64)hctx->ccid3hctx_delta >= 1000) 315 if (delay >= TFRC_T_DELTA)
359 return (u32)delay / 1000L; 316 return (u32)delay / USEC_PER_MSEC;
360 317
361 ccid3_hc_tx_update_win_count(hctx, now); 318 ccid3_hc_tx_update_win_count(hctx, now);
362 break;
363 case TFRC_SSTATE_TERM:
364 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
365 return -EINVAL;
366 } 319 }
367 320
368 /* prepare to send now (add options etc.) */ 321 /* prepare to send now (add options etc.) */
369 dp->dccps_hc_tx_insert_options = 1; 322 dp->dccps_hc_tx_insert_options = 1;
370 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; 323 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count;
371 324
372 /* set the nominal send time for the next following packet */ 325 /* set the nominal send time for the next following packet */
373 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, 326 hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi);
374 hctx->ccid3hctx_t_ipi); 327 return CCID_PACKET_SEND_AT_ONCE;
375 return 0;
376} 328}
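The core of the dequeueing convention above, reduced to its scheduling decision (RFC 3448, 4.6): send immediately when t_nom - t_now has shrunk below t_delta, otherwise report the remaining wait in milliseconds. SEND_AT_ONCE stands in for CCID_PACKET_SEND_AT_ONCE, whose actual value is an assumption here; times are microseconds.

#include <stdint.h>

#define TFRC_T_DELTA    1000    /* assumed: 1 ms, the HZ >= 500 case */
#define SEND_AT_ONCE    0       /* stand-in for CCID_PACKET_SEND_AT_ONCE */

/* delay may be negative when we are already late. */
static int tx_send_decision(int64_t *t_nom_us, int64_t t_now_us,
                            uint32_t t_ipi_us)
{
        int64_t delay = *t_nom_us - t_now_us;

        if (delay >= TFRC_T_DELTA)
                return (int)(delay / 1000);     /* wait this many msec */

        *t_nom_us += t_ipi_us;                  /* schedule next packet */
        return SEND_AT_ONCE;
}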
377 329
378static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, 330static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
379 unsigned int len)
380{ 331{
381 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 332 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
382 333
383 ccid3_hc_tx_update_s(hctx, len); 334 /* Changes to s will become effective the next time X is computed */
335 hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
384 336
385 if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) 337 if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss))
386 DCCP_CRIT("packet history - out of memory!"); 338 DCCP_CRIT("packet history - out of memory!");
387} 339}
388 340
389static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 341static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
390{ 342{
391 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 343 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
392 struct ccid3_options_received *opt_recv; 344 struct tfrc_tx_hist_entry *acked;
393 ktime_t now; 345 ktime_t now;
394 unsigned long t_nfb; 346 unsigned long t_nfb;
395 u32 pinv, r_sample; 347 u32 r_sample;
396 348
397 /* we are only interested in ACKs */ 349 /* we are only interested in ACKs */
398 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 350 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
399 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 351 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
400 return; 352 return;
401 /* ... and only in the established state */ 353 /*
402 if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && 354 * Locate the acknowledged packet in the TX history.
403 hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) 355 *
404 return; 356 * Returning "entry not found" here can for instance happen when
405 357 * - the host has not sent out anything (e.g. a passive server),
406 opt_recv = &hctx->ccid3hctx_options_received; 358 * - the Ack is outdated (packet with higher Ack number was received),
407 now = ktime_get_real(); 359 * - it is a bogus Ack (for a packet not sent on this connection).
408 360 */
409 /* Estimate RTT from history if ACK number is valid */ 361 acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb));
410 r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, 362 if (acked == NULL)
411 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
412 if (r_sample == 0) {
413 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
414 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
415 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
416 return; 363 return;
417 } 364 /* For the sake of RTT sampling, ignore/remove all older entries */
365 tfrc_tx_hist_purge(&acked->next);
418 366
419 /* Update receive rate in units of 64 * bytes/second */ 367 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
420 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; 368 now = ktime_get_real();
421 hctx->ccid3hctx_x_recv <<= 6; 369 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
370 hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
422 371
423 /* Update loss event rate (which is scaled by 1e6) */
424 pinv = opt_recv->ccid3or_loss_event_rate;
425 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
426 hctx->ccid3hctx_p = 0;
427 else /* can not exceed 100% */
428 hctx->ccid3hctx_p = scaled_div(1, pinv);
429 /*
430 * Validate new RTT sample and update moving average
431 */
432 r_sample = dccp_sample_rtt(sk, r_sample);
433 hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
434 /* 372 /*
435 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 373 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
436 */ 374 */
437 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { 375 if (!hctx->feedback) {
438 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); 376 hctx->feedback = true;
439 377
440 if (hctx->ccid3hctx_t_rto == 0) { 378 if (hctx->t_rto == 0) {
441 /* 379 /*
442 * Initial feedback packet: Larger Initial Windows (4.2) 380 * Initial feedback packet: Larger Initial Windows (4.2)
443 */ 381 */
444 hctx->ccid3hctx_x = rfc3390_initial_rate(sk); 382 hctx->x = rfc3390_initial_rate(sk);
445 hctx->ccid3hctx_t_ld = now; 383 hctx->t_ld = now;
446 384
447 ccid3_update_send_interval(hctx); 385 ccid3_update_send_interval(hctx);
448 386
449 goto done_computing_x; 387 goto done_computing_x;
450 } else if (hctx->ccid3hctx_p == 0) { 388 } else if (hctx->p == 0) {
451 /* 389 /*
452 * First feedback after nofeedback timer expiry (4.3) 390 * First feedback after nofeedback timer expiry (4.3)
453 */ 391 */
@@ -456,25 +394,52 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
456 } 394 }
457 395
458 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 396 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
459 if (hctx->ccid3hctx_p > 0) 397 if (hctx->p > 0)
460 hctx->ccid3hctx_x_calc = 398 hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p);
461 tfrc_calc_x(hctx->ccid3hctx_s,
462 hctx->ccid3hctx_rtt,
463 hctx->ccid3hctx_p);
464 ccid3_hc_tx_update_x(sk, &now); 399 ccid3_hc_tx_update_x(sk, &now);
465 400
466done_computing_x: 401done_computing_x:
467 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 402 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
468 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 403 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
469 dccp_role(sk), 404 dccp_role(sk), sk, hctx->rtt, r_sample,
470 sk, hctx->ccid3hctx_rtt, r_sample, 405 hctx->s, hctx->p, hctx->x_calc,
471 hctx->ccid3hctx_s, hctx->ccid3hctx_p, 406 (unsigned)(hctx->x_recv >> 6),
472 hctx->ccid3hctx_x_calc, 407 (unsigned)(hctx->x >> 6));
473 (unsigned)(hctx->ccid3hctx_x_recv >> 6), 408 /*
474 (unsigned)(hctx->ccid3hctx_x >> 6)); 409 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to
410 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
411 * can be useful if few connections share a link, avoiding that buffer
412 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
413 * A useful presentation with background information is in
414 * Joerg Widmer, "Equation-Based Congestion Control",
415 * MSc Thesis, University of Mannheim, Germany, 2000
416 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
417 */
418 if (do_osc_prev) {
419 r_sample = tfrc_scaled_sqrt(r_sample);
420 /*
421 * The modulation can work in both ways: increase/decrease t_ipi
422 * according to long-term increases/decreases of the RTT. The
423 * former is a useful measure, since it works against queue
424 * build-up. The latter temporarily increases the sending rate,
425 * so that buffers fill up more quickly. This in turn causes
426 * the RTT to increase, so that either later reduction becomes
427 * necessary or the RTT stays at a very high level. Decreasing
428 * t_ipi is therefore not supported.
429 * Furthermore, during the initial slow-start phase the RTT
430 * naturally increases, where using the algorithm would cause
431 * delays. Hence it is disabled during the initial slow-start.
432 */
433 if (r_sample > hctx->r_sqmean && hctx->p > 0)
434 hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
435 hctx->r_sqmean);
436 hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
437 /* update R_sqmean _after_ computing the modulation factor */
438 hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
439 }
475 440
476 /* unschedule no feedback timer */ 441 /* unschedule no feedback timer */
477 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 442 sk_stop_timer(sk, &hctx->no_feedback_timer);
478 443
479 /* 444 /*
480 * As we have calculated new ipi, delta, t_nom it is possible 445 * As we have calculated new ipi, delta, t_nom it is possible
@@ -488,95 +453,66 @@ done_computing_x:
488 * This can help avoid triggering the nofeedback timer too 453 * This can help avoid triggering the nofeedback timer too
489 * often ('spinning') on LANs with small RTTs. 454 * often ('spinning') on LANs with small RTTs.
490 */ 455 */
491 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, 456 hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO *
492 (CONFIG_IP_DCCP_CCID3_RTO * 457 (USEC_PER_SEC / 1000)));
493 (USEC_PER_SEC / 1000)));
494 /* 458 /*
495 * Schedule no feedback timer to expire in 459 * Schedule no feedback timer to expire in
496 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 460 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
497 */ 461 */
498 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); 462 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi);
499 463
500 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 464 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
501 "expire in %lu jiffies (%luus)\n", 465 "expire in %lu jiffies (%luus)\n",
502 dccp_role(sk), 466 dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
503 sk, usecs_to_jiffies(t_nfb), t_nfb);
504 467
505 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 468 sk_reset_timer(sk, &hctx->no_feedback_timer,
506 jiffies + usecs_to_jiffies(t_nfb)); 469 jiffies + usecs_to_jiffies(t_nfb));
507} 470}
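A self-contained sketch of the Inter-packet Space Modulation block above. The isqrt() stand-in replaces tfrc_scaled_sqrt(), whose absolute scale does not matter here since only the ratio sqrt(R_sample)/R_sqmean is used; the trailing update assumes the same tenths-weighted EWMA as tfrc_ewma(). For example, r_sqmean = 316 (an RTT near 100 ms) and a sample of 144400 usec (isqrt = 380) stretch t_ipi by a factor of about 1.2.

#include <stdint.h>

/* Plain integer square root (stand-in for tfrc_scaled_sqrt()). */
static uint32_t isqrt(uint32_t x)
{
        uint32_t r = 0, bit = 1UL << 30;

        while (bit > x)
                bit >>= 2;
        while (bit) {
                if (x >= r + bit) {
                        x -= r + bit;
                        r = (r >> 1) + bit;
                } else {
                        r >>= 1;
                }
                bit >>= 2;
        }
        return r;
}

/* Stretch t_ipi by sqrt(R_sample)/R_sqmean when the RTT has grown, only
 * after slow-start (p > 0), then fold the sample into the long-term mean. */
static void osc_prevention(uint32_t *t_ipi, uint16_t *r_sqmean,
                           uint32_t r_sample_us, uint32_t p)
{
        uint32_t rs = isqrt(r_sample_us);

        if (*r_sqmean == 0) {                   /* not yet seeded */
                *r_sqmean = (uint16_t)rs;
                return;
        }
        if (rs > *r_sqmean && p > 0)
                *t_ipi = (uint32_t)(((uint64_t)*t_ipi * rs) / *r_sqmean);
        if (*t_ipi > 64000000U)                 /* cap at t_mbi = 64 s */
                *t_ipi = 64000000U;
        /* update R_sqmean _after_ computing the modulation factor */
        *r_sqmean = (uint16_t)((9 * (uint32_t)*r_sqmean + rs) / 10);
}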
508 471
509static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, 472static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
510 unsigned char len, u16 idx, 473 u8 option, u8 *optval, u8 optlen)
511 unsigned char *value)
512{ 474{
513 int rc = 0;
514 const struct dccp_sock *dp = dccp_sk(sk);
515 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 475 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
516 struct ccid3_options_received *opt_recv;
517 __be32 opt_val; 476 __be32 opt_val;
518 477
519 opt_recv = &hctx->ccid3hctx_options_received;
520
521 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
522 opt_recv->ccid3or_seqno = dp->dccps_gsr;
523 opt_recv->ccid3or_loss_event_rate = ~0;
524 opt_recv->ccid3or_loss_intervals_idx = 0;
525 opt_recv->ccid3or_loss_intervals_len = 0;
526 opt_recv->ccid3or_receive_rate = 0;
527 }
528
529 switch (option) { 478 switch (option) {
479 case TFRC_OPT_RECEIVE_RATE:
530 case TFRC_OPT_LOSS_EVENT_RATE: 480 case TFRC_OPT_LOSS_EVENT_RATE:
531 if (unlikely(len != 4)) { 481 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
532 DCCP_WARN("%s(%p), invalid len %d " 482 if (packet_type == DCCP_PKT_DATA)
533 "for TFRC_OPT_LOSS_EVENT_RATE\n", 483 break;
534 dccp_role(sk), sk, len); 484 if (unlikely(optlen != 4)) {
535 rc = -EINVAL; 485 DCCP_WARN("%s(%p), invalid len %d for %u\n",
536 } else { 486 dccp_role(sk), sk, optlen, option);
537 opt_val = get_unaligned((__be32 *)value); 487 return -EINVAL;
538 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
539 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
540 dccp_role(sk), sk,
541 opt_recv->ccid3or_loss_event_rate);
542 } 488 }
543 break; 489 opt_val = ntohl(get_unaligned((__be32 *)optval));
544 case TFRC_OPT_LOSS_INTERVALS: 490
545 opt_recv->ccid3or_loss_intervals_idx = idx; 491 if (option == TFRC_OPT_RECEIVE_RATE) {
546 opt_recv->ccid3or_loss_intervals_len = len; 492 /* Receive Rate is kept in units of 64 bytes/second */
547 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", 493 hctx->x_recv = opt_val;
548 dccp_role(sk), sk, 494 hctx->x_recv <<= 6;
549 opt_recv->ccid3or_loss_intervals_idx, 495
550 opt_recv->ccid3or_loss_intervals_len);
551 break;
552 case TFRC_OPT_RECEIVE_RATE:
553 if (unlikely(len != 4)) {
554 DCCP_WARN("%s(%p), invalid len %d "
555 "for TFRC_OPT_RECEIVE_RATE\n",
556 dccp_role(sk), sk, len);
557 rc = -EINVAL;
558 } else {
559 opt_val = get_unaligned((__be32 *)value);
560 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
561 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 496 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
562 dccp_role(sk), sk, 497 dccp_role(sk), sk, opt_val);
563 opt_recv->ccid3or_receive_rate); 498 } else {
499 /* Update the fixpoint Loss Event Rate fraction */
500 hctx->p = tfrc_invert_loss_event_rate(opt_val);
501
502 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
503 dccp_role(sk), sk, opt_val);
564 } 504 }
565 break;
566 } 505 }
567 506 return 0;
568 return rc;
569} 507}
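The shape of the simplified option parser above, in userspace form: both TFRC options carry a 32-bit big-endian value, must be ignored on Data packets (RFC 4342, 8.3/8.5), and are rejected when not exactly 4 bytes long. memcpy() stands in for the kernel's get_unaligned().

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>  /* ntohl() */

#define TFRC_OPT_LOSS_EVENT_RATE        192
#define TFRC_OPT_RECEIVE_RATE           194

/* Returns 0, or -22 (-EINVAL) on a malformed option. */
static int parse_tfrc_option(int is_data_packet, uint8_t option,
                             const uint8_t *optval, uint8_t optlen,
                             uint32_t *out)
{
        uint32_t be;

        if (option != TFRC_OPT_LOSS_EVENT_RATE &&
            option != TFRC_OPT_RECEIVE_RATE)
                return 0;                       /* not a TFRC option */
        if (is_data_packet)
                return 0;                       /* must be ignored */
        if (optlen != 4)
                return -22;                     /* -EINVAL */

        memcpy(&be, optval, sizeof(be));        /* may be unaligned */
        *out = ntohl(be);
        return 0;
}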
570 508
571static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 509static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
572{ 510{
573 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 511 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
574 512
575 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; 513 hctx->hist = NULL;
576 hctx->ccid3hctx_hist = NULL; 514 setup_timer(&hctx->no_feedback_timer,
577 setup_timer(&hctx->ccid3hctx_no_feedback_timer, 515 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
578 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
579
580 return 0; 516 return 0;
581} 517}
582 518
@@ -584,42 +520,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
584{ 520{
585 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 521 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
586 522
587 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); 523 sk_stop_timer(sk, &hctx->no_feedback_timer);
588 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); 524 tfrc_tx_hist_purge(&hctx->hist);
589
590 tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
591} 525}
592 526
593static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 527static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
594{ 528{
595 struct ccid3_hc_tx_sock *hctx; 529 info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto;
596 530 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt;
597 /* Listen socks doesn't have a private CCID block */
598 if (sk->sk_state == DCCP_LISTEN)
599 return;
600
601 hctx = ccid3_hc_tx_sk(sk);
602 info->tcpi_rto = hctx->ccid3hctx_t_rto;
603 info->tcpi_rtt = hctx->ccid3hctx_rtt;
604} 531}
605 532
606static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 533static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
607 u32 __user *optval, int __user *optlen) 534 u32 __user *optval, int __user *optlen)
608{ 535{
609 const struct ccid3_hc_tx_sock *hctx; 536 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
537 struct tfrc_tx_info tfrc;
610 const void *val; 538 const void *val;
611 539
612 /* Listen socks doesn't have a private CCID block */
613 if (sk->sk_state == DCCP_LISTEN)
614 return -EINVAL;
615
616 hctx = ccid3_hc_tx_sk(sk);
617 switch (optname) { 540 switch (optname) {
618 case DCCP_SOCKOPT_CCID_TX_INFO: 541 case DCCP_SOCKOPT_CCID_TX_INFO:
619 if (len < sizeof(hctx->ccid3hctx_tfrc)) 542 if (len < sizeof(tfrc))
620 return -EINVAL; 543 return -EINVAL;
621 len = sizeof(hctx->ccid3hctx_tfrc); 544 tfrc.tfrctx_x = hctx->x;
622 val = &hctx->ccid3hctx_tfrc; 545 tfrc.tfrctx_x_recv = hctx->x_recv;
546 tfrc.tfrctx_x_calc = hctx->x_calc;
547 tfrc.tfrctx_rtt = hctx->rtt;
548 tfrc.tfrctx_p = hctx->p;
549 tfrc.tfrctx_rto = hctx->t_rto;
550 tfrc.tfrctx_ipi = hctx->t_ipi;
551 len = sizeof(tfrc);
552 val = &tfrc;
623 break; 553 break;
624 default: 554 default:
625 return -ENOPROTOOPT; 555 return -ENOPROTOOPT;
@@ -634,112 +564,82 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
634/* 564/*
635 * Receiver Half-Connection Routines 565 * Receiver Half-Connection Routines
636 */ 566 */
637
638/* CCID3 feedback types */
639enum ccid3_fback_type {
640 CCID3_FBACK_NONE = 0,
641 CCID3_FBACK_INITIAL,
642 CCID3_FBACK_PERIODIC,
643 CCID3_FBACK_PARAM_CHANGE
644};
645
646#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
647static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
648{
649 static char *ccid3_rx_state_names[] = {
650 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
651 [TFRC_RSTATE_DATA] = "DATA",
652 [TFRC_RSTATE_TERM] = "TERM",
653 };
654
655 return ccid3_rx_state_names[state];
656}
657#endif
658
659static void ccid3_hc_rx_set_state(struct sock *sk,
660 enum ccid3_hc_rx_states state)
661{
662 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
663 enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
664
665 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
666 dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
667 ccid3_rx_state_name(state));
668 WARN_ON(state == oldstate);
669 hcrx->ccid3hcrx_state = state;
670}
671
672static void ccid3_hc_rx_send_feedback(struct sock *sk, 567static void ccid3_hc_rx_send_feedback(struct sock *sk,
673 const struct sk_buff *skb, 568 const struct sk_buff *skb,
674 enum ccid3_fback_type fbtype) 569 enum ccid3_fback_type fbtype)
675{ 570{
676 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 571 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677 struct dccp_sock *dp = dccp_sk(sk);
678 ktime_t now;
679 s64 delta = 0;
680
681 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
682 return;
683
684 now = ktime_get_real();
685 572
686 switch (fbtype) { 573 switch (fbtype) {
687 case CCID3_FBACK_INITIAL: 574 case CCID3_FBACK_INITIAL:
688 hcrx->ccid3hcrx_x_recv = 0; 575 hcrx->x_recv = 0;
689 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ 576 hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */
690 break; 577 break;
691 case CCID3_FBACK_PARAM_CHANGE: 578 case CCID3_FBACK_PARAM_CHANGE:
579 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
580 /*
581 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
582 * FIXME: in rfc3448bis the receiver returns X_recv=0
583 * here as it normally would in the first feedback packet.
584 * However this is not possible yet, since the code still
585 * uses RFC 3448, i.e.
586 * If (p > 0)
587 * Calculate X_calc using the TCP throughput equation.
588 * X = max(min(X_calc, 2*X_recv), s/t_mbi);
589 * would bring X down to s/t_mbi. That is why we return
590 * X_recv according to rfc3448bis-06 for the moment.
591 */
592 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
593 rtt = tfrc_rx_hist_rtt(&hcrx->hist);
594
595 hcrx->x_recv = scaled_div32(s, 2 * rtt);
596 break;
597 }
692 /* 598 /*
693 * When parameters change (new loss or p > p_prev), we do not 599 * When parameters change (new loss or p > p_prev), we do not
694 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 600 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
695 * need to reuse the previous value of X_recv. However, when 601 * always check whether at least RTT time units were covered.
696 * X_recv was 0 (due to early loss), this would kill X down to
697 * s/t_mbi (i.e. one packet in 64 seconds).
698 * To avoid such drastic reduction, we approximate X_recv as
699 * the number of bytes since last feedback.
700 * This is a safe fallback, since X is bounded above by X_calc.
701 */ 602 */
702 if (hcrx->ccid3hcrx_x_recv > 0) 603 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
703 break; 604 break;
704 /* fall through */
705 case CCID3_FBACK_PERIODIC: 605 case CCID3_FBACK_PERIODIC:
706 delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); 606 /*
707 if (delta <= 0) 607 * Step (2) of rfc3448bis-06, 6.2:
708 DCCP_BUG("delta (%ld) <= 0", (long)delta); 608 * - if no data packets have been received, just restart timer
709 else 609 * - if data packets have been received, re-compute X_recv
710 hcrx->ccid3hcrx_x_recv = 610 */
711 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 611 if (hcrx->hist.bytes_recvd == 0)
612 goto prepare_for_next_time;
613 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
712 break; 614 break;
713 default: 615 default:
714 return; 616 return;
715 } 617 }
716 618
717 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, 619 ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse);
718 hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
719
720 hcrx->ccid3hcrx_tstamp_last_feedback = now;
721 hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
722 hcrx->ccid3hcrx_bytes_recv = 0;
723 620
724 dp->dccps_hc_rx_insert_options = 1; 621 dccp_sk(sk)->dccps_hc_rx_insert_options = 1;
725 dccp_send_ack(sk); 622 dccp_send_ack(sk);
623
624prepare_for_next_time:
625 tfrc_rx_hist_restart_byte_counter(&hcrx->hist);
626 hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
627 hcrx->feedback = fbtype;
726} 628}
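The feedback paths above lean on tfrc_rx_hist_x_recv(); a plausible reading of that helper, consistent with step (2) of rfc3448bis-06, 6.2, is sketched below as an assumption (the real implementation lives in packet_history.c): X_recv is the byte count since the last feedback divided by the elapsed time, measured over at least one RTT, falling back to the previous estimate when the interval is still too short.

#include <stdint.h>

/* Times in usec, result in bytes/second. Assumed semantics only. */
static uint32_t rx_x_recv(uint32_t bytes_recvd, uint32_t interval_us,
                          uint32_t rtt_us, uint32_t last_x_recv)
{
        if (interval_us < rtt_us)       /* less than one RTT covered */
                return last_x_recv;
        return (uint32_t)(((uint64_t)bytes_recvd * 1000000) / interval_us);
}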
727 629
728static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 630static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
729{ 631{
730 const struct ccid3_hc_rx_sock *hcrx; 632 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
731 __be32 x_recv, pinv; 633 __be32 x_recv, pinv;
732 634
733 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 635 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
734 return 0; 636 return 0;
735 637
736 hcrx = ccid3_hc_rx_sk(sk);
737
738 if (dccp_packet_without_ack(skb)) 638 if (dccp_packet_without_ack(skb))
739 return 0; 639 return 0;
740 640
741 x_recv = htonl(hcrx->ccid3hcrx_x_recv); 641 x_recv = htonl(hcrx->x_recv);
742 pinv = htonl(hcrx->ccid3hcrx_pinv); 642 pinv = htonl(hcrx->p_inverse);
743 643
744 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 644 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
745 &pinv, sizeof(pinv)) || 645 &pinv, sizeof(pinv)) ||
@@ -762,171 +662,95 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
762static u32 ccid3_first_li(struct sock *sk) 662static u32 ccid3_first_li(struct sock *sk)
763{ 663{
764 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 664 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
765 u32 x_recv, p, delta; 665 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
666 rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
766 u64 fval; 667 u64 fval;
767 668
768 if (hcrx->ccid3hcrx_rtt == 0) { 669 /*
769 DCCP_WARN("No RTT estimate available, using fallback RTT\n"); 670 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p
770 hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; 671 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p
771 } 672 * is about 20.64%. This yields an interval length of 4.84 (rounded up).
673 */
674 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
675 return 5;
772 676
773 delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); 677 x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
774 x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); 678 if (x_recv == 0)
775 if (x_recv == 0) { /* would also trigger divide-by-zero */ 679 goto failed;
776 DCCP_WARN("X_recv==0\n");
777 if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
778 DCCP_BUG("stored value of X_recv is zero");
779 return ~0U;
780 }
781 }
782 680
783 fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); 681 fval = scaled_div32(scaled_div(s, rtt), x_recv);
784 fval = scaled_div32(fval, x_recv);
785 p = tfrc_calc_x_reverse_lookup(fval); 682 p = tfrc_calc_x_reverse_lookup(fval);
786 683
787 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 684 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
788 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 685 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
789 686
790 return p == 0 ? ~0U : scaled_div(1, p); 687 if (p > 0)
688 return scaled_div(1, p);
689failed:
690 return UINT_MAX;
791} 691}
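A worked sketch of the arithmetic in ccid3_first_li(), substituting the simplified TFRC throughput equation X = s / (R * sqrt(2p/3)) for the kernel's table-driven tfrc_calc_x_reverse_lookup(); the function names and floating-point shortcut are illustrative assumptions.

#include <stdint.h>
#include <math.h>

/* Solve f(p) = s/(R * X_recv) for p, then the first loss interval is
 * 1/p packets (rounded up). */
static uint32_t first_loss_interval(double s_bytes, double rtt_sec,
                                    double x_recv_Bps)
{
        double fval = s_bytes / (rtt_sec * x_recv_Bps); /* = sqrt(2p/3) */
        double p = 1.5 * fval * fval;                   /* invert       */

        if (p <= 0.0 || p > 1.0)
                return ~0U;             /* no usable estimate */
        return (uint32_t)ceil(1.0 / p);
}
/* With X_recv = s/(2R), the "first packets lost" case: fval = 2, which
 * is outside the small-p regime of the simplified formula; the full TFRC
 * equation yields p ~ 0.2064, i.e. an interval of 5 -- which is why the
 * function above simply returns 5 before any feedback has been sent. */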
792 692
793static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 693static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
794{ 694{
795 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 695 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796 enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
797 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 696 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
798 const bool is_data_packet = dccp_data_packet(skb); 697 const bool is_data_packet = dccp_data_packet(skb);
799 698
800 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
801 if (is_data_packet) {
802 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
803 do_feedback = CCID3_FBACK_INITIAL;
804 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
805 hcrx->ccid3hcrx_s = payload;
806 /*
807 * Not necessary to update ccid3hcrx_bytes_recv here,
808 * since X_recv = 0 for the first feedback packet (cf.
809 * RFC 3448, 6.3) -- gerrit
810 */
811 }
812 goto update_records;
813 }
814
815 if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
816 return; /* done receiving */
817
818 if (is_data_packet) {
819 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
820 /*
821 * Update moving-average of s and the sum of received payload bytes
822 */
823 hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
824 hcrx->ccid3hcrx_bytes_recv += payload;
825 }
826
827 /* 699 /*
828 * Perform loss detection and handle pending losses 700 * Perform loss detection and handle pending losses
829 */ 701 */
830 if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, 702 if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist,
831 skb, ndp, ccid3_first_li, sk)) { 703 skb, ndp, ccid3_first_li, sk))
832 do_feedback = CCID3_FBACK_PARAM_CHANGE; 704 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE);
833 goto done_receiving;
834 }
835
836 if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
837 return; /* done receiving */
838
839 /* 705 /*
840 * Handle data packets: RTT sampling and monitoring p 706 * Feedback for first non-empty data packet (RFC 3448, 6.3)
841 */ 707 */
842 if (unlikely(!is_data_packet)) 708 else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet))
843 goto update_records; 709 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL);
844
845 if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
846 const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
847 /*
848 * Empty loss history: no loss so far, hence p stays 0.
849 * Sample RTT values, since an RTT estimate is required for the
850 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
851 */
852 if (sample != 0)
853 hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
854
855 } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
856 /*
857 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
858 * has decreased (resp. p has increased), send feedback now.
859 */
860 do_feedback = CCID3_FBACK_PARAM_CHANGE;
861 }
862
863 /* 710 /*
864 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 711 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
865 */ 712 */
866 if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) 713 else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet &&
867 do_feedback = CCID3_FBACK_PERIODIC; 714 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3)
868 715 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC);
869update_records:
870 tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
871
872done_receiving:
873 if (do_feedback)
874 ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
875} 716}
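The rewritten receive path boils down to a three-way precedence, shown below on its own: a congestion event always triggers feedback, the first data packet triggers the initial report, and otherwise feedback goes out once per RTT, i.e. when the CCVal window counter has advanced by more than 3 (RFC 4342, 10.3). sub16() is modulo-16 subtraction of the 4-bit counter.

#include <stdint.h>

enum fback { FBACK_NONE, FBACK_INITIAL, FBACK_PERIODIC, FBACK_PARAM_CHANGE };

static inline uint8_t sub16(uint8_t a, uint8_t b)
{
        return (a + 16 - b) & 0xF;
}

static enum fback rx_feedback_decision(int congestion_event,
                                       int is_data_packet, int loss_pending,
                                       enum fback last, uint8_t ccval,
                                       uint8_t last_counter)
{
        if (congestion_event)
                return FBACK_PARAM_CHANGE;
        if (last == FBACK_NONE && is_data_packet)
                return FBACK_INITIAL;
        if (!loss_pending && is_data_packet &&
            sub16(ccval, last_counter) > 3)
                return FBACK_PERIODIC;
        return FBACK_NONE;
}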
876 717
877static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 718static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
878{ 719{
879 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 720 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
880 721
881 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; 722 tfrc_lh_init(&hcrx->li_hist);
882 tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); 723 return tfrc_rx_hist_init(&hcrx->hist, sk);
883 return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
884} 724}
885 725
886static void ccid3_hc_rx_exit(struct sock *sk) 726static void ccid3_hc_rx_exit(struct sock *sk)
887{ 727{
888 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 728 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
889 729
890 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); 730 tfrc_rx_hist_purge(&hcrx->hist);
891 731 tfrc_lh_cleanup(&hcrx->li_hist);
892 tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
893 tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
894} 732}
895 733
896static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 734static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
897{ 735{
898 const struct ccid3_hc_rx_sock *hcrx;
899
900 /* Listen socks doesn't have a private CCID block */
901 if (sk->sk_state == DCCP_LISTEN)
902 return;
903
904 hcrx = ccid3_hc_rx_sk(sk);
905 info->tcpi_ca_state = hcrx->ccid3hcrx_state;
906 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 736 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
907 info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; 737 info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist);
908} 738}
909 739
910static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 740static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
911 u32 __user *optval, int __user *optlen) 741 u32 __user *optval, int __user *optlen)
912{ 742{
913 const struct ccid3_hc_rx_sock *hcrx; 743 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
914 struct tfrc_rx_info rx_info; 744 struct tfrc_rx_info rx_info;
915 const void *val; 745 const void *val;
916 746
917 /* Listen socks doesn't have a private CCID block */
918 if (sk->sk_state == DCCP_LISTEN)
919 return -EINVAL;
920
921 hcrx = ccid3_hc_rx_sk(sk);
922 switch (optname) { 747 switch (optname) {
923 case DCCP_SOCKOPT_CCID_RX_INFO: 748 case DCCP_SOCKOPT_CCID_RX_INFO:
924 if (len < sizeof(rx_info)) 749 if (len < sizeof(rx_info))
925 return -EINVAL; 750 return -EINVAL;
926 rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; 751 rx_info.tfrcrx_x_recv = hcrx->x_recv;
927 rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; 752 rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist);
928 rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : 753 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse);
929 scaled_div(1, hcrx->ccid3hcrx_pinv);
930 len = sizeof(rx_info); 754 len = sizeof(rx_info);
931 val = &rx_info; 755 val = &rx_info;
932 break; 756 break;
@@ -962,6 +786,9 @@ static struct ccid_operations ccid3 = {
962 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 786 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
963}; 787};
964 788
789module_param(do_osc_prev, bool, 0644);
790MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
791
965#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 792#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
966module_param(ccid3_debug, bool, 0644); 793module_param(ccid3_debug, bool, 0644);
967MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); 794MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
@@ -969,6 +796,19 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
969 796
970static __init int ccid3_module_init(void) 797static __init int ccid3_module_init(void)
971{ 798{
799 struct timespec tp;
800
801 /*
802 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
803 * correctly and feedback is sent either too early or too late.
804 */
805 hrtimer_get_res(CLOCK_MONOTONIC, &tp);
806 if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
807 printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
808 " resolution - check your clocksource.\n", __func__,
809 tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
810 return -ESOCKTNOSUPPORT;
811 }
972 return ccid_register(&ccid3); 812 return ccid_register(&ccid3);
973} 813}
974module_init(ccid3_module_init); 814module_init(ccid3_module_init);
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 49ca32bd7e79..af6e1bf937d9 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,11 +47,22 @@
47/* Two seconds as per RFC 3448 4.2 */ 47/* Two seconds as per RFC 3448 4.2 */
48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
49 49
50/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ 50/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */
51#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) 51#define TFRC_T_MBI (64 * USEC_PER_SEC)
52 52
53/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 53/*
54#define TFRC_T_MBI 64 54 * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are
55 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
56 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
57 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
58 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
59 */
60#if (HZ >= 500)
61# define TFRC_T_DELTA USEC_PER_MSEC
62#else
63# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
64#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
65#endif
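For concreteness, the effective t_delta for a few common CONFIG_HZ values, per the conditional above (a quick userspace check):

#include <stdio.h>

/* Constant 1 ms when HZ >= 500, half a timer tick otherwise. */
static unsigned int tfrc_t_delta(unsigned int hz)
{
        return hz >= 500 ? 1000 : 1000000 / (2 * hz);
}

int main(void)
{
        unsigned int hz[] = { 100, 250, 300, 500, 1000 };

        for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
                printf("HZ=%4u -> t_delta = %u usec\n",
                       hz[i], tfrc_t_delta(hz[i]));
        return 0;       /* HZ=100 -> 5000, HZ=250 -> 2000, HZ>=500 -> 1000 */
}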
55 66
56enum ccid3_options { 67enum ccid3_options {
57 TFRC_OPT_LOSS_EVENT_RATE = 192, 68 TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -59,62 +70,43 @@ enum ccid3_options {
59 TFRC_OPT_RECEIVE_RATE = 194, 70 TFRC_OPT_RECEIVE_RATE = 194,
60}; 71};
61 72
62struct ccid3_options_received {
63 u64 ccid3or_seqno:48,
64 ccid3or_loss_intervals_idx:16;
65 u16 ccid3or_loss_intervals_len;
66 u32 ccid3or_loss_event_rate;
67 u32 ccid3or_receive_rate;
68};
69
70/* TFRC sender states */
71enum ccid3_hc_tx_states {
72 TFRC_SSTATE_NO_SENT = 1,
73 TFRC_SSTATE_NO_FBACK,
74 TFRC_SSTATE_FBACK,
75 TFRC_SSTATE_TERM,
76};
77
78/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 73/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
79 * 74 *
80 * @ccid3hctx_x - Current sending rate in 64 * bytes per second 75 * @x - Current sending rate in 64 * bytes per second
81 * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second 76 * @x_recv - Receive rate in 64 * bytes per second
82 * @ccid3hctx_x_calc - Calculated rate in bytes per second 77 * @x_calc - Calculated rate in bytes per second
83 * @ccid3hctx_rtt - Estimate of current round trip time in usecs 78 * @rtt - Estimate of current round trip time in usecs
84 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 79 * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5)
85 * @ccid3hctx_s - Packet size in bytes 80 * @p - Current loss event rate (0-1) scaled by 1000000
86 * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs 81 * @s - Packet size in bytes
87 * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 82 * @t_rto - Nofeedback Timer setting in usecs
88 * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states 83 * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
89 * @ccid3hctx_last_win_count - Last window counter sent 84 * @feedback - Whether feedback has been received or not
90 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet 85 * @last_win_count - Last window counter sent
91 * with last_win_count value sent 86 * @t_last_win_count - Timestamp of earliest packet with
92 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer 87 * last_win_count value sent
93 * @ccid3hctx_t_ld - Time last doubled during slow start 88 * @no_feedback_timer - Handle to no feedback timer
94 * @ccid3hctx_t_nom - Nominal send time of next packet 89 * @t_ld - Time last doubled during slow start
95 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs 90 * @t_nom - Nominal send time of next packet
96 * @ccid3hctx_hist - Packet history 91 * @hist - Packet history
97 * @ccid3hctx_options_received - Parsed set of retrieved options
98 */ 92 */
99struct ccid3_hc_tx_sock { 93struct ccid3_hc_tx_sock {
100 struct tfrc_tx_info ccid3hctx_tfrc; 94 u64 x;
101#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x 95 u64 x_recv;
102#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv 96 u32 x_calc;
103#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc 97 u32 rtt;
104#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt 98 u16 r_sqmean;
105#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p 99 u32 p;
106#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto 100 u32 t_rto;
107#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi 101 u32 t_ipi;
108 u16 ccid3hctx_s; 102 u16 s;
109 enum ccid3_hc_tx_states ccid3hctx_state:8; 103 bool feedback:1;
110 u8 ccid3hctx_last_win_count; 104 u8 last_win_count;
111 ktime_t ccid3hctx_t_last_win_count; 105 ktime_t t_last_win_count;
112 struct timer_list ccid3hctx_no_feedback_timer; 106 struct timer_list no_feedback_timer;
113 ktime_t ccid3hctx_t_ld; 107 ktime_t t_ld;
114 ktime_t ccid3hctx_t_nom; 108 ktime_t t_nom;
115 u32 ccid3hctx_delta; 109 struct tfrc_tx_hist_entry *hist;
116 struct tfrc_tx_hist_entry *ccid3hctx_hist;
117 struct ccid3_options_received ccid3hctx_options_received;
118}; 110};
119 111
120static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 112static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -124,41 +116,32 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
124 return hctx; 116 return hctx;
125} 117}
126 118
127/* TFRC receiver states */ 119
128enum ccid3_hc_rx_states { 120enum ccid3_fback_type {
129 TFRC_RSTATE_NO_DATA = 1, 121 CCID3_FBACK_NONE = 0,
130 TFRC_RSTATE_DATA, 122 CCID3_FBACK_INITIAL,
131 TFRC_RSTATE_TERM = 127, 123 CCID3_FBACK_PERIODIC,
124 CCID3_FBACK_PARAM_CHANGE
132}; 125};
133 126
134/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 127/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
135 * 128 *
136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) 129 * @last_counter - Tracks window counter (RFC 4342, 8.1)
137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) 130 * @feedback - The type of the feedback last sent
138 * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) 131 * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
139 * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) 132 * @tstamp_last_feedback - Time at which last feedback was sent
140 * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states 133 * @hist - Packet history (loss detection + RTT sampling)
141 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes 134 * @li_hist - Loss Interval database
142 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 135 * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
143 * @ccid3hcrx_rtt - Receiver estimate of RTT
144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
146 * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
147 * @ccid3hcrx_li_hist - Loss Interval database
148 * @ccid3hcrx_s - Received packet size in bytes
149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
150 */ 136 */
151struct ccid3_hc_rx_sock { 137struct ccid3_hc_rx_sock {
152 u8 ccid3hcrx_last_counter:4; 138 u8 last_counter:4;
153 enum ccid3_hc_rx_states ccid3hcrx_state:8; 139 enum ccid3_fback_type feedback:4;
154 u32 ccid3hcrx_bytes_recv; 140 u32 x_recv;
155 u32 ccid3hcrx_x_recv; 141 ktime_t tstamp_last_feedback;
156 u32 ccid3hcrx_rtt; 142 struct tfrc_rx_hist hist;
157 ktime_t ccid3hcrx_tstamp_last_feedback; 143 struct tfrc_loss_hist li_hist;
158 struct tfrc_rx_hist ccid3hcrx_hist; 144#define p_inverse li_hist.i_mean
159 struct tfrc_loss_hist ccid3hcrx_li_hist;
160 u16 ccid3hcrx_s;
161#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
162}; 145};
163 146
164static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) 147static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 5b3ce0688c5c..b1ae8f8259e5 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
86 86
87/** 87/**
88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
89 * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev 89 * This updates I_mean as the sequence numbers increase. As a consequence, the
90 * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
91 * decreases, and thus there is no need to send renewed feedback.
90 */ 92 */
91u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 93void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
92{ 94{
93 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 95 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
94 u32 old_i_mean = lh->i_mean;
95 s64 len; 96 s64 len;
96 97
97 if (cur == NULL) /* not initialised */ 98 if (cur == NULL) /* not initialised */
98 return 0; 99 return;
100
101 /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
102 if (!dccp_data_packet(skb))
103 return;
99 104
100 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 105 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
101 106
102 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 107 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
103 return 0; 108 return;
104 109
105 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 110 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
106 /* 111 /*
@@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
114 cur->li_is_closed = 1; 119 cur->li_is_closed = 1;
115 120
116 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 121 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
117 return 0; 122 return;
118 123
119 cur->li_length = len; 124 cur->li_length = len;
120 tfrc_lh_calc_i_mean(lh); 125 tfrc_lh_calc_i_mean(lh);
121
122 return (lh->i_mean < old_i_mean);
123} 126}
124EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
125 127
126/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 128/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
127static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, 129static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -138,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
138 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 140 * @sk: Used by @calc_first_li in caller-specific way (subtyping)
139 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 141 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
140 */ 142 */
141int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 143bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
142 u32 (*calc_first_li)(struct sock *), struct sock *sk) 144 u32 (*calc_first_li)(struct sock *), struct sock *sk)
143{ 145{
144 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 146 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
145 147
146 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 148 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
147 return 0; 149 return false;
148 150
149 new = tfrc_lh_demand_next(lh); 151 new = tfrc_lh_demand_next(lh);
150 if (unlikely(new == NULL)) { 152 if (unlikely(new == NULL)) {
151 DCCP_CRIT("Cannot allocate/add loss record."); 153 DCCP_CRIT("Cannot allocate/add loss record.");
152 return 0; 154 return false;
153 } 155 }
154 156
155 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; 157 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -167,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
167 169
168 tfrc_lh_calc_i_mean(lh); 170 tfrc_lh_calc_i_mean(lh);
169 } 171 }
170 return 1; 172 return true;
171} 173}
172EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 174EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
173 175
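
The I_mean arithmetic referenced above (p = W_tot/max(I_tot0, I_tot1), i.e. p = 1/I_mean) can be modelled in user space. The sketch below is an assumption-laden model, not the in-tree tfrc_lh_calc_i_mean(): it takes the eight loss-interval weights from RFC 3448, 5.4 (scaled by 10) and illustrates why growing the open interval I_0 can only raise I_mean, hence lower p, so no renewed feedback is needed:

#include <stdio.h>

/* assumed: RFC 3448, 5.4 weights for the last n = 8 intervals, scaled by 10 */
static const unsigned w[8] = { 10, 10, 10, 10, 8, 6, 4, 2 };

/*
 * Model of the average loss interval over I[0..k], where I[0] is the
 * open (still growing) interval: I_mean = max(I_tot0, I_tot1) / W_tot.
 */
static double i_mean(const unsigned long I[], int k)
{
    double i_tot0 = 0, i_tot1 = 0, w_tot = 0;
    int i;

    for (i = 0; i <= k; i++) {
        if (i < k) {            /* I_0 .. I_{k-1} */
            i_tot0 += I[i] * w[i];
            w_tot  += w[i];
        }
        if (i > 0)              /* I_1 .. I_k */
            i_tot1 += I[i] * w[i - 1];
    }
    return (i_tot0 > i_tot1 ? i_tot0 : i_tot1) / w_tot;
}

int main(void)
{
    unsigned long I[8] = { 50, 200, 180, 220, 190, 210, 205, 195 };

    printf("I_mean = %.1f\n", i_mean(I, 7));
    I[0] = 500;                 /* open interval grows as packets arrive */
    printf("I_mean = %.1f (larger => p smaller, no new feedback)\n",
           i_mean(I, 7));
    return 0;
}
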
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 246018a3b269..d08a226db43e 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
67 67
68struct tfrc_rx_hist; 68struct tfrc_rx_hist;
69 69
70extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
71 u32 (*first_li)(struct sock *), struct sock *); 71 u32 (*first_li)(struct sock *), struct sock *);
72extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
74 74
75#endif /* _DCCP_LI_HIST_ */ 75#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 6cc108afdc3b..cce9f03bda3e 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,18 +40,6 @@
40#include "packet_history.h" 40#include "packet_history.h"
41#include "../../dccp.h" 41#include "../../dccp.h"
42 42
43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55/* 43/*
56 * Transmitter History Routines 44 * Transmitter History Routines
57 */ 45 */
@@ -73,15 +61,6 @@ void tfrc_tx_packet_history_exit(void)
73 } 61 }
74} 62}
75 63
76static struct tfrc_tx_hist_entry *
77 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
78{
79 while (head != NULL && head->seqno != seqno)
80 head = head->next;
81
82 return head;
83}
84
85int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 64int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
86{ 65{
87 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 66 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -111,25 +90,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
111} 90}
112EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 91EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
113 92
114u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
115 const ktime_t now)
116{
117 u32 rtt = 0;
118 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
119
120 if (packet != NULL) {
121 rtt = ktime_us_delta(now, packet->stamp);
122 /*
123 * Garbage-collect older (irrelevant) entries:
124 */
125 tfrc_tx_hist_purge(&packet->next);
126 }
127
128 return rtt;
129}
130EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
131
132
133/* 93/*
134 * Receiver History Routines 94 * Receiver History Routines
135 */ 95 */
@@ -191,14 +151,31 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
191} 151}
192EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 152EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
193 153
154
155static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
156{
157 struct tfrc_rx_hist_entry *tmp = h->ring[a];
158
159 h->ring[a] = h->ring[b];
160 h->ring[b] = tmp;
161}
162
194static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 163static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
195{ 164{
196 const u8 idx_a = tfrc_rx_hist_index(h, a), 165 __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a),
197 idx_b = tfrc_rx_hist_index(h, b); 166 tfrc_rx_hist_index(h, b));
198 struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; 167}
199 168
200 h->ring[idx_a] = h->ring[idx_b]; 169/**
201 h->ring[idx_b] = tmp; 170 * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling
171 * This is called after loss detection has finished, when the history entry
172 * with the index of `loss_count' holds the highest-received sequence number.
173 * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
174 */
175static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
176{
177 __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
178 h->loss_count = h->loss_start = 0;
202} 179}
203 180
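
tfrc_rx_hist_resume_rtt_sampling() relies on the ring being indexed relative to loss_start, modulo the power-of-two ring size. A self-contained model of the swap (integer values stand in for the real entry pointers):

#include <stdio.h>

#define NDUPACK 3                       /* ring holds NDUPACK + 1 = 4 entries */

struct rx_hist {
    int ring[NDUPACK + 1];
    unsigned char loss_count, loss_start;
};

/* logical index n relative to loss_start, mod ring size */
static unsigned idx(const struct rx_hist *h, unsigned n)
{
    return (h->loss_start + n) & NDUPACK;
}

static void swap_entries(struct rx_hist *h, unsigned a, unsigned b)
{
    int tmp = h->ring[a];
    h->ring[a] = h->ring[b];
    h->ring[b] = tmp;
}

/* move the highest-received entry to ring[0], then leave loss mode */
static void resume_rtt_sampling(struct rx_hist *h)
{
    swap_entries(h, 0, idx(h, h->loss_count));
    h->loss_count = h->loss_start = 0;
}

int main(void)
{
    struct rx_hist h = { .ring = { 10, 11, 12, 13 },
                         .loss_count = 2, .loss_start = 1 };

    resume_rtt_sampling(&h);            /* swaps ring[0] with ring[(1+2)&3] */
    printf("ring[0] = %d\n", h.ring[0]);        /* prints 13 */
    return 0;
}
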
204/* 181/*
@@ -215,10 +192,8 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 192 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
216 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 193 s1 = DCCP_SKB_CB(skb)->dccpd_seq;
217 194
218 if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ 195 if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */
219 h->loss_count = 1; 196 h->loss_count = 1;
220 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
221 }
222} 197}
223 198
224static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) 199static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
@@ -240,8 +215,7 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
240 215
241 if (dccp_loss_free(s2, s1, n1)) { 216 if (dccp_loss_free(s2, s1, n1)) {
242 /* hole is filled: S0, S2, and S1 are consecutive */ 217 /* hole is filled: S0, S2, and S1 are consecutive */
243 h->loss_count = 0; 218 tfrc_rx_hist_resume_rtt_sampling(h);
244 h->loss_start = tfrc_rx_hist_index(h, 1);
245 } else 219 } else
246 /* gap between S2 and S1: just update loss_prev */ 220 /* gap between S2 and S1: just update loss_prev */
247 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); 221 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -294,8 +268,7 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
294 268
295 if (dccp_loss_free(s1, s2, n2)) { 269 if (dccp_loss_free(s1, s2, n2)) {
296 /* entire hole filled by S0, S3, S1, S2 */ 270 /* entire hole filled by S0, S3, S1, S2 */
297 h->loss_start = tfrc_rx_hist_index(h, 2); 271 tfrc_rx_hist_resume_rtt_sampling(h);
298 h->loss_count = 0;
299 } else { 272 } else {
300 /* gap remains between S1 and S2 */ 273 /* gap remains between S1 and S2 */
301 h->loss_start = tfrc_rx_hist_index(h, 1); 274 h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -339,8 +312,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
339 312
340 if (dccp_loss_free(s2, s3, n3)) { 313 if (dccp_loss_free(s2, s3, n3)) {
341 /* no gap between S2 and S3: entire hole is filled */ 314 /* no gap between S2 and S3: entire hole is filled */
342 h->loss_start = tfrc_rx_hist_index(h, 3); 315 tfrc_rx_hist_resume_rtt_sampling(h);
343 h->loss_count = 0;
344 } else { 316 } else {
345 /* gap between S2 and S3 */ 317 /* gap between S2 and S3 */
346 h->loss_start = tfrc_rx_hist_index(h, 2); 318 h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -354,13 +326,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
354} 326}
355 327
356/** 328/**
357 * tfrc_rx_handle_loss - Loss detection and further processing 329 * tfrc_rx_congestion_event - Loss detection and further processing
358 * @h: The non-empty RX history object 330 * @h: The non-empty RX history object
359 * @lh: Loss Intervals database to update 331 * @lh: Loss Intervals database to update
360 * @skb: Currently received packet 332 * @skb: Currently received packet
361 * @ndp: The NDP count belonging to @skb 333 * @ndp: The NDP count belonging to @skb
362 * @calc_first_li: Caller-dependent computation of first loss interval in @lh 334 * @first_li: Caller-dependent computation of first loss interval in @lh
 363 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 335 * @sk: Used by @first_li (see tfrc_lh_interval_add)
 364 * Chooses action according to pending loss, updates LI database when a new 336 * Chooses action according to pending loss, updates LI database when a new
 365 * loss was detected, and does required post-processing. Returns 1 when caller 337 * loss was detected, and does required post-processing. Returns true when the
 366 * should send feedback, 0 otherwise. 338 * caller should send feedback, false otherwise.
@@ -368,15 +340,20 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
368 * records accordingly, the caller should not perform any more RX history 340 * records accordingly, the caller should not perform any more RX history
369 * operations when loss_count is greater than 0 after calling this function. 341 * operations when loss_count is greater than 0 after calling this function.
370 */ 342 */
371int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 343bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
372 struct tfrc_loss_hist *lh, 344 struct tfrc_loss_hist *lh,
373 struct sk_buff *skb, const u64 ndp, 345 struct sk_buff *skb, const u64 ndp,
374 u32 (*calc_first_li)(struct sock *), struct sock *sk) 346 u32 (*first_li)(struct sock *), struct sock *sk)
375{ 347{
376 int is_new_loss = 0; 348 bool new_event = false;
349
350 if (tfrc_rx_hist_duplicate(h, skb))
351 return 0;
377 352
378 if (h->loss_count == 0) { 353 if (h->loss_count == 0) {
379 __do_track_loss(h, skb, ndp); 354 __do_track_loss(h, skb, ndp);
355 tfrc_rx_hist_sample_rtt(h, skb);
356 tfrc_rx_hist_add_packet(h, skb, ndp);
380 } else if (h->loss_count == 1) { 357 } else if (h->loss_count == 1) {
381 __one_after_loss(h, skb, ndp); 358 __one_after_loss(h, skb, ndp);
382 } else if (h->loss_count != 2) { 359 } else if (h->loss_count != 2) {
@@ -385,34 +362,57 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
385 /* 362 /*
386 * Update Loss Interval database and recycle RX records 363 * Update Loss Interval database and recycle RX records
387 */ 364 */
388 is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); 365 new_event = tfrc_lh_interval_add(lh, h, first_li, sk);
389 __three_after_loss(h); 366 __three_after_loss(h);
390 } 367 }
391 return is_new_loss; 368
369 /*
370 * Update moving-average of `s' and the sum of received payload bytes.
371 */
372 if (dccp_data_packet(skb)) {
373 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
374
375 h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
376 h->bytes_recvd += payload;
377 }
378
379 /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
380 if (!new_event)
381 tfrc_lh_update_i_mean(lh, skb);
382
383 return new_event;
392} 384}
393EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); 385EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event);
394 386
395int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 387/* Compute the sending rate X_recv measured between feedback intervals */
388u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
396{ 389{
397 int i; 390 u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate;
391 s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
398 392
399 for (i = 0; i <= TFRC_NDUPACK; i++) { 393 WARN_ON(delta <= 0);
400 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); 394 /*
401 if (h->ring[i] == NULL) 395 * Ensure that the sampling interval for X_recv is at least one RTT,
402 goto out_free; 396 * by extending the sampling interval backwards in time, over the last
403 } 397 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
398 * To reduce noise (e.g. when the RTT changes often), this is only
399 * done when delta is smaller than RTT/2.
400 */
401 if (last_x_recv > 0 && delta < last_rtt/2) {
402 tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
403 (long)delta, (unsigned)last_rtt);
404 404
405 h->loss_count = h->loss_start = 0; 405 delta = (bytes ? delta : 0) + last_rtt;
406 return 0; 406 bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC);
407 }
407 408
408out_free: 409 if (unlikely(bytes == 0)) {
409 while (i-- != 0) { 410 DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv);
410 kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); 411 return last_x_recv;
411 h->ring[i] = NULL;
412 } 412 }
413 return -ENOBUFS; 413 return scaled_div32(bytes, delta);
414} 414}
415EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); 415EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv);
416 416
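
The interval-extension rule in tfrc_rx_hist_x_recv() is easiest to see with numbers. Below is a user-space model of the same arithmetic, with plain integer division substituted for the kernel's scaled_div32() and ktime helpers:

#include <stdio.h>
#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

/*
 * Model of X_recv: extend the sampling interval backwards by one RTT
 * (per rfc3448bis-06, 6.2) when the measured interval is shorter than
 * half an RTT.
 */
static uint32_t x_recv(uint64_t bytes, int64_t delta_us,
                       uint32_t last_x_recv, uint32_t last_rtt_us)
{
    if (last_x_recv > 0 && delta_us < last_rtt_us / 2) {
        delta_us = (bytes ? delta_us : 0) + last_rtt_us;
        bytes += (uint64_t)last_x_recv * last_rtt_us / USEC_PER_SEC;
    }
    if (bytes == 0)
        return last_x_recv;                     /* keep the old value */
    return bytes * USEC_PER_SEC / delta_us;     /* bytes per second */
}

int main(void)
{
    /* 1460 bytes in 20 ms, previous X_recv 100 kB/s, RTT 100 ms */
    printf("X_recv = %u B/s\n", x_recv(1460, 20000, 100000, 100000));
    return 0;
}

With 1460 bytes over 20 ms against a 100 ms RTT, the raw ratio would be 73 kB/s measured over too short a window; folding in one RTT's worth of the previous X_recv yields about 95.5 kB/s instead.
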
417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) 417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
418{ 418{
@@ -426,73 +426,81 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
426} 426}
427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); 427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
428 428
429/** 429static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
430 * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
431 */
432static inline struct tfrc_rx_hist_entry *
433 tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
434{ 430{
435 return h->ring[0]; 431 int i;
432
433 memset(h, 0, sizeof(*h));
434
435 for (i = 0; i <= TFRC_NDUPACK; i++) {
436 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
437 if (h->ring[i] == NULL) {
438 tfrc_rx_hist_purge(h);
439 return -ENOBUFS;
440 }
441 }
442 return 0;
436} 443}
437 444
438/** 445int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk)
439 * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
440 */
441static inline struct tfrc_rx_hist_entry *
442 tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
443{ 446{
444 return h->ring[h->rtt_sample_prev]; 447 if (tfrc_rx_hist_alloc(h))
448 return -ENOBUFS;
449 /*
450 * Initialise first entry with GSR to start loss detection as early as
451 * possible. Code using this must not use any other fields. The entry
452 * will be overwritten once the CCID updates its received packets.
453 */
454 tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
455 return 0;
445} 456}
457EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
446 458
447/** 459/**
448 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal 460 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
449 * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able 461 * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss
450 * to compute a sample with given data - calling function should check this. 462 * is pending and uses the following history entries (via rtt_sample_prev):
463 * - h->ring[0] contains the most recent history entry prior to @skb;
464 * - h->ring[1] is an unused `dummy' entry when the current difference is 0;
451 */ 465 */
452u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 466void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
453{ 467{
454 u32 sample = 0, 468 struct tfrc_rx_hist_entry *last = h->ring[0];
455 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, 469 u32 sample, delta_v;
456 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
457
458 if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
459 if (h->rtt_sample_prev == 2) { /* previous candidate stored */
460 sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
461 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
462 if (sample)
463 sample = 4 / sample *
464 ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
465 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
466 else /*
467 * FIXME: This condition is in principle not
468 * possible but occurs when CCID is used for
469 * two-way data traffic. I have tried to trace
470 * it, but the cause does not seem to be here.
471 */
472 DCCP_BUG("please report to dccp@vger.kernel.org"
473 " => prev = %u, last = %u",
474 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
475 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
476 } else if (delta_v < 1) {
477 h->rtt_sample_prev = 1;
478 goto keep_ref_for_next_time;
479 }
480 470
481 } else if (delta_v == 4) /* optimal match */ 471 /*
482 sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); 472 * When not to sample:
483 else { /* suboptimal match */ 473 * - on non-data packets
484 h->rtt_sample_prev = 2; 474 * (RFC 4342, 8.1: CCVal only fully defined for data packets);
485 goto keep_ref_for_next_time; 475 * - when no data packets have been received yet
486 } 476 * (FIXME: using sampled packet size as indicator here);
477 * - as long as there are gaps in the sequence space (pending loss).
478 */
479 if (!dccp_data_packet(skb) || h->packet_size == 0 ||
480 tfrc_rx_hist_loss_pending(h))
481 return;
487 482
488 if (unlikely(sample > DCCP_SANE_RTT_MAX)) { 483 h->rtt_sample_prev = 0; /* reset previous candidate */
489 DCCP_WARN("RTT sample %u too large, using max\n", sample); 484
490 sample = DCCP_SANE_RTT_MAX; 485 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval);
486 if (delta_v == 0) { /* less than RTT/4 difference */
487 h->rtt_sample_prev = 1;
488 return;
491 } 489 }
490 sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
492 491
493 h->rtt_sample_prev = 0; /* use current entry as next reference */ 492 if (delta_v <= 4) /* between RTT/4 and RTT */
494keep_ref_for_next_time: 493 sample *= 4 / delta_v;
494 else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
495 /*
 496 * Optimisation: a sample taken over more than 1 RTT of CCVal change
 497 * is only used if it is below the local RTT estimate, which then
 498 * means that the RTT estimate is too high and gets corrected.
 499 * To avoid noise, samples below RTT/2 are also discarded here.
500 */
501 return;
495 502
496 return sample; 503 /* Use a lower weight than usual to increase responsiveness */
504 h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
497} 505}
498EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); 506EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
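
The CCVal scaling above follows RFC 4342, 8.1: the sender increments the window counter every quarter-RTT, so a counter difference delta_v in [1..4] is stretched to a full RTT by the integer factor 4 / delta_v; the resulting sample is then blended into rtt_estimate via tfrc_ewma with weight 5. A user-space model of just the sampling step (the candidate-entry and loss-pending cases are omitted):

#include <stdio.h>
#include <stdint.h>

#define SUB16(a, b)     (((a) + 16 - (b)) & 0xF)

/*
 * delta_v quarter-RTTs of CCVal change elapsed over elapsed_us; scale
 * the elapsed time up to one full RTT, using the same integer factor
 * 4 / delta_v as the hunk above.
 */
static uint32_t rtt_sample(uint8_t ccval_now, uint8_t ccval_last,
                           uint32_t elapsed_us)
{
    uint32_t delta_v = SUB16(ccval_now, ccval_last);

    if (delta_v == 0 || delta_v > 4)
        return 0;               /* model: no usable sample */
    return elapsed_us * (4 / delta_v);
}

int main(void)
{
    /* two quarter-RTTs of CCVal change over 25 ms => RTT ~ 50 ms */
    printf("sample = %u us\n", rtt_sample(6, 4, 25000));
    return 0;
}
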
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 461cc91cce88..555e65cd73a0 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h" 41#include "tfrc.h"
42 42
43struct tfrc_tx_hist_entry; 43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55static inline struct tfrc_tx_hist_entry *
56 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
57{
58 while (head != NULL && head->seqno != seqno)
59 head = head->next;
60 return head;
61}
44 62
45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 63extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 64extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
49 65
50/* Subtraction a-b modulo-16, respects circular wrap-around */ 66/* Subtraction a-b modulo-16, respects circular wrap-around */
51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) 67#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
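
SUB16() is plain mod-16 subtraction, which keeps CCVal comparisons correct across the 15 -> 0 wrap:

#include <stdio.h>

#define SUB16(a, b)     (((a) + 16 - (b)) & 0xF)

int main(void)
{
    printf("%u\n", SUB16(5, 3));        /* 2: no wrap        */
    printf("%u\n", SUB16(2, 14));       /* 4: wraps past 15  */
    return 0;
}
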
@@ -75,12 +91,22 @@ struct tfrc_rx_hist_entry {
75 * @loss_count: Number of entries in circular history 91 * @loss_count: Number of entries in circular history
76 * @loss_start: Movable index (for loss detection) 92 * @loss_start: Movable index (for loss detection)
77 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry 93 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
94 * @rtt_estimate: Receiver RTT estimate
95 * @packet_size: Packet size in bytes (as per RFC 3448, 3.1)
96 * @bytes_recvd: Number of bytes received since @bytes_start
97 * @bytes_start: Start time for counting @bytes_recvd
78 */ 98 */
79struct tfrc_rx_hist { 99struct tfrc_rx_hist {
80 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; 100 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
81 u8 loss_count:2, 101 u8 loss_count:2,
82 loss_start:2; 102 loss_start:2;
103 /* Receiver RTT sampling */
83#define rtt_sample_prev loss_start 104#define rtt_sample_prev loss_start
105 u32 rtt_estimate;
106 /* Receiver sampling of application payload lengths */
107 u32 packet_size,
108 bytes_recvd;
109 ktime_t bytes_start;
84}; 110};
85 111
86/** 112/**
@@ -124,20 +150,50 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
124 return h->loss_count > 0; 150 return h->loss_count > 0;
125} 151}
126 152
153/*
154 * Accessor functions to retrieve parameters sampled by the RX history
155 */
156static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
157{
158 if (h->packet_size == 0) {
159 DCCP_WARN("No sample for s, using fallback\n");
160 return TCP_MIN_RCVMSS;
161 }
 162 return h->packet_size;
 163}
 164
165static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
166{
167 if (h->rtt_estimate == 0) {
168 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
169 return DCCP_FALLBACK_RTT;
170 }
171 return h->rtt_estimate;
172}
173
174static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
175{
176 h->bytes_recvd = 0;
177 h->bytes_start = ktime_get_real();
178}
179
180extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
181
182
127extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, 183extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
128 const struct sk_buff *skb, const u64 ndp); 184 const struct sk_buff *skb, const u64 ndp);
129 185
130extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); 186extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
131 187
132struct tfrc_loss_hist; 188struct tfrc_loss_hist;
133extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, 189extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
134 struct tfrc_loss_hist *lh, 190 struct tfrc_loss_hist *lh,
135 struct sk_buff *skb, const u64 ndp, 191 struct sk_buff *skb, const u64 ndp,
136 u32 (*first_li)(struct sock *sk), 192 u32 (*first_li)(struct sock *sk),
137 struct sock *sk); 193 struct sock *sk);
138extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 194extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
139 const struct sk_buff *skb); 195 const struct sk_buff *skb);
140extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); 196extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk);
141extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); 197extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
142 198
143#endif /* _DCCP_PKT_HIST_ */ 199#endif /* _DCCP_PKT_HIST_ */
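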
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ed9857527acf..ede12f53de5a 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -48,6 +48,21 @@ static inline u32 scaled_div32(u64 a, u64 b)
48} 48}
49 49
50/** 50/**
51 * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1
52 * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
53 * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
54 * clamped RTT samples (dccp_sample_rtt).
55 * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
56 * scaling factor is neutralised. For this purpose, it avoids returning zero.
57 */
58static inline u16 tfrc_scaled_sqrt(const u32 sample)
59{
60 const unsigned long non_zero_sample = sample ? : 1;
61
62 return int_sqrt(non_zero_sample << 10);
63}
64
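
Since int_sqrt(x << 10) equals sqrt(x) * 2^5 (ignoring truncation), the scaling factor indeed cancels in quotients. A user-space check, substituting a floating-point stand-in for the kernel's int_sqrt() (link with -lm):

#include <stdio.h>
#include <math.h>

/* user-space stand-in for the kernel's int_sqrt() */
static unsigned long int_sqrt(unsigned long x)
{
    return (unsigned long)sqrt((double)x);
}

/* as above: sqrt(x << 10), with 0 mapped to 1 to avoid returning zero */
static unsigned short scaled_sqrt(unsigned int sample)
{
    unsigned long nz = sample ? sample : 1;

    return int_sqrt(nz << 10);
}

int main(void)
{
    /* in a ratio sqrt(x)/sqrt(y) the 2^10 scaling factor cancels */
    unsigned int x = 40000, y = 10000;

    printf("%u\n", scaled_sqrt(x) / scaled_sqrt(y));    /* prints 2 */
    return 0;
}
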
65/**
51 * tfrc_ewma - Exponentially weighted moving average 66 * tfrc_ewma - Exponentially weighted moving average
52 * @weight: Weight to be used as damping factor, in units of 1/10 67 * @weight: Weight to be used as damping factor, in units of 1/10
53 */ 68 */
@@ -58,6 +73,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
58 73
59extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 74extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
60extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 75extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
76extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
61 77
62extern int tfrc_tx_packet_history_init(void); 78extern int tfrc_tx_packet_history_init(void);
63extern void tfrc_tx_packet_history_exit(void); 79extern void tfrc_tx_packet_history_exit(void);
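
The body of tfrc_ewma() is not part of this hunk; the stand-in below is an assumption matching the documented semantics (weight in units of 1/10 of the old average, first sample taken as-is):

#include <stdio.h>
#include <stdint.h>

/* assumed body: weight w keeps w/10 of the old average */
static uint32_t tfrc_ewma(uint32_t avg, uint32_t newval, uint8_t weight)
{
    return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}

int main(void)
{
    uint32_t s = 0;

    s = tfrc_ewma(s, 1000, 9);  /* first sample taken as-is: 1000 */
    s = tfrc_ewma(s, 500, 9);   /* 0.9 * 1000 + 0.1 * 500  =  950 */
    printf("s = %u\n", s);
    return 0;
}
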
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 2f20a29cffe4..38239c4d5e14 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,8 +632,16 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
632 632
633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ 633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ 634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
635 DCCP_WARN("Value of p (%d) below resolution. " 635 /*
636 "Substituting %d\n", p, TFRC_SMALLEST_P); 636 * In the congestion-avoidance phase p decays towards 0
637 * when there are no further losses, so this case is
638 * natural. Truncating to p_min = 0.01% means that the
639 * maximum achievable throughput is limited to about
640 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
641 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
642 */
643 tfrc_pr_debug("Value of p (%d) below resolution. "
644 "Substituting %d\n", p, TFRC_SMALLEST_P);
637 index = 0; 645 index = 0;
638 } else /* 0.0001 <= p <= 0.05 */ 646 } else /* 0.0001 <= p <= 0.05 */
639 index = p/TFRC_SMALLEST_P - 1; 647 index = p/TFRC_SMALLEST_P - 1;
@@ -658,7 +666,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
658 result = scaled_div(s, R); 666 result = scaled_div(s, R);
659 return scaled_div32(result, f); 667 return scaled_div32(result, f);
660} 668}
661
662EXPORT_SYMBOL_GPL(tfrc_calc_x); 669EXPORT_SYMBOL_GPL(tfrc_calc_x);
663 670
664/** 671/**
@@ -693,5 +700,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
693 index = tfrc_binsearch(fvalue, 0); 700 index = tfrc_binsearch(fvalue, 0);
694 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 701 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
695} 702}
696
697EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); 703EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
704
705/**
706 * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
707 * When @loss_event_rate is large, there is a chance that p is truncated to 0.
708 * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
709 */
710u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
711{
712 if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
713 return 0;
714 if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
715 return 1000000;
716 return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
717}
718EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
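
A quick user-space check of the mapping, assuming TFRC_SMALLEST_P corresponds to 0.01% (100 on the 10^6 scale, per the truncation comment in tfrc_calc_x above) and that scaled_div(1, x) computes 10^6 / x:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define TFRC_SMALLEST_P 100     /* assumed: 0.01% on the 10^6 scale */

/* user-space model of tfrc_invert_loss_event_rate() above */
static uint32_t invert_loss_event_rate(uint32_t loss_event_rate)
{
    uint32_t p;

    if (loss_event_rate == UINT_MAX)    /* no loss: p == 0 */
        return 0;
    if (loss_event_rate == 0)           /* 1/0 mapped to 100% */
        return 1000000;
    p = 1000000 / loss_event_rate;      /* scaled_div(1, rate) */
    return p > TFRC_SMALLEST_P ? p : TFRC_SMALLEST_P;
}

int main(void)
{
    printf("%u\n", invert_loss_event_rate(100));        /* 10000 == 1%    */
    printf("%u\n", invert_loss_event_rate(2000000));    /* clamped to 100 */
    return 0;
}
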
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index b4bc6e095a0e..5281190aa19c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -42,9 +42,11 @@
42extern int dccp_debug; 42extern int dccp_debug;
43#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) 43#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
44#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) 44#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
45#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
45#else 46#else
46#define dccp_pr_debug(format, a...) 47#define dccp_pr_debug(format, a...)
47#define dccp_pr_debug_cat(format, a...) 48#define dccp_pr_debug_cat(format, a...)
49#define dccp_debug(format, a...)
48#endif 50#endif
49 51
50extern struct inet_hashinfo dccp_hashinfo; 52extern struct inet_hashinfo dccp_hashinfo;
@@ -61,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
61 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields 63 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
62 * Hence a safe upper bound for the maximum option length is 1020-28 = 992 64 * Hence a safe upper bound for the maximum option length is 1020-28 = 992
63 */ 65 */
64#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) 66#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
65#define DCCP_MAX_PACKET_HDR 28 67#define DCCP_MAX_PACKET_HDR 28
66#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) 68#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
67#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) 69#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
68 70
71/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
72#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
73
69#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT 74#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
70 * state, about 60 seconds */ 75 * state, about 60 seconds */
71 76
@@ -81,10 +86,13 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
81 */ 86 */
82#define DCCP_RTO_MAX ((unsigned)(64 * HZ)) 87#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
83 88
89/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */
90#define DCCP_TIME_RESOLUTION 10
91
84/* 92/*
85 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 93 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
86 */ 94 */
87#define DCCP_SANE_RTT_MIN 100 95#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION)
88#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) 96#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
89#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) 97#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
90 98
@@ -95,12 +103,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
95extern int sysctl_dccp_request_retries; 103extern int sysctl_dccp_request_retries;
96extern int sysctl_dccp_retries1; 104extern int sysctl_dccp_retries1;
97extern int sysctl_dccp_retries2; 105extern int sysctl_dccp_retries2;
98extern int sysctl_dccp_feat_sequence_window;
99extern int sysctl_dccp_feat_rx_ccid;
100extern int sysctl_dccp_feat_tx_ccid;
101extern int sysctl_dccp_feat_ack_ratio;
102extern int sysctl_dccp_feat_send_ack_vector;
103extern int sysctl_dccp_feat_send_ndp_count;
104extern int sysctl_dccp_tx_qlen; 106extern int sysctl_dccp_tx_qlen;
105extern int sysctl_dccp_sync_ratelimit; 107extern int sysctl_dccp_sync_ratelimit;
106 108
@@ -235,8 +237,22 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
235extern void dccp_send_sync(struct sock *sk, const u64 seq, 237extern void dccp_send_sync(struct sock *sk, const u64 seq,
236 const enum dccp_pkt_type pkt_type); 238 const enum dccp_pkt_type pkt_type);
237 239
238extern void dccp_write_xmit(struct sock *sk, int block); 240/*
241 * TX Packet Dequeueing Interface
242 */
243extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
244extern bool dccp_qpolicy_full(struct sock *sk);
245extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
246extern struct sk_buff *dccp_qpolicy_top(struct sock *sk);
247extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
248extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
249
250/*
251 * TX Packet Output and TX Timers
252 */
253extern void dccp_write_xmit(struct sock *sk);
239extern void dccp_write_space(struct sock *sk); 254extern void dccp_write_space(struct sock *sk);
255extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
240 256
241extern void dccp_init_xmit_timers(struct sock *sk); 257extern void dccp_init_xmit_timers(struct sock *sk);
242static inline void dccp_clear_xmit_timers(struct sock *sk) 258static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -252,7 +268,8 @@ extern const char *dccp_state_name(const int state);
252extern void dccp_set_state(struct sock *sk, const int state); 268extern void dccp_set_state(struct sock *sk, const int state);
253extern void dccp_done(struct sock *sk); 269extern void dccp_done(struct sock *sk);
254 270
255extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); 271extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
272 struct sk_buff const *skb);
256 273
257extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); 274extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
258 275
@@ -317,7 +334,14 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
317extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); 334extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
318extern void dccp_send_close(struct sock *sk, const int active); 335extern void dccp_send_close(struct sock *sk, const int active);
319extern int dccp_invalid_packet(struct sk_buff *skb); 336extern int dccp_invalid_packet(struct sk_buff *skb);
320extern u32 dccp_sample_rtt(struct sock *sk, long delta); 337
338static inline u32 dccp_sane_rtt(long usec_sample)
339{
340 if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX))
341 DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample);
342 return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX);
343}
344extern u32 dccp_sample_rtt(struct sock *sk, long delta);
321 345
322static inline int dccp_bad_service_code(const struct sock *sk, 346static inline int dccp_bad_service_code(const struct sock *sk,
323 const __be32 service) 347 const __be32 service)
@@ -411,36 +435,62 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
411static inline void dccp_update_gsr(struct sock *sk, u64 seq) 435static inline void dccp_update_gsr(struct sock *sk, u64 seq)
412{ 436{
413 struct dccp_sock *dp = dccp_sk(sk); 437 struct dccp_sock *dp = dccp_sk(sk);
414 const struct dccp_minisock *dmsk = dccp_msk(sk);
415 438
416 dp->dccps_gsr = seq; 439 dp->dccps_gsr = seq;
417 dccp_set_seqno(&dp->dccps_swl, 440 /* Sequence validity window depends on remote Sequence Window (7.5.1) */
418 dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); 441 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
419 dccp_set_seqno(&dp->dccps_swh, 442 /*
420 dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); 443 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
444 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
445 * always > 32, so for the first W/W' packets in the lifetime of a
446 * connection we always have to adjust SWL.
447 * A second reason why we are doing this is that the window depends on
448 * the feature-remote value of Sequence Window: nothing stops the peer
449 * from updating this value while we are busy adjusting SWL for the
450 * first W packets (we would have to count from scratch again then).
451 * Therefore it is safer to always make sure that the Sequence Window
452 * is not artificially extended by a peer who grows SWL downwards by
453 * continually updating the feature-remote Sequence-Window.
 454 * If sequence numbers wrap, it is bad luck. But that will take a while
455 * (48 bit), and this measure prevents Sequence-number attacks.
456 */
457 if (before48(dp->dccps_swl, dp->dccps_isr))
458 dp->dccps_swl = dp->dccps_isr;
459 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
421} 460}
422 461
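
The window bounds computed above are plain 48-bit modular arithmetic. A user-space model with ADD48/SUB48 stand-ins (before48() simplified here to an ordinary comparison, which holds away from the wrap point):

#include <stdio.h>
#include <stdint.h>

/* 48-bit sequence-number arithmetic, modelled on the ADD48/SUB48 macros */
#define SEQ48_MASK      ((UINT64_C(1) << 48) - 1)
#define ADD48(a, b)     (((a) + (b)) & SEQ48_MASK)
#define SUB48(a, b)     (((a) - (b)) & SEQ48_MASK)

int main(void)
{
    uint64_t gsr = 1000, isr = 990, seq_win = 100, swl, swh;

    /* RFC 4340, 7.5.1: SWL = GSR + 1 - W/4, SWH = GSR + 3W/4 */
    swl = SUB48(ADD48(gsr, 1), seq_win / 4);
    swh = ADD48(gsr, 3 * seq_win / 4);

    /* clamp SWL to ISR, exactly as the comment above explains */
    if (swl < isr)
        swl = isr;

    printf("SWL=%llu SWH=%llu\n",       /* prints SWL=990 SWH=1075 */
           (unsigned long long)swl, (unsigned long long)swh);
    return 0;
}
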
423static inline void dccp_update_gss(struct sock *sk, u64 seq) 462static inline void dccp_update_gss(struct sock *sk, u64 seq)
424{ 463{
425 struct dccp_sock *dp = dccp_sk(sk); 464 struct dccp_sock *dp = dccp_sk(sk);
426 465
427 dp->dccps_awh = dp->dccps_gss = seq; 466 dp->dccps_gss = seq;
428 dccp_set_seqno(&dp->dccps_awl, 467 /* Ack validity window depends on local Sequence Window value (7.5.1) */
429 (dp->dccps_gss - 468 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
430 dccp_msk(sk)->dccpms_sequence_window + 1)); 469 /* Adjust AWL so that it is not below ISS - see comment above for SWL */
470 if (before48(dp->dccps_awl, dp->dccps_iss))
471 dp->dccps_awl = dp->dccps_iss;
472 dp->dccps_awh = dp->dccps_gss;
473}
474
475static inline int dccp_ackvec_pending(const struct sock *sk)
476{
477 return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
478 !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
431} 479}
432 480
433static inline int dccp_ack_pending(const struct sock *sk) 481static inline int dccp_ack_pending(const struct sock *sk)
434{ 482{
435 const struct dccp_sock *dp = dccp_sk(sk); 483 return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
436 return dp->dccps_timestamp_echo != 0 ||
437#ifdef CONFIG_IP_DCCP_ACKVEC
438 (dccp_msk(sk)->dccpms_send_ack_vector &&
439 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
440#endif
441 inet_csk_ack_scheduled(sk);
442} 484}
443 485
486extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
487extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
488extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
489extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
490 struct sk_buff *skb);
491extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
492extern void dccp_feat_list_purge(struct list_head *fn_list);
493
444extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); 494extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
445extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); 495extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
446extern int dccp_insert_option_elapsed_time(struct sock *sk, 496extern int dccp_insert_option_elapsed_time(struct sock *sk,
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index d8a3509b26f6..93aae7c95550 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
29 info->tcpi_backoff = icsk->icsk_backoff; 29 info->tcpi_backoff = icsk->icsk_backoff;
30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
31 31
32 if (dccp_msk(sk)->dccpms_send_ack_vector) 32 if (dp->dccps_hc_rx_ackvec != NULL)
33 info->tcpi_options |= TCPI_OPT_SACK; 33 info->tcpi_options |= TCPI_OPT_SACK;
34 34
35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); 35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 933a0ecf8d46..f94c7c9d1a7f 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1,11 +1,19 @@
1/* 1/*
2 * net/dccp/feat.c 2 * net/dccp/feat.c
3 * 3 *
4 * An implementation of the DCCP protocol 4 * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
5 * Andrea Bittau <a.bittau@cs.ucl.ac.uk> 5 *
6 * Copyright (c) 2008 The University of Aberdeen, Scotland, UK
7 * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
8 * Rewrote from scratch, some bits from earlier code by
9 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
10 *
6 * 11 *
7 * ASSUMPTIONS 12 * ASSUMPTIONS
8 * ----------- 13 * -----------
14 * o Feature negotiation is coordinated with connection setup (as in TCP), wild
15 * changes of parameters of an established connection are not supported.
16 * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN.
9 * o All currently known SP features have 1-byte quantities. If in the future 17 * o All currently known SP features have 1-byte quantities. If in the future
10 * extensions of RFCs 4340..42 define features with item lengths larger than 18 * extensions of RFCs 4340..42 define features with item lengths larger than
11 * one byte, a feature-specific extension of the code will be required. 19 * one byte, a feature-specific extension of the code will be required.
@@ -15,635 +23,1510 @@
15 * as published by the Free Software Foundation; either version 23 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version. 24 * 2 of the License, or (at your option) any later version.
17 */ 25 */
18
19#include <linux/module.h> 26#include <linux/module.h>
20
21#include "ccid.h" 27#include "ccid.h"
22#include "feat.h" 28#include "feat.h"
23 29
24#define DCCP_FEAT_SP_NOAGREE (-123) 30/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
25 31unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
26int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, 32int sysctl_dccp_rx_ccid __read_mostly = 2,
27 u8 *val, u8 len, gfp_t gfp) 33 sysctl_dccp_tx_ccid __read_mostly = 2;
28{
29 struct dccp_opt_pend *opt;
30
31 dccp_feat_debug(type, feature, *val);
32
33 if (len > 3) {
34 DCCP_WARN("invalid length %d\n", len);
35 return -EINVAL;
36 }
37 /* XXX add further sanity checks */
38
39 /* check if that feature is already being negotiated */
40 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
41 /* ok we found a negotiation for this option already */
42 if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
43 dccp_pr_debug("Replacing old\n");
44 /* replace */
45 BUG_ON(opt->dccpop_val == NULL);
46 kfree(opt->dccpop_val);
47 opt->dccpop_val = val;
48 opt->dccpop_len = len;
49 opt->dccpop_conf = 0;
50 return 0;
51 }
52 }
53
54 /* negotiation for a new feature */
55 opt = kmalloc(sizeof(*opt), gfp);
56 if (opt == NULL)
57 return -ENOMEM;
58
59 opt->dccpop_type = type;
60 opt->dccpop_feat = feature;
61 opt->dccpop_len = len;
62 opt->dccpop_val = val;
63 opt->dccpop_conf = 0;
64 opt->dccpop_sc = NULL;
65
66 BUG_ON(opt->dccpop_val == NULL);
67
68 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
69 return 0;
70}
71 34
72EXPORT_SYMBOL_GPL(dccp_feat_change); 35/*
73 36 * Feature activation handlers.
74static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) 37 *
38 * These all use an u64 argument, to provide enough room for NN/SP features. At
39 * this stage the negotiated values have been checked to be within their range.
40 */
41static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
75{ 42{
76 struct dccp_sock *dp = dccp_sk(sk); 43 struct dccp_sock *dp = dccp_sk(sk);
77 struct dccp_minisock *dmsk = dccp_msk(sk); 44 struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any());
78 /* figure out if we are changing our CCID or the peer's */
79 const int rx = type == DCCPO_CHANGE_R;
80 const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
81 struct ccid *new_ccid;
82
83 /* Check if nothing is being changed. */
84 if (ccid_nr == new_ccid_nr)
85 return 0;
86 45
87 new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
88 if (new_ccid == NULL) 46 if (new_ccid == NULL)
89 return -ENOMEM; 47 return -ENOMEM;
90 48
91 if (rx) { 49 if (rx) {
92 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 50 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
93 dp->dccps_hc_rx_ccid = new_ccid; 51 dp->dccps_hc_rx_ccid = new_ccid;
94 dmsk->dccpms_rx_ccid = new_ccid_nr;
95 } else { 52 } else {
96 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 53 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
97 dp->dccps_hc_tx_ccid = new_ccid; 54 dp->dccps_hc_tx_ccid = new_ccid;
98 dmsk->dccpms_tx_ccid = new_ccid_nr;
99 } 55 }
100
101 return 0; 56 return 0;
102} 57}
103 58
104static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) 59static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
105{ 60{
106 dccp_feat_debug(type, feat, val); 61 struct dccp_sock *dp = dccp_sk(sk);
107 62
108 switch (feat) { 63 if (rx) {
109 case DCCPF_CCID: 64 dp->dccps_r_seq_win = seq_win;
110 return dccp_feat_update_ccid(sk, type, val); 65 /* propagate changes to update SWL/SWH */
111 default: 66 dccp_update_gsr(sk, dp->dccps_gsr);
112 dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", 67 } else {
113 dccp_feat_typename(type), feat); 68 dp->dccps_l_seq_win = seq_win;
114 break; 69 /* propagate changes to update AWL */
70 dccp_update_gss(sk, dp->dccps_gss);
115 } 71 }
116 return 0; 72 return 0;
117} 73}
118 74
119static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, 75static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
120 u8 *rpref, u8 rlen) 76{
77#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__
78 /*
79 * FIXME: This is required until several problems in the CCID-2 code are
80 * resolved. The CCID-2 code currently does not cope well; using dynamic
81 * Ack Ratios greater than 1 caused instabilities. These were manifest
82 * in hangups and long RTO timeouts (1...3 seconds). Until this has been
83 * stabilised, it is safer not to activate dynamic Ack Ratio changes.
84 */
85 dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n",
86 rx ? "RX" : "TX", (u16)ratio);
87 ratio = 1;
88#endif
89 if (rx)
90 dccp_sk(sk)->dccps_r_ack_ratio = ratio;
91 else
92 dccp_sk(sk)->dccps_l_ack_ratio = ratio;
93 return 0;
94}
95
96static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
121{ 97{
122 struct dccp_sock *dp = dccp_sk(sk); 98 struct dccp_sock *dp = dccp_sk(sk);
123 u8 *spref, slen, *res = NULL;
124 int i, j, rc, agree = 1;
125 99
126 BUG_ON(rpref == NULL); 100 if (rx) {
101 if (enable && dp->dccps_hc_rx_ackvec == NULL) {
102 dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
103 if (dp->dccps_hc_rx_ackvec == NULL)
104 return -ENOMEM;
105 } else if (!enable) {
106 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
107 dp->dccps_hc_rx_ackvec = NULL;
108 }
109 }
110 return 0;
111}
127 112
128 /* check if we are the black sheep */ 113static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
129 if (dp->dccps_role == DCCP_ROLE_CLIENT) { 114{
130 spref = rpref; 115 if (!rx)
131 slen = rlen; 116 dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
132 rpref = opt->dccpop_val; 117 return 0;
133 rlen = opt->dccpop_len; 118}
134 } else { 119
135 spref = opt->dccpop_val; 120/*
136 slen = opt->dccpop_len; 121 * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
122 * `rx' holds when the sending peer informs about his partial coverage via a
123 * ChangeR() option. In the other case, we are the sender and the receiver
124 * announces its coverage via ChangeL() options. The policy here is to honour
125 * such communication by enabling the corresponding partial coverage - but only
 126 * if it has not been set manually before; the warning below means that the
 127 * peer will drop all packets.
128 */
129static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
130{
131 struct dccp_sock *dp = dccp_sk(sk);
132
133 if (rx)
134 dp->dccps_pcrlen = cscov;
135 else {
136 if (dp->dccps_pcslen == 0)
137 dp->dccps_pcslen = cscov;
138 else if (cscov > dp->dccps_pcslen)
139 DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
140 dp->dccps_pcslen, (u8)cscov);
137 } 141 }
142 return 0;
143}
144
145static const struct {
146 u8 feat_num; /* DCCPF_xxx */
147 enum dccp_feat_type rxtx; /* RX or TX */
148 enum dccp_feat_type reconciliation; /* SP or NN */
149 u8 default_value; /* as in 6.4 */
150 int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
151/*
152 * Lookup table for location and type of features (from RFC 4340/4342)
153 * +--------------------------+----+-----+----+----+---------+-----------+
154 * | Feature | Location | Reconc. | Initial | Section |
155 * | | RX | TX | SP | NN | Value | Reference |
156 * +--------------------------+----+-----+----+----+---------+-----------+
157 * | DCCPF_CCID | | X | X | | 2 | 10 |
158 * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
159 * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
160 * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
161 * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
162 * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
163 * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
164 * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
165 * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
166 * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
167 * +--------------------------+----+-----+----+----+---------+-----------+
168 */
169} dccp_feat_table[] = {
170 { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
171 { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
172 { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
173 { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
174 { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
175 { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
176 { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
177 { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
178 { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
179 { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
180};
181#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
182
183/**
 184 * dccp_feat_index - Map feature number into array position
185 * Returns consecutive array index or -1 if the feature is not understood.
186 */
187static int dccp_feat_index(u8 feat_num)
188{
189 /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
190 if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
191 return feat_num - 1;
192
138 /* 193 /*
139 * Now we have server preference list in spref and client preference in 194 * Other features: add cases for new feature types here after adding
140 * rpref 195 * them to the above table.
141 */ 196 */
142 BUG_ON(spref == NULL); 197 switch (feat_num) {
143 BUG_ON(rpref == NULL); 198 case DCCPF_SEND_LEV_RATE:
199 return DCCP_FEAT_SUPPORTED_MAX - 1;
200 }
201 return -1;
202}
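
A user-space model of the resulting mapping (feature numbers per RFC 4340, 6.4; the value 192 for DCCPF_SEND_LEV_RATE is an assumption based on RFC 4342, 8.4):

#include <stdio.h>

enum { DCCPF_RESERVED = 0, DCCPF_CCID = 1, DCCPF_DATA_CHECKSUM = 9,
       DCCPF_SEND_LEV_RATE = 192 };     /* 192 assumed per RFC 4342, 8.4 */

#define SUPPORTED_MAX   10              /* entries in the table above */

/* user-space model of dccp_feat_index() above */
static int feat_index(unsigned char feat_num)
{
    if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
        return feat_num - 1;
    if (feat_num == DCCPF_SEND_LEV_RATE)
        return SUPPORTED_MAX - 1;
    return -1;                          /* feature not understood */
}

int main(void)
{
    printf("%d\n", feat_index(DCCPF_CCID));             /*  0 */
    printf("%d\n", feat_index(DCCPF_SEND_LEV_RATE));    /*  9 */
    printf("%d\n", feat_index(200));                    /* -1 */
    return 0;
}
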
144 203
145 /* FIXME sanity check vals */ 204static u8 dccp_feat_type(u8 feat_num)
205{
206 int idx = dccp_feat_index(feat_num);
146 207
147 /* Are values in any order? XXX Lame "algorithm" here */ 208 if (idx < 0)
148 for (i = 0; i < slen; i++) { 209 return FEAT_UNKNOWN;
149 for (j = 0; j < rlen; j++) { 210 return dccp_feat_table[idx].reconciliation;
150 if (spref[i] == rpref[j]) { 211}
151 res = &spref[i];
152 break;
153 }
154 }
155 if (res)
156 break;
157 }
158 212
159 /* we didn't agree on anything */ 213static int dccp_feat_default_value(u8 feat_num)
160 if (res == NULL) { 214{
161 /* confirm previous value */ 215 int idx = dccp_feat_index(feat_num);
162 switch (opt->dccpop_feat) {
163 case DCCPF_CCID:
164 /* XXX did i get this right? =P */
165 if (opt->dccpop_type == DCCPO_CHANGE_L)
166 res = &dccp_msk(sk)->dccpms_tx_ccid;
167 else
168 res = &dccp_msk(sk)->dccpms_rx_ccid;
169 break;
170 216
 171 default: 217 return idx < 0 ? 0 : dccp_feat_table[idx].default_value;
172 DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); 218}
173 /* XXX implement res */
174 return -EFAULT;
175 }
176 219
177 dccp_pr_debug("Don't agree... reconfirming %d\n", *res); 220/*
178 agree = 0; /* this is used for mandatory options... */ 221 * Debugging and verbose-printing section
222 */
223static const char *dccp_feat_fname(const u8 feat)
224{
225 static const char *feature_names[] = {
226 [DCCPF_RESERVED] = "Reserved",
227 [DCCPF_CCID] = "CCID",
228 [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
229 [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
230 [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
231 [DCCPF_ACK_RATIO] = "Ack Ratio",
232 [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
233 [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
234 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
235 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
236 };
237 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
238 return feature_names[DCCPF_RESERVED];
239
240 if (feat == DCCPF_SEND_LEV_RATE)
241 return "Send Loss Event Rate";
242 if (feat >= DCCPF_MIN_CCID_SPECIFIC)
243 return "CCID-specific";
244
245 return feature_names[feat];
246}
247
248static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING",
249 "UNSTABLE", "STABLE" };
250
251#ifdef CONFIG_IP_DCCP_DEBUG
252static const char *dccp_feat_oname(const u8 opt)
253{
254 switch (opt) {
255 case DCCPO_CHANGE_L: return "Change_L";
256 case DCCPO_CONFIRM_L: return "Confirm_L";
257 case DCCPO_CHANGE_R: return "Change_R";
258 case DCCPO_CONFIRM_R: return "Confirm_R";
179 } 259 }
260 return NULL;
261}
180 262
181 /* need to put result and our preference list */ 263static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
182 rlen = 1 + opt->dccpop_len; 264{
183 rpref = kmalloc(rlen, GFP_ATOMIC); 265 u8 i, type = dccp_feat_type(feat_num);
184 if (rpref == NULL) 266
185 return -ENOMEM; 267 if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
268 dccp_pr_debug_cat("(NULL)");
269 else if (type == FEAT_SP)
270 for (i = 0; i < val->sp.len; i++)
271 dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
272 else if (type == FEAT_NN)
273 dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
274 else
275 dccp_pr_debug_cat("unknown type %u", type);
276}
277
278static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
279{
280 u8 type = dccp_feat_type(feat_num);
281 dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
282
283 if (type == FEAT_NN)
284 fval.nn = dccp_decode_value_var(list, len);
285 dccp_feat_printval(feat_num, &fval);
286}
287
288static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
289{
290 dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
291 dccp_feat_fname(entry->feat_num));
292 dccp_feat_printval(entry->feat_num, &entry->val);
293 dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
294 entry->needs_confirm ? "(Confirm pending)" : "");
295}
296
297#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
298 dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
299 dccp_feat_printvals(feat, val, len); \
300 dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
301
302#define dccp_feat_print_fnlist(fn_list) { \
303 const struct dccp_feat_entry *___entry; \
304 \
305 dccp_pr_debug("List Dump:\n"); \
306 list_for_each_entry(___entry, fn_list, node) \
307 dccp_feat_print_entry(___entry); \
308}
309#else /* ! CONFIG_IP_DCCP_DEBUG */
310#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
311#define dccp_feat_print_fnlist(fn_list)
312#endif
186 313
187 *rpref = *res; 314static int __dccp_feat_activate(struct sock *sk, const int idx,
188 memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); 315 const bool is_local, dccp_feat_val const *fval)
316{
317 bool rx;
318 u64 val;
319
320 if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
321 return -1;
322 if (dccp_feat_table[idx].activation_hdlr == NULL)
323 return 0;
189 324
190 /* put it in the "confirm queue" */ 325 if (fval == NULL) {
191 if (opt->dccpop_sc == NULL) { 326 val = dccp_feat_table[idx].default_value;
192 opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); 327 } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
193 if (opt->dccpop_sc == NULL) { 328 if (fval->sp.vec == NULL) {
194 kfree(rpref); 329 /*
195 return -ENOMEM; 330 * This can happen when an empty Confirm is sent
331 * for an SP (i.e. known) feature. In this case
332 * we would be using the default anyway.
333 */
334 DCCP_CRIT("Feature #%d undefined: using default", idx);
335 val = dccp_feat_table[idx].default_value;
336 } else {
337 val = fval->sp.vec[0];
196 } 338 }
197 } else { 339 } else {
198 /* recycle the confirm slot */ 340 val = fval->nn;
199 BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
200 kfree(opt->dccpop_sc->dccpoc_val);
201 dccp_pr_debug("recycling confirm slot\n");
202 }
203 memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
204
205 opt->dccpop_sc->dccpoc_val = rpref;
206 opt->dccpop_sc->dccpoc_len = rlen;
207
208 /* update the option on our side [we are about to send the confirm] */
209 rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
210 if (rc) {
211 kfree(opt->dccpop_sc->dccpoc_val);
212 kfree(opt->dccpop_sc);
213 opt->dccpop_sc = NULL;
214 return rc;
215 } 341 }
216 342
217 dccp_pr_debug("Will confirm %d\n", *rpref); 343 /* Location is RX if this is a local-RX or remote-TX feature */
344 rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
218 345
219 /* say we want to change to X but we just got a confirm X, suppress our 346 dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
220 * change 347 dccp_feat_fname(dccp_feat_table[idx].feat_num),
221 */ 348 fval ? "" : "default ", (unsigned long long)val);
222 if (!opt->dccpop_conf) { 349
223 if (*opt->dccpop_val == *res) 350 return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
224 opt->dccpop_conf = 1; 351}
225 dccp_pr_debug("won't ask for change of same feature\n"); 352
353/**
354 * dccp_feat_activate - Activate feature value on socket
355 * @sk: fully connected DCCP socket (after handshake is complete)
356 * @feat_num: feature to activate, one of %dccp_feature_numbers
357 * @local: whether local (1) or remote (0) @feat_num is meant
358 * @fval: the value (SP or NN) to activate, or NULL to use the default value
359 * For general use this function is preferable over __dccp_feat_activate().
360 */
361static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
362 dccp_feat_val const *fval)
363{
364 return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
365}
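For illustration, an activation handler of the kind dispatched above could look like the sketch below. The handler name and the two socket fields are hypothetical (not part of this patch); only the (sk, val, rx) calling convention is taken from the table dispatch in __dccp_feat_activate().

/* Hypothetical activation handler for a boolean per-half-connection feature */
static int dccp_hdlr_example_bool(struct sock *sk, u64 enable, bool rx)
{
        struct dccp_sock *dp = dccp_sk(sk);

        if (rx)
                dp->dccps_example_rx = (enable > 0);    /* hypothetical field */
        else
                dp->dccps_example_tx = (enable > 0);    /* hypothetical field */
        return 0;
}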
366
367/* Test for "Req'd" feature (RFC 4340, 6.4) */
368static inline int dccp_feat_must_be_understood(u8 feat_num)
369{
370 return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
371 feat_num == DCCPF_SEQUENCE_WINDOW;
372}
373
374/* copy constructor, fval must not already contain allocated memory */
375static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
376{
377 fval->sp.len = len;
378 if (fval->sp.len > 0) {
379 fval->sp.vec = kmemdup(val, len, gfp_any());
380 if (fval->sp.vec == NULL) {
381 fval->sp.len = 0;
382 return -ENOBUFS;
383 }
226 } 384 }
385 return 0;
386}
227 387
228 return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ 388static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
389{
390 if (unlikely(val == NULL))
391 return;
392 if (dccp_feat_type(feat_num) == FEAT_SP)
393 kfree(val->sp.vec);
394 memset(val, 0, sizeof(*val));
229} 395}
230 396
231static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 397static struct dccp_feat_entry *
398 dccp_feat_clone_entry(struct dccp_feat_entry const *original)
232{ 399{
233 struct dccp_minisock *dmsk = dccp_msk(sk); 400 struct dccp_feat_entry *new;
234 struct dccp_opt_pend *opt; 401 u8 type = dccp_feat_type(original->feat_num);
235 int rc = 1;
236 u8 t;
237 402
238 /* 403 if (type == FEAT_UNKNOWN)
239 * We received a CHANGE. We gotta match it against our own preference 404 return NULL;
240 * list. If we got a CHANGE_R it means it's a change for us, so we need
241 * to compare our CHANGE_L list.
242 */
243 if (type == DCCPO_CHANGE_L)
244 t = DCCPO_CHANGE_R;
245 else
246 t = DCCPO_CHANGE_L;
247 405
248 /* find our preference list for this feature */ 406 new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
249 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 407 if (new == NULL)
250 if (opt->dccpop_type != t || opt->dccpop_feat != feature) 408 return NULL;
251 continue;
252 409
253 /* find the winner from the two preference lists */ 410 if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
254 rc = dccp_feat_reconcile(sk, opt, val, len); 411 original->val.sp.vec,
255 break; 412 original->val.sp.len)) {
413 kfree(new);
414 return NULL;
256 } 415 }
416 return new;
417}
257 418
258 /* We didn't deal with the change. This can happen if we have no 419static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
259 * preference list for the feature. In fact, it just shouldn't 420{
260 * happen---if we understand a feature, we should have a preference list 421 if (entry != NULL) {
261 * with at least the default value. 422 dccp_feat_val_destructor(entry->feat_num, &entry->val);
262 */ 423 kfree(entry);
263 BUG_ON(rc == 1); 424 }
425}
264 426
265 return rc; 427/*
428 * List management functions
429 *
430 * Feature negotiation lists rely on and maintain the following invariants:
431 * - each feat_num in the list is known, i.e. we know its type and default value
432 * - each feat_num/is_local combination is unique (old entries are overwritten)
433 * - SP values are always freshly allocated
434 * - list is sorted in increasing order of feature number (faster lookup)
435 */
436static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
437 u8 feat_num, bool is_local)
438{
439 struct dccp_feat_entry *entry;
440
441 list_for_each_entry(entry, fn_list, node)
442 if (entry->feat_num == feat_num && entry->is_local == is_local)
443 return entry;
444 else if (entry->feat_num > feat_num)
445 break;
446 return NULL;
266} 447}
267 448
268static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 449/**
450 * dccp_feat_entry_new - Central list update routine (called by all others)
451 * @head: list to add to
452 * @feat: feature number
453 * @local: whether the local (1) or remote feature with number @feat is meant
454 * This is the only constructor and serves to ensure the above invariants.
455 */
456static struct dccp_feat_entry *
457 dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
269{ 458{
270 struct dccp_opt_pend *opt; 459 struct dccp_feat_entry *entry;
271 struct dccp_minisock *dmsk = dccp_msk(sk); 460
272 u8 *copy; 461 list_for_each_entry(entry, head, node)
273 int rc; 462 if (entry->feat_num == feat && entry->is_local == local) {
463 dccp_feat_val_destructor(entry->feat_num, &entry->val);
464 return entry;
465 } else if (entry->feat_num > feat) {
466 head = &entry->node;
467 break;
468 }
274 469
275 /* NN features must be Change L (sec. 6.3.2) */ 470 entry = kmalloc(sizeof(*entry), gfp_any());
276 if (type != DCCPO_CHANGE_L) { 471 if (entry != NULL) {
277 dccp_pr_debug("received %s for NN feature %d\n", 472 entry->feat_num = feat;
278 dccp_feat_typename(type), feature); 473 entry->is_local = local;
279 return -EFAULT; 474 list_add_tail(&entry->node, head);
280 } 475 }
476 return entry;
477}
281 478
282 /* XXX sanity check opt val */ 479/**
480 * dccp_feat_push_change - Add/overwrite a Change option in the list
481 * @fn_list: feature-negotiation list to update
482 * @feat: one of %dccp_feature_numbers
483 * @local: whether local (1) or remote (0) @feat_num is meant
 484 * @mandatory: whether to use Mandatory feature negotiation options
485 * @fval: pointer to NN/SP value to be inserted (will be copied)
486 */
487static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
488 u8 mandatory, dccp_feat_val *fval)
489{
490 struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
283 491
284 /* copy option so we can confirm it */ 492 if (new == NULL)
285 opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
286 if (opt == NULL)
287 return -ENOMEM; 493 return -ENOMEM;
288 494
289 copy = kmemdup(val, len, GFP_ATOMIC); 495 new->feat_num = feat;
290 if (copy == NULL) { 496 new->is_local = local;
291 kfree(opt); 497 new->state = FEAT_INITIALISING;
292 return -ENOMEM; 498 new->needs_confirm = 0;
293 } 499 new->empty_confirm = 0;
500 new->val = *fval;
501 new->needs_mandatory = mandatory;
294 502
295 opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ 503 return 0;
296 opt->dccpop_feat = feature; 504}
297 opt->dccpop_val = copy;
298 opt->dccpop_len = len;
299 505
300 /* change feature */ 506/**
301 rc = dccp_feat_update(sk, type, feature, *val); 507 * dccp_feat_push_confirm - Add a Confirm entry to the FN list
302 if (rc) { 508 * @fn_list: feature-negotiation list to add to
303 kfree(opt->dccpop_val); 509 * @feat: one of %dccp_feature_numbers
304 kfree(opt); 510 * @local: whether local (1) or remote (0) @feat_num is being confirmed
305 return rc; 511 * @fval: pointer to NN/SP value to be inserted or NULL
306 } 512 * Returns 0 on success, a Reset code for further processing otherwise.
513 */
514static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
515 dccp_feat_val *fval)
516{
517 struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
307 518
308 dccp_feat_debug(type, feature, *copy); 519 if (new == NULL)
520 return DCCP_RESET_CODE_TOO_BUSY;
309 521
310 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 522 new->feat_num = feat;
523 new->is_local = local;
524 new->state = FEAT_STABLE; /* transition in 6.6.2 */
525 new->needs_confirm = 1;
526 new->empty_confirm = (fval == NULL);
527 new->val.nn = 0; /* zeroes the whole structure */
528 if (!new->empty_confirm)
529 new->val = *fval;
530 new->needs_mandatory = 0;
311 531
312 return 0; 532 return 0;
313} 533}
314 534
315static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, 535static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
316 u8 type, u8 feature)
317{ 536{
318 /* XXX check if other confirms for that are queued and recycle slot */ 537 return dccp_feat_push_confirm(fn_list, feat, local, NULL);
319 struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); 538}
320 539
321 if (opt == NULL) { 540static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
322 /* XXX what do we do? Ignoring should be fine. It's a change 541{
323 * after all =P 542 list_del(&entry->node);
324 */ 543 dccp_feat_entry_destructor(entry);
325 return; 544}
326 }
327 545
328 switch (type) { 546void dccp_feat_list_purge(struct list_head *fn_list)
329 case DCCPO_CHANGE_L: 547{
330 opt->dccpop_type = DCCPO_CONFIRM_R; 548 struct dccp_feat_entry *entry, *next;
331 break; 549
332 case DCCPO_CHANGE_R: 550 list_for_each_entry_safe(entry, next, fn_list, node)
333 opt->dccpop_type = DCCPO_CONFIRM_L; 551 dccp_feat_entry_destructor(entry);
334 break; 552 INIT_LIST_HEAD(fn_list);
335 default: 553}
336 DCCP_WARN("invalid type %d\n", type); 554EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
337 kfree(opt); 555
338 return; 556/* generate @to as full clone of @from - @to must not contain any nodes */
557int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
558{
559 struct dccp_feat_entry *entry, *new;
560
561 INIT_LIST_HEAD(to);
562 list_for_each_entry(entry, from, node) {
563 new = dccp_feat_clone_entry(entry);
564 if (new == NULL)
565 goto cloning_failed;
566 list_add_tail(&new->node, to);
339 } 567 }
340 opt->dccpop_feat = feature; 568 return 0;
341 opt->dccpop_val = NULL;
342 opt->dccpop_len = 0;
343 569
344 /* change feature */ 570cloning_failed:
345 dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); 571 dccp_feat_list_purge(to);
572 return -ENOMEM;
573}
346 574
347 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); 575/**
576 * dccp_feat_valid_nn_length - Enforce length constraints on NN options
 577 * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only;
 578 * incoming options are accepted as long as their values are valid.
579 */
580static u8 dccp_feat_valid_nn_length(u8 feat_num)
581{
582 if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
583 return 2;
584 if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
585 return 6;
586 return 0;
348} 587}
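To make the 2- and 6-byte limits above concrete: NN values are serialised big-endian into exactly that many bytes via dccp_encode_value_var() (declared in feat.h, implemented in net/dccp/options.c). A minimal sketch of such an encoder, for illustration only:

/* Illustrative only: store @value big-endian in exactly @len bytes */
static void example_encode_value_var(u64 value, u8 *to, u8 len)
{
        while (len-- > 0)
                *to++ = (value >> (8 * len)) & 0xff;
}

For example, an Ack Ratio of 4 travels as the two bytes 00 04, and a Sequence Window of 100 as the six bytes 00 00 00 00 00 64.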
349 588
350static void dccp_feat_flush_confirm(struct sock *sk) 589static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
351{ 590{
352 struct dccp_minisock *dmsk = dccp_msk(sk); 591 switch (feat_num) {
353 /* Check if there is anything to confirm in the first place */ 592 case DCCPF_ACK_RATIO:
354 int yes = !list_empty(&dmsk->dccpms_conf); 593 return val <= DCCPF_ACK_RATIO_MAX;
594 case DCCPF_SEQUENCE_WINDOW:
595 return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
596 }
597 return 0; /* feature unknown - so we can't tell */
598}
355 599
356 if (!yes) { 600/* check that SP values are within the ranges defined in RFC 4340 */
357 struct dccp_opt_pend *opt; 601static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
602{
603 switch (feat_num) {
604 case DCCPF_CCID:
605 return val == DCCPC_CCID2 || val == DCCPC_CCID3;
606 /* Type-check Boolean feature values: */
607 case DCCPF_SHORT_SEQNOS:
608 case DCCPF_ECN_INCAPABLE:
609 case DCCPF_SEND_ACK_VECTOR:
610 case DCCPF_SEND_NDP_COUNT:
611 case DCCPF_DATA_CHECKSUM:
612 case DCCPF_SEND_LEV_RATE:
613 return val < 2;
614 case DCCPF_MIN_CSUM_COVER:
615 return val < 16;
616 }
617 return 0; /* feature unknown */
618}
619
620static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
621{
622 if (sp_list == NULL || sp_len < 1)
623 return 0;
624 while (sp_len--)
625 if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
626 return 0;
627 return 1;
628}
358 629
359 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 630/**
360 if (opt->dccpop_conf) { 631 * dccp_feat_insert_opts - Generate FN options from current list state
361 yes = 1; 632 * @skb: next sk_buff to be sent to the peer
362 break; 633 * @dp: for client during handshake and general negotiation
634 * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
635 */
636int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
637 struct sk_buff *skb)
638{
639 struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
640 struct dccp_feat_entry *pos, *next;
641 u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
642 bool rpt;
643
644 /* put entries into @skb in the order they appear in the list */
645 list_for_each_entry_safe_reverse(pos, next, fn, node) {
646 opt = dccp_feat_genopt(pos);
647 type = dccp_feat_type(pos->feat_num);
648 rpt = false;
649
650 if (pos->empty_confirm) {
651 len = 0;
652 ptr = NULL;
653 } else {
654 if (type == FEAT_SP) {
655 len = pos->val.sp.len;
656 ptr = pos->val.sp.vec;
657 rpt = pos->needs_confirm;
658 } else if (type == FEAT_NN) {
659 len = dccp_feat_valid_nn_length(pos->feat_num);
660 ptr = nn_in_nbo;
661 dccp_encode_value_var(pos->val.nn, ptr, len);
662 } else {
663 DCCP_BUG("unknown feature %u", pos->feat_num);
664 return -1;
363 } 665 }
364 } 666 }
667 dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
668
669 if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
670 return -1;
671 if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
672 return -1;
673 /*
674 * Enter CHANGING after transmitting the Change option (6.6.2).
675 */
676 if (pos->state == FEAT_INITIALISING)
677 pos->state = FEAT_CHANGING;
365 } 678 }
679 return 0;
680}
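For a concrete picture of the wire format this generates (assuming the option numbers of RFC 4340, 6.1: Change L = 32, Confirm L = 33, Change R = 34, Confirm R = 35): a Change L for CCID with preference list {2, 3} is emitted by dccp_insert_fn_opt() as the five bytes

        32 05 01 02 03  /* type, option length, feature number, SP values */

while a Confirm of an SP feature additionally repeats the agreed value ahead of the preference list (the @rpt flag above).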
366 681
367 if (!yes) 682/**
368 return; 683 * __feat_register_nn - Register new NN value on socket
684 * @fn: feature-negotiation list to register with
685 * @feat: an NN feature from %dccp_feature_numbers
686 * @mandatory: use Mandatory option if 1
687 * @nn_val: value to register (restricted to 4 bytes)
688 * Note that NN features are local by definition (RFC 4340, 6.3.2).
689 */
690static int __feat_register_nn(struct list_head *fn, u8 feat,
691 u8 mandatory, u64 nn_val)
692{
693 dccp_feat_val fval = { .nn = nn_val };
369 694
370 /* OK there is something to confirm... */ 695 if (dccp_feat_type(feat) != FEAT_NN ||
371 /* XXX check if packet is in flight? Send delayed ack?? */ 696 !dccp_feat_is_valid_nn_val(feat, nn_val))
372 if (sk->sk_state == DCCP_OPEN) 697 return -EINVAL;
373 dccp_send_ack(sk); 698
699 /* Don't bother with default values, they will be activated anyway. */
700 if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
701 return 0;
702
703 return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
374} 704}
375 705
376int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) 706/**
707 * __feat_register_sp - Register new SP value/list on socket
708 * @fn: feature-negotiation list to register with
709 * @feat: an SP feature from %dccp_feature_numbers
710 * @is_local: whether the local (1) or the remote (0) @feat is meant
711 * @mandatory: use Mandatory option if 1
712 * @sp_val: SP value followed by optional preference list
713 * @sp_len: length of @sp_val in bytes
714 */
715static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
716 u8 mandatory, u8 const *sp_val, u8 sp_len)
377{ 717{
378 int rc; 718 dccp_feat_val fval;
379 719
380 dccp_feat_debug(type, feature, *val); 720 if (dccp_feat_type(feat) != FEAT_SP ||
721 !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
722 return -EINVAL;
381 723
382 /* figure out if it's SP or NN feature */ 724 /* Avoid negotiating alien CCIDs by only advertising supported ones */
383 switch (feature) { 725 if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
384 /* deal with SP features */ 726 return -EOPNOTSUPP;
385 case DCCPF_CCID:
386 rc = dccp_feat_sp(sk, type, feature, val, len);
387 break;
388 727
389 /* deal with NN features */ 728 if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
390 case DCCPF_ACK_RATIO: 729 return -ENOMEM;
391 rc = dccp_feat_nn(sk, type, feature, val, len);
392 break;
393 730
394 /* XXX implement other features */ 731 return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
395 default: 732}
396 dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", 733
397 dccp_feat_typename(type), feature); 734/**
398 rc = -EFAULT; 735 * dccp_feat_register_sp - Register requests to change SP feature values
399 break; 736 * @sk: client or listening socket
737 * @feat: one of %dccp_feature_numbers
738 * @is_local: whether the local (1) or remote (0) @feat is meant
739 * @list: array of preferred values, in descending order of preference
740 * @len: length of @list in bytes
741 */
742int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
743 u8 const *list, u8 len)
744{ /* any changes must be registered before establishing the connection */
745 if (sk->sk_state != DCCP_CLOSED)
746 return -EISCONN;
747 if (dccp_feat_type(feat) != FEAT_SP)
748 return -EINVAL;
749 return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
750 0, list, len);
751}
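A sketch of a typical caller, e.g. from a socket-option handler (the wrapper below is illustrative; the real call sites are in net/dccp/proto.c, which this patch also touches):

/* Illustrative: advertise CCID-3 for the TX side, with CCID-2 as fallback */
static int example_set_tx_ccid(struct sock *sk)
{
        u8 prefs[] = { DCCPC_CCID3, DCCPC_CCID2 };      /* descending preference */

        return dccp_feat_register_sp(sk, DCCPF_CCID, 1, prefs, sizeof(prefs));
}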
752
753/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
754int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
755{
756 /* any changes must be registered before establishing the connection */
757 if (sk->sk_state != DCCP_CLOSED)
758 return -EISCONN;
759 if (dccp_feat_type(feat) != FEAT_NN)
760 return -EINVAL;
761 return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
762}
763
764/**
765 * dccp_feat_signal_nn_change - Update NN values for an established connection
766 * @sk: DCCP socket of an established connection
767 * @feat: NN feature number from %dccp_feature_numbers
768 * @nn_val: the new value to use
769 * This function is used to communicate NN updates out-of-band. The difference
770 * to feature negotiation during connection setup is that values are activated
771 * immediately after validation, i.e. we don't wait for the Confirm: either the
772 * value is accepted by the peer (and then the waiting is futile), or it is not
773 * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values
774 * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2).
775 */
776int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
777{
778 struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
779 dccp_feat_val fval = { .nn = nn_val };
780 struct dccp_feat_entry *entry;
781
782 if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
783 return 0;
784
785 if (dccp_feat_type(feat) != FEAT_NN ||
786 !dccp_feat_is_valid_nn_val(feat, nn_val))
787 return -EINVAL;
788
789 entry = dccp_feat_list_lookup(fn, feat, 1);
790 if (entry != NULL) {
791 dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n",
792 (unsigned long long)nn_val,
793 (unsigned long long)entry->val.nn,
794 dccp_feat_sname[entry->state]);
795 return 0;
400 } 796 }
401 797
402 /* check if there were problems changing features */ 798 if (dccp_feat_activate(sk, feat, 1, &fval))
403 if (rc) { 799 return -EADV;
404 /* If we don't agree on SP, we sent a confirm for old value. 800
405 * However we propagate rc to caller in case option was 801 inet_csk_schedule_ack(sk);
406 * mandatory 802 return dccp_feat_push_change(fn, feat, 1, 0, &fval);
803}
804EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
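As a usage sketch (the caller below is hypothetical): a congestion-control module could raise the Ack Ratio of a live connection along these lines, whereby the value is validated and activated locally and a Change L is queued for the peer:

/* Illustrative: double the Ack Ratio, staying within the feature limit */
static void example_raise_ack_ratio(struct sock *sk, u16 cur_ratio)
{
        u64 new_ratio = min_t(u64, 2 * cur_ratio, DCCPF_ACK_RATIO_MAX);

        if (dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, new_ratio))
                DCCP_WARN("Could not update Ack Ratio\n");
}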
805
806/*
 807 * Tracking features whose values depend on the choice of CCID
808 *
809 * This is designed with an extension in mind so that a list walk could be done
810 * before activating any features. However, the existing framework was found to
 811 * work satisfactorily up until now, so the automatic verification is left open.
812 * When adding new CCIDs, add a corresponding dependency table here.
813 */
814static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
815{
816 static const struct ccid_dependency ccid2_dependencies[2][2] = {
817 /*
818 * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
819 * feature and Send Ack Vector is an RX feature, `is_local'
820 * needs to be reversed.
407 */ 821 */
408 if (rc != DCCP_FEAT_SP_NOAGREE) 822 { /* Dependencies of the receiver-side (remote) CCID2 */
409 dccp_feat_empty_confirm(dccp_msk(sk), type, feature); 823 {
824 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
825 .is_local = true,
826 .is_mandatory = true,
827 .val = 1
828 },
829 { 0, 0, 0, 0 }
830 },
831 { /* Dependencies of the sender-side (local) CCID2 */
832 {
833 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
834 .is_local = false,
835 .is_mandatory = true,
836 .val = 1
837 },
838 { 0, 0, 0, 0 }
839 }
840 };
841 static const struct ccid_dependency ccid3_dependencies[2][5] = {
842 { /*
843 * Dependencies of the receiver-side CCID3
844 */
845 { /* locally disable Ack Vectors */
846 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
847 .is_local = true,
848 .is_mandatory = false,
849 .val = 0
850 },
851 { /* see below why Send Loss Event Rate is on */
852 .dependent_feat = DCCPF_SEND_LEV_RATE,
853 .is_local = true,
854 .is_mandatory = true,
855 .val = 1
856 },
857 { /* NDP Count is needed as per RFC 4342, 6.1.1 */
858 .dependent_feat = DCCPF_SEND_NDP_COUNT,
859 .is_local = false,
860 .is_mandatory = true,
861 .val = 1
862 },
863 { 0, 0, 0, 0 },
864 },
865 { /*
866 * CCID3 at the TX side: we request that the HC-receiver
867 * will not send Ack Vectors (they will be ignored, so
868 * Mandatory is not set); we enable Send Loss Event Rate
869 * (Mandatory since the implementation does not support
870 * the Loss Intervals option of RFC 4342, 8.6).
871 * The last two options are for peer's information only.
872 */
873 {
874 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
875 .is_local = false,
876 .is_mandatory = false,
877 .val = 0
878 },
879 {
880 .dependent_feat = DCCPF_SEND_LEV_RATE,
881 .is_local = false,
882 .is_mandatory = true,
883 .val = 1
884 },
885 { /* this CCID does not support Ack Ratio */
886 .dependent_feat = DCCPF_ACK_RATIO,
887 .is_local = true,
888 .is_mandatory = false,
889 .val = 0
890 },
891 { /* tell receiver we are sending NDP counts */
892 .dependent_feat = DCCPF_SEND_NDP_COUNT,
893 .is_local = true,
894 .is_mandatory = false,
895 .val = 1
896 },
897 { 0, 0, 0, 0 }
898 }
899 };
900 switch (ccid) {
901 case DCCPC_CCID2:
902 return ccid2_dependencies[is_local];
903 case DCCPC_CCID3:
904 return ccid3_dependencies[is_local];
905 default:
906 return NULL;
410 } 907 }
908}
411 909
412 /* generate the confirm [if required] */ 910/**
413 dccp_feat_flush_confirm(sk); 911 * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
414 912 * @fn: feature-negotiation list to update
913 * @id: CCID number to track
914 * @is_local: whether TX CCID (1) or RX CCID (0) is meant
915 * This function needs to be called after registering all other features.
916 */
917static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
918{
919 const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
920 int i, rc = (table == NULL);
921
922 for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
923 if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
924 rc = __feat_register_sp(fn, table[i].dependent_feat,
925 table[i].is_local,
926 table[i].is_mandatory,
927 &table[i].val, 1);
928 else
929 rc = __feat_register_nn(fn, table[i].dependent_feat,
930 table[i].is_mandatory,
931 table[i].val);
415 return rc; 932 return rc;
416} 933}
417 934
418EXPORT_SYMBOL_GPL(dccp_feat_change_recv); 935/**
936 * dccp_feat_finalise_settings - Finalise settings before starting negotiation
937 * @dp: client or listening socket (settings will be inherited)
938 * This is called after all registrations (socket initialisation, sysctls, and
939 * sockopt calls), and before sending the first packet containing Change options
940 * (ie. client-Request or server-Response), to ensure internal consistency.
941 */
942int dccp_feat_finalise_settings(struct dccp_sock *dp)
943{
944 struct list_head *fn = &dp->dccps_featneg;
945 struct dccp_feat_entry *entry;
946 int i = 2, ccids[2] = { -1, -1 };
947
948 /*
949 * Propagating CCIDs:
950 * 1) not useful to propagate CCID settings if this host advertises more
951 * than one CCID: the choice of CCID may still change - if this is
952 * the client, or if this is the server and the client sends
953 * singleton CCID values.
 954 * 2) since propagate_ccid changes the list, we defer changing
955 * the sorted list until after the traversal.
956 */
957 list_for_each_entry(entry, fn, node)
958 if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
959 ccids[entry->is_local] = entry->val.sp.vec[0];
960 while (i--)
961 if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
962 return -1;
963 dccp_feat_print_fnlist(fn);
964 return 0;
965}
419 966
420int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, 967/**
421 u8 *val, u8 len) 968 * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
969 * It is the server which resolves the dependencies once the CCID has been
970 * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
971 */
972int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
422{ 973{
423 u8 t; 974 struct list_head *fn = &dreq->dreq_featneg;
424 struct dccp_opt_pend *opt; 975 struct dccp_feat_entry *entry;
425 struct dccp_minisock *dmsk = dccp_msk(sk); 976 u8 is_local, ccid;
426 int found = 0;
427 int all_confirmed = 1;
428 977
429 dccp_feat_debug(type, feature, *val); 978 for (is_local = 0; is_local <= 1; is_local++) {
979 entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
430 980
431 /* locate our change request */ 981 if (entry != NULL && !entry->empty_confirm)
432 switch (type) { 982 ccid = entry->val.sp.vec[0];
433 case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; 983 else
434 case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; 984 ccid = dccp_feat_default_value(DCCPF_CCID);
435 default: DCCP_WARN("invalid type %d\n", type);
436 return 1;
437 985
986 if (dccp_feat_propagate_ccid(fn, ccid, is_local))
987 return -1;
438 } 988 }
439 /* XXX sanity check feature value */ 989 return 0;
990}
440 991
441 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { 992/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
442 if (!opt->dccpop_conf && opt->dccpop_type == t && 993static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
443 opt->dccpop_feat == feature) { 994{
444 found = 1; 995 u8 c, s;
445 dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
446 996
447 /* XXX do sanity check */ 997 for (s = 0; s < slen; s++)
998 for (c = 0; c < clen; c++)
999 if (servlist[s] == clilist[c])
1000 return servlist[s];
1001 return -1;
1002}
448 1003
449 opt->dccpop_conf = 1; 1004/**
1005 * dccp_feat_prefer - Move preferred entry to the start of array
1006 * Reorder the @array_len elements in @array so that @preferred_value comes
1007 * first. Returns >0 to indicate that @preferred_value does occur in @array.
1008 */
1009static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
1010{
1011 u8 i, does_occur = 0;
450 1012
451 /* We got a confirmation---change the option */ 1013 if (array != NULL) {
452 dccp_feat_update(sk, opt->dccpop_type, 1014 for (i = 0; i < array_len; i++)
453 opt->dccpop_feat, *val); 1015 if (array[i] == preferred_value) {
1016 array[i] = array[0];
1017 does_occur++;
1018 }
1019 if (does_occur)
1020 array[0] = preferred_value;
1021 }
1022 return does_occur;
1023}
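Worked example (values assumed): with array = {2, 3, 4} and preferred_value = 3, the matching slot is first overwritten with the old head ({2, 2, 4}), then the head is set to the preferred value, giving {3, 2, 4} and a return value of 1. If 3 did not occur in the array, nothing would be reordered and 0 returned.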
454 1024
455 /* XXX check the return value of dccp_feat_update */ 1025/**
456 break; 1026 * dccp_feat_reconcile - Reconcile SP preference lists
457 } 1027 * @fv: SP list to reconcile into
1028 * @arr: received SP preference list
1029 * @len: length of @arr in bytes
1030 * @is_server: whether this side is the server (and @fv is the server's list)
1031 * @reorder: whether to reorder the list in @fv after reconciling with @arr
 1032 * When successful, > 0 is returned and the reconciled list is in @fv.
1033 * A value of 0 means that negotiation failed (no shared entry).
1034 */
1035static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
1036 bool is_server, bool reorder)
1037{
1038 int rc;
458 1039
459 if (!opt->dccpop_conf) 1040 if (!fv->sp.vec || !arr) {
460 all_confirmed = 0; 1041 DCCP_CRIT("NULL feature value or array");
1042 return 0;
461 } 1043 }
462 1044
463 /* fix re-transmit timer */ 1045 if (is_server)
464 /* XXX gotta make sure that no option negotiation occurs during 1046 rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
465 * connection shutdown. Consider that the CLOSEREQ is sent and timer is 1047 else
466 * on. if all options are confirmed it might kill timer which should 1048 rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
467 * remain alive until close is received.
468 */
469 if (all_confirmed) {
470 dccp_pr_debug("clear feat negotiation timer %p\n", sk);
471 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
472 }
473 1049
474 if (!found) 1050 if (!reorder)
475 dccp_pr_debug("%s(%d, ...) never requested\n", 1051 return rc;
476 dccp_feat_typename(type), feature); 1052 if (rc < 0)
477 return 0; 1053 return 0;
478}
479 1054
480EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); 1055 /*
1056 * Reorder list: used for activating features and in dccp_insert_fn_opt.
1057 */
1058 return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
1059}
481 1060
482void dccp_feat_clean(struct dccp_minisock *dmsk) 1061/**
1062 * dccp_feat_change_recv - Process incoming ChangeL/R options
1063 * @fn: feature-negotiation list to update
1064 * @is_mandatory: whether the Change was preceded by a Mandatory option
1065 * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
1066 * @feat: one of %dccp_feature_numbers
1067 * @val: NN value or SP value/preference list
1068 * @len: length of @val in bytes
1069 * @server: whether this node is the server (1) or the client (0)
1070 */
1071static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
1072 u8 feat, u8 *val, u8 len, const bool server)
483{ 1073{
484 struct dccp_opt_pend *opt, *next; 1074 u8 defval, type = dccp_feat_type(feat);
1075 const bool local = (opt == DCCPO_CHANGE_R);
1076 struct dccp_feat_entry *entry;
1077 dccp_feat_val fval;
1078
1079 if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
1080 goto unknown_feature_or_value;
1081
1082 dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
1083
1084 /*
1085 * Negotiation of NN features: Change R is invalid, so there is no
1086 * simultaneous negotiation; hence we do not look up in the list.
1087 */
1088 if (type == FEAT_NN) {
1089 if (local || len > sizeof(fval.nn))
1090 goto unknown_feature_or_value;
485 1091
486 list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, 1092 /* 6.3.2: "The feature remote MUST accept any valid value..." */
487 dccpop_node) { 1093 fval.nn = dccp_decode_value_var(val, len);
488 BUG_ON(opt->dccpop_val == NULL); 1094 if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
489 kfree(opt->dccpop_val); 1095 goto unknown_feature_or_value;
490 1096
491 if (opt->dccpop_sc != NULL) { 1097 return dccp_feat_push_confirm(fn, feat, local, &fval);
492 BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); 1098 }
493 kfree(opt->dccpop_sc->dccpoc_val); 1099
494 kfree(opt->dccpop_sc); 1100 /*
1101 * Unidirectional/simultaneous negotiation of SP features (6.3.1)
1102 */
1103 entry = dccp_feat_list_lookup(fn, feat, local);
1104 if (entry == NULL) {
1105 /*
1106 * No particular preferences have been registered. We deal with
1107 * this situation by assuming that all valid values are equally
 1108 * acceptable, and by applying the following checks:
1109 * - if the peer's list is a singleton, we accept a valid value;
1110 * - if we are the server, we first try to see if the peer (the
1111 * client) advertises the default value. If yes, we use it,
1112 * otherwise we accept the preferred value;
1113 * - else if we are the client, we use the first list element.
1114 */
1115 if (dccp_feat_clone_sp_val(&fval, val, 1))
1116 return DCCP_RESET_CODE_TOO_BUSY;
1117
1118 if (len > 1 && server) {
1119 defval = dccp_feat_default_value(feat);
1120 if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
1121 fval.sp.vec[0] = defval;
1122 } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
1123 kfree(fval.sp.vec);
1124 goto unknown_feature_or_value;
1125 }
1126
1127 /* Treat unsupported CCIDs like invalid values */
1128 if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
1129 kfree(fval.sp.vec);
1130 goto not_valid_or_not_known;
495 } 1131 }
496 1132
497 kfree(opt); 1133 return dccp_feat_push_confirm(fn, feat, local, &fval);
1134
1135 } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
1136 return 0;
498 } 1137 }
499 INIT_LIST_HEAD(&dmsk->dccpms_pending);
500 1138
501 list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { 1139 if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
502 BUG_ON(opt == NULL); 1140 entry->empty_confirm = 0;
503 if (opt->dccpop_val != NULL) 1141 } else if (is_mandatory) {
504 kfree(opt->dccpop_val); 1142 return DCCP_RESET_CODE_MANDATORY_ERROR;
505 kfree(opt); 1143 } else if (entry->state == FEAT_INITIALISING) {
1144 /*
1145 * Failed simultaneous negotiation (server only): try to `save'
1146 * the connection by checking whether entry contains the default
1147 * value for @feat. If yes, send an empty Confirm to signal that
1148 * the received Change was not understood - which implies using
1149 * the default value.
1150 * If this also fails, we use Reset as the last resort.
1151 */
1152 WARN_ON(!server);
1153 defval = dccp_feat_default_value(feat);
1154 if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
1155 return DCCP_RESET_CODE_OPTION_ERROR;
1156 entry->empty_confirm = 1;
506 } 1157 }
507 INIT_LIST_HEAD(&dmsk->dccpms_conf); 1158 entry->needs_confirm = 1;
508} 1159 entry->needs_mandatory = 0;
1160 entry->state = FEAT_STABLE;
1161 return 0;
509 1162
510EXPORT_SYMBOL_GPL(dccp_feat_clean); 1163unknown_feature_or_value:
1164 if (!is_mandatory)
1165 return dccp_push_empty_confirm(fn, feat, local);
511 1166
512/* this is to be called only when a listening sock creates its child. It is 1167not_valid_or_not_known:
513 * assumed by the function---the confirm is not duplicated, but rather it is 1168 return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
514 * "passed on". 1169 : DCCP_RESET_CODE_OPTION_ERROR;
1170}
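Worked example (values assumed): a client Change R(CCID, {3, 2}) arriving in state LISTEN is reconciled against a server-side local list of {2, 3}; server priority selects 2, the entry is flagged needs_confirm and moves to FEAT_STABLE, and the server's next packet carries Confirm L(CCID, 2, {2, 3}).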
1171
1172/**
1173 * dccp_feat_confirm_recv - Process received Confirm options
1174 * @fn: feature-negotiation list to update
1175 * @is_mandatory: whether @opt was preceded by a Mandatory option
1176 * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
1177 * @feat: one of %dccp_feature_numbers
1178 * @val: NN value or SP value/preference list
1179 * @len: length of @val in bytes
1180 * @server: whether this node is server (1) or client (0)
515 */ 1181 */
516int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) 1182static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
1183 u8 feat, u8 *val, u8 len, const bool server)
517{ 1184{
518 struct dccp_minisock *olddmsk = dccp_msk(oldsk); 1185 u8 *plist, plen, type = dccp_feat_type(feat);
519 struct dccp_minisock *newdmsk = dccp_msk(newsk); 1186 const bool local = (opt == DCCPO_CONFIRM_R);
520 struct dccp_opt_pend *opt; 1187 struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
521 int rc = 0;
522 1188
523 INIT_LIST_HEAD(&newdmsk->dccpms_pending); 1189 dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
524 INIT_LIST_HEAD(&newdmsk->dccpms_conf);
525 1190
526 list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { 1191 if (entry == NULL) { /* nothing queued: ignore or handle error */
527 struct dccp_opt_pend *newopt; 1192 if (is_mandatory && type == FEAT_UNKNOWN)
528 /* copy the value of the option */ 1193 return DCCP_RESET_CODE_MANDATORY_ERROR;
529 u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
530 1194
531 if (val == NULL) 1195 if (!local && type == FEAT_NN) /* 6.3.2 */
532 goto out_clean; 1196 goto confirmation_failed;
533 1197 return 0;
534 newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); 1198 }
535 if (newopt == NULL) {
536 kfree(val);
537 goto out_clean;
538 }
539 1199
540 /* insert the option */ 1200 if (entry->state != FEAT_CHANGING) /* 6.6.2 */
541 newopt->dccpop_val = val; 1201 return 0;
542 list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
543 1202
544 /* XXX what happens with backlogs and multiple connections at 1203 if (len == 0) {
545 * once... 1204 if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
1205 goto confirmation_failed;
1206 /*
1207 * Empty Confirm during connection setup: this means reverting
1208 * to the `old' value, which in this case is the default. Since
1209 * we handle default values automatically when no other values
1210 * have been set, we revert to the old value by removing this
1211 * entry from the list.
546 */ 1212 */
547 /* the master socket no longer needs to worry about confirms */ 1213 dccp_feat_list_pop(entry);
548 opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ 1214 return 0;
1215 }
1216
1217 if (type == FEAT_NN) {
1218 if (len > sizeof(entry->val.nn))
1219 goto confirmation_failed;
1220
1221 if (entry->val.nn == dccp_decode_value_var(val, len))
1222 goto confirmation_succeeded;
1223
1224 DCCP_WARN("Bogus Confirm for non-existing value\n");
1225 goto confirmation_failed;
1226 }
549 1227
550 /* reset state for a new socket */ 1228 /*
551 opt->dccpop_conf = 0; 1229 * Parsing SP Confirms: the first element of @val is the preferred
1230 * SP value which the peer confirms, the remainder depends on @len.
 1231 * Note that only the confirmed value needs to be a valid SP value.
1232 */
1233 if (!dccp_feat_is_valid_sp_val(feat, *val))
1234 goto confirmation_failed;
1235
1236 if (len == 1) { /* peer didn't supply a preference list */
1237 plist = val;
1238 plen = len;
1239 } else { /* preferred value + preference list */
1240 plist = val + 1;
1241 plen = len - 1;
552 } 1242 }
553 1243
554 /* XXX not doing anything about the conf queue */ 1244 /* Check whether the peer got the reconciliation right (6.6.8) */
1245 if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
1246 DCCP_WARN("Confirm selected the wrong value %u\n", *val);
1247 return DCCP_RESET_CODE_OPTION_ERROR;
1248 }
1249 entry->val.sp.vec[0] = *val;
555 1250
556out: 1251confirmation_succeeded:
557 return rc; 1252 entry->state = FEAT_STABLE;
1253 return 0;
558 1254
559out_clean: 1255confirmation_failed:
560 dccp_feat_clean(newdmsk); 1256 DCCP_WARN("Confirmation failed\n");
561 rc = -ENOMEM; 1257 return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
562 goto out; 1258 : DCCP_RESET_CODE_OPTION_ERROR;
563} 1259}
564 1260
565EXPORT_SYMBOL_GPL(dccp_feat_clone); 1261/**
1262 * dccp_feat_handle_nn_established - Fast-path reception of NN options
1263 * @sk: socket of an established DCCP connection
1264 * @mandatory: whether @opt was preceded by a Mandatory option
1265 * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
1266 * @feat: NN number, one of %dccp_feature_numbers
1267 * @val: NN value
1268 * @len: length of @val in bytes
1269 * This function combines the functionality of change_recv/confirm_recv, with
1270 * the following differences (reset codes are the same):
1271 * - cleanup after receiving the Confirm;
1272 * - values are directly activated after successful parsing;
1273 * - deliberately restricted to NN features.
1274 * The restriction to NN features is essential since SP features can have non-
1275 * predictable outcomes (depending on the remote configuration), and are inter-
1276 * dependent (CCIDs for instance cause further dependencies).
1277 */
1278static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
1279 u8 feat, u8 *val, u8 len)
1280{
1281 struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
1282 const bool local = (opt == DCCPO_CONFIRM_R);
1283 struct dccp_feat_entry *entry;
1284 u8 type = dccp_feat_type(feat);
1285 dccp_feat_val fval;
1286
1287 dccp_feat_print_opt(opt, feat, val, len, mandatory);
1288
1289 /* Ignore non-mandatory unknown and non-NN features */
1290 if (type == FEAT_UNKNOWN) {
1291 if (local && !mandatory)
1292 return 0;
1293 goto fast_path_unknown;
1294 } else if (type != FEAT_NN) {
1295 return 0;
1296 }
1297
1298 /*
1299 * We don't accept empty Confirms, since in fast-path feature
1300 * negotiation the values are enabled immediately after sending
1301 * the Change option.
1302 * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
1303 */
1304 if (len == 0 || len > sizeof(fval.nn))
1305 goto fast_path_unknown;
1306
1307 if (opt == DCCPO_CHANGE_L) {
1308 fval.nn = dccp_decode_value_var(val, len);
1309 if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
1310 goto fast_path_unknown;
1311
1312 if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
1313 dccp_feat_activate(sk, feat, local, &fval))
1314 return DCCP_RESET_CODE_TOO_BUSY;
1315
1316 /* set the `Ack Pending' flag to piggyback a Confirm */
1317 inet_csk_schedule_ack(sk);
1318
1319 } else if (opt == DCCPO_CONFIRM_R) {
1320 entry = dccp_feat_list_lookup(fn, feat, local);
1321 if (entry == NULL || entry->state != FEAT_CHANGING)
1322 return 0;
1323
1324 fval.nn = dccp_decode_value_var(val, len);
1325 if (fval.nn != entry->val.nn) {
1326 DCCP_WARN("Bogus Confirm for non-existing value\n");
1327 goto fast_path_failed;
1328 }
1329
1330 /* It has been confirmed - so remove the entry */
1331 dccp_feat_list_pop(entry);
1332
1333 } else {
1334 DCCP_WARN("Received illegal option %u\n", opt);
1335 goto fast_path_failed;
1336 }
1337 return 0;
1338
1339fast_path_unknown:
1340 if (!mandatory)
1341 return dccp_push_empty_confirm(fn, feat, local);
1342
1343fast_path_failed:
1344 return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
1345 : DCCP_RESET_CODE_OPTION_ERROR;
1346}
566 1347
567static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, 1348/**
568 u8 *val, u8 len) 1349 * dccp_feat_parse_options - Process Feature-Negotiation Options
1350 * @sk: for general use and used by the client during connection setup
1351 * @dreq: used by the server during connection setup
1352 * @mandatory: whether @opt was preceded by a Mandatory option
1353 * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
1354 * @feat: one of %dccp_feature_numbers
1355 * @val: value contents of @opt
1356 * @len: length of @val in bytes
1357 * Returns 0 on success, a Reset code for ending the connection otherwise.
1358 */
1359int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
1360 u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
569{ 1361{
570 int rc = -ENOMEM; 1362 struct dccp_sock *dp = dccp_sk(sk);
571 u8 *copy = kmemdup(val, len, GFP_KERNEL); 1363 struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
1364 bool server = false;
572 1365
573 if (copy != NULL) { 1366 switch (sk->sk_state) {
574 rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); 1367 /*
575 if (rc) 1368 * Negotiation during connection setup
576 kfree(copy); 1369 */
1370 case DCCP_LISTEN:
1371 server = true; /* fall through */
1372 case DCCP_REQUESTING:
1373 switch (opt) {
1374 case DCCPO_CHANGE_L:
1375 case DCCPO_CHANGE_R:
1376 return dccp_feat_change_recv(fn, mandatory, opt, feat,
1377 val, len, server);
1378 case DCCPO_CONFIRM_R:
1379 case DCCPO_CONFIRM_L:
1380 return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
1381 val, len, server);
1382 }
1383 break;
1384 /*
1385 * Support for exchanging NN options on an established connection
1386 * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2)
1387 */
1388 case DCCP_OPEN:
1389 case DCCP_PARTOPEN:
1390 return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
1391 val, len);
577 } 1392 }
578 return rc; 1393 return 0; /* ignore FN options in all other states */
579} 1394}
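For context, the dispatch into this entry point lives in the option parser (net/dccp/options.c, which this patch also changes); modulo error handling it behaves like the illustrative fragment below, where value[0] carries the feature number:

/* Illustrative dispatch for the option types Change L/R and Confirm L/R */
static int example_dispatch_fn_option(struct sock *sk,
                                      struct dccp_request_sock *dreq,
                                      u8 mandatory, u8 opt, u8 *value, u8 len)
{
        if (len == 0)   /* at least the feature number must be present */
                return DCCP_RESET_CODE_OPTION_ERROR;

        return dccp_feat_parse_options(sk, dreq, mandatory, opt,
                                       value[0], value + 1, len - 1);
}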
580 1395
581int dccp_feat_init(struct dccp_minisock *dmsk) 1396/**
1397 * dccp_feat_init - Seed feature negotiation with host-specific defaults
1398 * This initialises global defaults, depending on the value of the sysctls.
1399 * These can later be overridden by registering changes via setsockopt calls.
1400 * The last link in the chain is finalise_settings, to make sure that between
1401 * here and the start of actual feature negotiation no inconsistencies enter.
1402 *
1403 * All features not appearing below use either defaults or are otherwise
1404 * later adjusted through dccp_feat_finalise_settings().
1405 */
1406int dccp_feat_init(struct sock *sk)
582{ 1407{
1408 struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
1409 u8 on = 1, off = 0;
583 int rc; 1410 int rc;
1411 struct {
1412 u8 *val;
1413 u8 len;
1414 } tx, rx;
1415
1416 /* Non-negotiable (NN) features */
1417 rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
1418 sysctl_dccp_sequence_window);
1419 if (rc)
1420 return rc;
584 1421
585 INIT_LIST_HEAD(&dmsk->dccpms_pending); 1422 /* Server-priority (SP) features */
586 INIT_LIST_HEAD(&dmsk->dccpms_conf); 1423
1424 /* Advertise that short seqnos are not supported (7.6.1) */
1425 rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
1426 if (rc)
1427 return rc;
587 1428
588 /* CCID L */ 1429 /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
589 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, 1430 rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
590 &dmsk->dccpms_tx_ccid, 1);
591 if (rc) 1431 if (rc)
592 goto out; 1432 return rc;
1433
1434 /*
1435 * We advertise the available list of CCIDs and reorder according to
1436 * preferences, to avoid failure resulting from negotiating different
1437 * singleton values (which always leads to failure).
1438 * These settings can still (later) be overridden via sockopts.
1439 */
1440 if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
1441 ccid_get_builtin_ccids(&rx.val, &rx.len))
1442 return -ENOBUFS;
1443
1444 /* Pre-load all CCID modules that are going to be advertised */
1445 rc = -EUNATCH;
1446 if (ccid_request_modules(tx.val, tx.len))
1447 goto free_ccid_lists;
1448
1449 if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
1450 !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
1451 goto free_ccid_lists;
593 1452
594 /* CCID R */ 1453 rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
595 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
596 &dmsk->dccpms_rx_ccid, 1);
597 if (rc) 1454 if (rc)
598 goto out; 1455 goto free_ccid_lists;
599 1456
600 /* Ack ratio */ 1457 rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
601 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, 1458
602 &dmsk->dccpms_ack_ratio, 1); 1459free_ccid_lists:
603out: 1460 kfree(tx.val);
1461 kfree(rx.val);
604 return rc; 1462 return rc;
605} 1463}
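As an example of the reordering step above (sysctl values assumed): with a built-in list of {2, 3} and sysctl_dccp_tx_ccid = 3, dccp_feat_prefer() turns the advertised TX list into {3, 2}. A sysctl value naming no built-in CCID makes dccp_feat_prefer() return 0, so initialisation fails with -EUNATCH.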
606 1464
607EXPORT_SYMBOL_GPL(dccp_feat_init); 1465int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
608
609#ifdef CONFIG_IP_DCCP_DEBUG
610const char *dccp_feat_typename(const u8 type)
611{ 1466{
612 switch(type) { 1467 struct dccp_sock *dp = dccp_sk(sk);
613 case DCCPO_CHANGE_L: return("ChangeL"); 1468 struct dccp_feat_entry *cur, *next;
614 case DCCPO_CONFIRM_L: return("ConfirmL"); 1469 int idx;
615 case DCCPO_CHANGE_R: return("ChangeR"); 1470 dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
616 case DCCPO_CONFIRM_R: return("ConfirmR"); 1471 [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
617 /* the following case must not appear in feature negotation */ 1472 };
618 default: dccp_pr_debug("unknown type %d [BUG!]\n", type); 1473
1474 list_for_each_entry(cur, fn_list, node) {
1475 /*
1476 * An empty Confirm means that either an unknown feature type
1477 * or an invalid value was present. In the first case there is
1478 * nothing to activate, in the other the default value is used.
1479 */
1480 if (cur->empty_confirm)
1481 continue;
1482
1483 idx = dccp_feat_index(cur->feat_num);
1484 if (idx < 0) {
1485 DCCP_BUG("Unknown feature %u", cur->feat_num);
1486 goto activation_failed;
1487 }
1488 if (cur->state != FEAT_STABLE) {
1489 DCCP_CRIT("Negotiation of %s %s failed in state %s",
1490 cur->is_local ? "local" : "remote",
1491 dccp_feat_fname(cur->feat_num),
1492 dccp_feat_sname[cur->state]);
1493 goto activation_failed;
1494 }
1495 fvals[idx][cur->is_local] = &cur->val;
619 } 1496 }
620 return NULL;
621}
622 1497
623EXPORT_SYMBOL_GPL(dccp_feat_typename); 1498 /*
1499 * Activate in decreasing order of index, so that the CCIDs are always
1500 * activated as the last feature. This avoids the case where a CCID
1501 * relies on the initialisation of one or more features that it depends
1502 * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
1503 */
1504 for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
1505 if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
1506 __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
1507 DCCP_CRIT("Could not activate %d", idx);
1508 goto activation_failed;
1509 }
624 1510
625const char *dccp_feat_name(const u8 feat) 1511 /* Clean up Change options which have been confirmed already */
626{ 1512 list_for_each_entry_safe(cur, next, fn_list, node)
627 static const char *feature_names[] = { 1513 if (!cur->needs_confirm)
628 [DCCPF_RESERVED] = "Reserved", 1514 dccp_feat_list_pop(cur);
629 [DCCPF_CCID] = "CCID",
630 [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
631 [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
632 [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
633 [DCCPF_ACK_RATIO] = "Ack Ratio",
634 [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
635 [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
636 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
637 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
638 };
639 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
640 return feature_names[DCCPF_RESERVED];
641 1515
642 if (feat >= DCCPF_MIN_CCID_SPECIFIC) 1516 dccp_pr_debug("Activation OK\n");
643 return "CCID-specific"; 1517 return 0;
644 1518
645 return feature_names[feat]; 1519activation_failed:
1520 /*
1521 * We clean up everything that may have been allocated, since
1522 * it is difficult to track at which stage negotiation failed.
 1523 * This is ok, since the deallocation routines below are robust
 1524 * against NULL arguments.
1525 */
1526 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
1527 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
1528 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
1529 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
1530 dp->dccps_hc_rx_ackvec = NULL;
1531 return -1;
646} 1532}
647
648EXPORT_SYMBOL_GPL(dccp_feat_name);
649#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index e272222c7ace..2217066e22d7 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -3,38 +3,134 @@
3/* 3/*
4 * net/dccp/feat.h 4 * net/dccp/feat.h
5 * 5 *
6 * An implementation of the DCCP protocol 6 * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
7 * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
7 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 8 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
8 * 9 *
9 * This program is free software; you can redistribute it and/or modify it 10 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as 11 * under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation. 12 * published by the Free Software Foundation.
12 */ 13 */
13
14#include <linux/types.h> 14#include <linux/types.h>
15#include "dccp.h" 15#include "dccp.h"
16 16
17#ifdef CONFIG_IP_DCCP_DEBUG 17/*
18extern const char *dccp_feat_typename(const u8 type); 18 * Known limit values
19extern const char *dccp_feat_name(const u8 feat); 19 */
20/* Ack Ratio takes 2-byte integer values (11.3) */
21#define DCCPF_ACK_RATIO_MAX 0xFFFF
22/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
23#define DCCPF_SEQ_WMIN 32
24#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
25/* Maximum number of SP values that fit in a single (Confirm) option */
26#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
27
28enum dccp_feat_type {
29 FEAT_AT_RX = 1, /* located at RX side of half-connection */
30 FEAT_AT_TX = 2, /* located at TX side of half-connection */
31 FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
32 FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
33 FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
34};
35
36enum dccp_feat_state {
37 FEAT_DEFAULT = 0, /* using default values from 6.4 */
38 FEAT_INITIALISING, /* feature is being initialised */
39 FEAT_CHANGING, /* Change sent but not confirmed yet */
40 FEAT_UNSTABLE, /* local modification in state CHANGING */
41 FEAT_STABLE /* both ends (think they) agree */
42};
20 43
21static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) 44/**
45 * dccp_feat_val - Container for SP or NN feature values
46 * @nn: single NN value
47 * @sp.vec: single SP value plus optional preference list
48 * @sp.len: length of @sp.vec in bytes
49 */
50typedef union {
51 u64 nn;
52 struct {
53 u8 *vec;
54 u8 len;
55 } sp;
56} dccp_feat_val;
57
58/**
 59 * struct dccp_feat_entry - Data structure to perform feature negotiation
60 * @feat_num: one of %dccp_feature_numbers
61 * @val: feature's current value (SP features may have preference list)
62 * @state: feature's current state
63 * @needs_mandatory: whether Mandatory options should be sent
64 * @needs_confirm: whether to send a Confirm instead of a Change
65 * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
 66 * @is_local: whether the feature is located at the local (1) or remote (0) side
67 * @node: list pointers, entries arranged in FIFO order
68 */
69struct dccp_feat_entry {
70 u8 feat_num;
71 dccp_feat_val val;
72 enum dccp_feat_state state:8;
73 bool needs_mandatory:1,
74 needs_confirm:1,
75 empty_confirm:1,
76 is_local:1;
77
78 struct list_head node;
79};
80
81static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
22{ 82{
23 dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), 83 if (entry->needs_confirm)
24 dccp_feat_name(feat), feat, val); 84 return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
85 return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
25} 86}
26#else
27#define dccp_feat_debug(type, feat, val)
28#endif /* CONFIG_IP_DCCP_DEBUG */
29
30extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
31 u8 *val, u8 len, gfp_t gfp);
32extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
33 u8 *val, u8 len);
34extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
35 u8 *val, u8 len);
36extern void dccp_feat_clean(struct dccp_minisock *dmsk);
37extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
38extern int dccp_feat_init(struct dccp_minisock *dmsk);
39 87
88/**
89 * struct ccid_dependency - Track changes resulting from choosing a CCID
90 * @dependent_feat: one of %dccp_feature_numbers
91 * @is_local: local (1) or remote (0) @dependent_feat
92 * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
93 * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
94 */
95struct ccid_dependency {
96 u8 dependent_feat;
97 bool is_local:1,
98 is_mandatory:1;
99 u8 val;
100};
101
102/*
103 * Sysctls to seed defaults for feature negotiation
104 */
105extern unsigned long sysctl_dccp_sequence_window;
106extern int sysctl_dccp_rx_ccid;
107extern int sysctl_dccp_tx_ccid;
108
109extern int dccp_feat_init(struct sock *sk);
110extern void dccp_feat_initialise_sysctls(void);
111extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
112 u8 const *list, u8 len);
113extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
114extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
115 u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
116extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
117
118/*
119 * Encoding variable-length options and their maximum length.
120 *
121 * This affects NN options (SP options are all u8) and other variable-length
 122 * options (see table 3 in RFC 4340). The limit is currently given by the
 123 * Sequence Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option;
 124 * all other options consume less than 6 bytes (timestamps are 4 bytes).
125 * When updating this constant (e.g. due to new internet drafts / RFCs), make
126 * sure that you also update all code which refers to it.
127 */
128#define DCCP_OPTVAL_MAXLEN 6
129
130extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
131extern u64 dccp_decode_value_var(const u8 *bf, const u8 len);
132
133extern int dccp_insert_option_mandatory(struct sk_buff *skb);
134extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
135 u8 *val, u8 len, bool repeat_first);
40#endif /* _DCCP_FEAT_H */ 136#endif /* _DCCP_FEAT_H */
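
The DCCP_OPTVAL_MAXLEN limit above is consumed by dccp_encode_value_var()/dccp_decode_value_var(), whose 48-bit-capable bodies appear in the options.c hunk further down this page. As a quick sanity check of that codec, the following self-contained userspace sketch performs the same big-endian round trip; only the byte layout mirrors the kernel code, the surrounding demo (function names, main, sample value) is illustrative:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define OPTVAL_MAXLEN 6 /* mirrors DCCP_OPTVAL_MAXLEN above */

    /* Most significant byte first, as in dccp_encode_value_var() */
    static void encode_value_var(uint64_t value, uint8_t *to, uint8_t len)
    {
            if (len >= OPTVAL_MAXLEN)
                    *to++ = (value >> 40) & 0xFF;
            if (len > 4)
                    *to++ = (value >> 32) & 0xFF;
            if (len > 3)
                    *to++ = (value >> 24) & 0xFF;
            if (len > 2)
                    *to++ = (value >> 16) & 0xFF;
            if (len > 1)
                    *to++ = (value >> 8) & 0xFF;
            if (len > 0)
                    *to = value & 0xFF;
    }

    /* Inverse operation, as in dccp_decode_value_var() */
    static uint64_t decode_value_var(const uint8_t *bf, uint8_t len)
    {
            uint64_t value = 0;

            if (len >= OPTVAL_MAXLEN)
                    value |= (uint64_t)*bf++ << 40;
            if (len > 4)
                    value |= (uint64_t)*bf++ << 32;
            if (len > 3)
                    value |= (uint64_t)*bf++ << 24;
            if (len > 2)
                    value |= (uint64_t)*bf++ << 16;
            if (len > 1)
                    value |= (uint64_t)*bf++ << 8;
            if (len > 0)
                    value |= *bf;
            return value;
    }

    int main(void)
    {
            uint64_t seq_win = 0x3FFFFFFFFFFFull; /* 48 bits, cf. DCCPF_SEQ_WMAX */
            uint8_t buf[OPTVAL_MAXLEN];

            encode_value_var(seq_win, buf, sizeof(buf));
            assert(decode_value_var(buf, sizeof(buf)) == seq_win);
            printf("round trip OK: %llx\n", (unsigned long long)seq_win);
            return 0;
    }
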
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 779d0ed9ae94..df0e6714aa11 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -159,13 +159,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
160} 160}
161 161
162static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) 162static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
163{ 163{
164 struct dccp_sock *dp = dccp_sk(sk); 164 struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
165 165
166 if (dccp_msk(sk)->dccpms_send_ack_vector) 166 if (av == NULL)
167 dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, 167 return;
168 DCCP_SKB_CB(skb)->dccpd_ack_seq); 168 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
169 dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
170 dccp_ackvec_input(av, skb);
169} 171}
170 172
171static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) 173static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -364,22 +366,13 @@ discard:
364int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 366int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
365 const struct dccp_hdr *dh, const unsigned len) 367 const struct dccp_hdr *dh, const unsigned len)
366{ 368{
367 struct dccp_sock *dp = dccp_sk(sk);
368
369 if (dccp_check_seqno(sk, skb)) 369 if (dccp_check_seqno(sk, skb))
370 goto discard; 370 goto discard;
371 371
372 if (dccp_parse_options(sk, NULL, skb)) 372 if (dccp_parse_options(sk, NULL, skb))
373 return 1; 373 return 1;
374 374
375 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 375 dccp_handle_ackvec_processing(sk, skb);
376 dccp_event_ack_recv(sk, skb);
377
378 if (dccp_msk(sk)->dccpms_send_ack_vector &&
379 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
380 DCCP_SKB_CB(skb)->dccpd_seq,
381 DCCP_ACKVEC_STATE_RECEIVED))
382 goto discard;
383 dccp_deliver_input_to_ccids(sk, skb); 376 dccp_deliver_input_to_ccids(sk, skb);
384 377
385 return __dccp_rcv_established(sk, skb, dh, len); 378 return __dccp_rcv_established(sk, skb, dh, len);
@@ -421,40 +414,33 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
421 goto out_invalid_packet; 414 goto out_invalid_packet;
422 } 415 }
423 416
417 /*
418 * If option processing (Step 8) failed, return 1 here so that
419 * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
420 * the option type and is set in dccp_parse_options().
421 */
424 if (dccp_parse_options(sk, NULL, skb)) 422 if (dccp_parse_options(sk, NULL, skb))
425 goto out_invalid_packet; 423 return 1;
426 424
427 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ 425 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
428 if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) 426 if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
429 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - 427 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
430 dp->dccps_options_received.dccpor_timestamp_echo)); 428 dp->dccps_options_received.dccpor_timestamp_echo));
431 429
432 if (dccp_msk(sk)->dccpms_send_ack_vector &&
433 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
434 DCCP_SKB_CB(skb)->dccpd_seq,
435 DCCP_ACKVEC_STATE_RECEIVED))
436 goto out_invalid_packet; /* FIXME: change error code */
437
438 /* Stop the REQUEST timer */ 430 /* Stop the REQUEST timer */
439 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 431 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
440 WARN_ON(sk->sk_send_head == NULL); 432 WARN_ON(sk->sk_send_head == NULL);
441 kfree_skb(sk->sk_send_head); 433 kfree_skb(sk->sk_send_head);
442 sk->sk_send_head = NULL; 434 sk->sk_send_head = NULL;
443 435
444 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
445 dccp_update_gsr(sk, dp->dccps_isr);
446 /* 436 /*
447 * SWL and AWL are initially adjusted so that they are not less than 437 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
448 * the initial Sequence Numbers received and sent, respectively: 438 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
449 * SWL := max(GSR + 1 - floor(W/4), ISR), 439 * is done as part of activating the feature values below, since
450 * AWL := max(GSS - W' + 1, ISS). 440 * these settings depend on the local/remote Sequence Window
451 * These adjustments MUST be applied only at the beginning of the 441 * features, which were undefined or not confirmed until now.
452 * connection.
453 *
454 * AWL was adjusted in dccp_v4_connect -acme
455 */ 442 */
456 dccp_set_seqno(&dp->dccps_swl, 443 dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
457 max48(dp->dccps_swl, dp->dccps_isr));
458 444
459 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 445 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
460 446
@@ -475,6 +461,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
475 */ 461 */
476 dccp_set_state(sk, DCCP_PARTOPEN); 462 dccp_set_state(sk, DCCP_PARTOPEN);
477 463
464 /*
465 * If feature negotiation was successful, activate features now;
466 * an activation failure means that this host could not activate
 467 * one or more features (e.g. insufficient memory), which would
468 * leave at least one feature in an undefined state.
469 */
470 if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
471 goto unable_to_proceed;
472
478 /* Make sure socket is routed, for correct metrics. */ 473 /* Make sure socket is routed, for correct metrics. */
479 icsk->icsk_af_ops->rebuild_header(sk); 474 icsk->icsk_af_ops->rebuild_header(sk);
480 475
@@ -509,6 +504,16 @@ out_invalid_packet:
509 /* dccp_v4_do_rcv will send a reset */ 504 /* dccp_v4_do_rcv will send a reset */
510 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 505 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
511 return 1; 506 return 1;
507
508unable_to_proceed:
509 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
510 /*
511 * We mark this socket as no longer usable, so that the loop in
512 * dccp_sendmsg() terminates and the application gets notified.
513 */
514 dccp_set_state(sk, DCCP_CLOSED);
515 sk->sk_err = ECOMM;
516 return 1;
512} 517}
513 518
514static int dccp_rcv_respond_partopen_state_process(struct sock *sk, 519static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
@@ -590,8 +595,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
590 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 595 if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
591 skb) < 0) 596 skb) < 0)
592 return 1; 597 return 1;
593
594 /* FIXME: do congestion control initialization */
595 goto discard; 598 goto discard;
596 } 599 }
597 if (dh->dccph_type == DCCP_PKT_RESET) 600 if (dh->dccph_type == DCCP_PKT_RESET)
@@ -600,30 +603,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
600 /* Caller (dccp_v4_do_rcv) will send Reset */ 603 /* Caller (dccp_v4_do_rcv) will send Reset */
601 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 604 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
602 return 1; 605 return 1;
606 } else if (sk->sk_state == DCCP_CLOSED) {
607 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
608 return 1;
603 } 609 }
604 610
605 if (sk->sk_state != DCCP_REQUESTING) { 611 /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
606 if (dccp_check_seqno(sk, skb)) 612 if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
607 goto discard; 613 goto discard;
608
609 /*
610 * Step 8: Process options and mark acknowledgeable
611 */
612 if (dccp_parse_options(sk, NULL, skb))
613 return 1;
614
615 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
616 dccp_event_ack_recv(sk, skb);
617
618 if (dccp_msk(sk)->dccpms_send_ack_vector &&
619 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
620 DCCP_SKB_CB(skb)->dccpd_seq,
621 DCCP_ACKVEC_STATE_RECEIVED))
622 goto discard;
623 614
624 dccp_deliver_input_to_ccids(sk, skb); 615 /*
616 * Step 7: Check for unexpected packet types
617 * If (S.is_server and P.type == Response)
618 * or (S.is_client and P.type == Request)
619 * or (S.state == RESPOND and P.type == Data),
620 * Send Sync packet acknowledging P.seqno
621 * Drop packet and return
622 */
623 if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
624 dh->dccph_type == DCCP_PKT_RESPONSE) ||
625 (dp->dccps_role == DCCP_ROLE_CLIENT &&
626 dh->dccph_type == DCCP_PKT_REQUEST) ||
627 (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
628 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
629 goto discard;
625 } 630 }
626 631
632 /* Step 8: Process options */
633 if (dccp_parse_options(sk, NULL, skb))
634 return 1;
635
627 /* 636 /*
628 * Step 9: Process Reset 637 * Step 9: Process Reset
629 * If P.type == Reset, 638 * If P.type == Reset,
@@ -631,44 +640,22 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
631 * S.state := TIMEWAIT 640 * S.state := TIMEWAIT
632 * Set TIMEWAIT timer 641 * Set TIMEWAIT timer
633 * Drop packet and return 642 * Drop packet and return
634 */ 643 */
635 if (dh->dccph_type == DCCP_PKT_RESET) { 644 if (dh->dccph_type == DCCP_PKT_RESET) {
636 dccp_rcv_reset(sk, skb); 645 dccp_rcv_reset(sk, skb);
637 return 0; 646 return 0;
638 /* 647 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
639 * Step 7: Check for unexpected packet types
640 * If (S.is_server and P.type == Response)
641 * or (S.is_client and P.type == Request)
642 * or (S.state == RESPOND and P.type == Data),
643 * Send Sync packet acknowledging P.seqno
644 * Drop packet and return
645 */
646 } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
647 dh->dccph_type == DCCP_PKT_RESPONSE) ||
648 (dp->dccps_role == DCCP_ROLE_CLIENT &&
649 dh->dccph_type == DCCP_PKT_REQUEST) ||
650 (sk->sk_state == DCCP_RESPOND &&
651 dh->dccph_type == DCCP_PKT_DATA)) {
652 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
653 goto discard;
654 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
655 if (dccp_rcv_closereq(sk, skb)) 648 if (dccp_rcv_closereq(sk, skb))
656 return 0; 649 return 0;
657 goto discard; 650 goto discard;
658 } else if (dh->dccph_type == DCCP_PKT_CLOSE) { 651 } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
659 if (dccp_rcv_close(sk, skb)) 652 if (dccp_rcv_close(sk, skb))
660 return 0; 653 return 0;
661 goto discard; 654 goto discard;
662 } 655 }
663 656
664 switch (sk->sk_state) { 657 switch (sk->sk_state) {
665 case DCCP_CLOSED:
666 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
667 return 1;
668
669 case DCCP_REQUESTING: 658 case DCCP_REQUESTING:
670 /* FIXME: do congestion control initialization */
671
672 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); 659 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
673 if (queued >= 0) 660 if (queued >= 0)
674 return queued; 661 return queued;
@@ -676,8 +663,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
676 __kfree_skb(skb); 663 __kfree_skb(skb);
677 return 0; 664 return 0;
678 665
679 case DCCP_RESPOND:
680 case DCCP_PARTOPEN: 666 case DCCP_PARTOPEN:
667 /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
668 dccp_handle_ackvec_processing(sk, skb);
669 dccp_deliver_input_to_ccids(sk, skb);
670 /* fall through */
671 case DCCP_RESPOND:
681 queued = dccp_rcv_respond_partopen_state_process(sk, skb, 672 queued = dccp_rcv_respond_partopen_state_process(sk, skb,
682 dh, len); 673 dh, len);
683 break; 674 break;
@@ -716,16 +707,7 @@ u32 dccp_sample_rtt(struct sock *sk, long delta)
716 /* dccpor_elapsed_time is either zeroed out or set and > 0 */ 707 /* dccpor_elapsed_time is either zeroed out or set and > 0 */
717 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; 708 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
718 709
719 if (unlikely(delta <= 0)) { 710 return dccp_sane_rtt(delta);
720 DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
721 return DCCP_SANE_RTT_MIN;
722 }
723 if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
724 DCCP_WARN("RTT sample %ld too large, using max\n", delta);
725 return DCCP_SANE_RTT_MAX;
726 }
727
728 return delta;
729} 711}
730 712
731EXPORT_SYMBOL_GPL(dccp_sample_rtt); 713EXPORT_SYMBOL_GPL(dccp_sample_rtt);
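
The open-coded clamping removed here has moved into dccp_sane_rtt(), defined in net/dccp/dccp.h and not shown on this page. A minimal userspace sketch of the intended behaviour; the MIN/MAX bounds of 100 microseconds and 3 seconds, and the helper name, are assumptions taken from dccp.h:

    #include <stdio.h>

    /* Bounds assumed from net/dccp/dccp.h, in microseconds */
    #define SANE_RTT_MIN    100
    #define SANE_RTT_MAX    (3 * 1000000)

    /* Sketch of dccp_sane_rtt(): clamp a raw sample into a usable range */
    static long sane_rtt(long usecs)
    {
            if (usecs <= 0) {
                    fprintf(stderr, "unusable RTT sample %ld, using min\n", usecs);
                    return SANE_RTT_MIN;
            }
            if (usecs > SANE_RTT_MAX) {
                    fprintf(stderr, "RTT sample %ld too large, using max\n", usecs);
                    return SANE_RTT_MAX;
            }
            return usecs;
    }

    int main(void)
    {
            /* -5 -> 100, 2500 stays, 9000000 -> 3000000 */
            printf("%ld %ld %ld\n", sane_rtt(-5), sane_rtt(2500), sane_rtt(9000000));
            return 0;
    }
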
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 882c5c4de69e..b623f6b25482 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -545,6 +545,7 @@ out:
545 545
546static void dccp_v4_reqsk_destructor(struct request_sock *req) 546static void dccp_v4_reqsk_destructor(struct request_sock *req)
547{ 547{
548 dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
548 kfree(inet_rsk(req)->opt); 549 kfree(inet_rsk(req)->opt);
549} 550}
550 551
@@ -595,7 +596,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
595 if (req == NULL) 596 if (req == NULL)
596 goto drop; 597 goto drop;
597 598
598 dccp_reqsk_init(req, skb); 599 if (dccp_reqsk_init(req, dccp_sk(sk), skb))
600 goto drop_and_free;
599 601
600 dreq = dccp_rsk(req); 602 dreq = dccp_rsk(req);
601 if (dccp_parse_options(sk, dreq, skb)) 603 if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 5e1ee0da2c40..ad6212e00435 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -302,6 +302,7 @@ done:
302 302
303static void dccp_v6_reqsk_destructor(struct request_sock *req) 303static void dccp_v6_reqsk_destructor(struct request_sock *req)
304{ 304{
305 dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
305 if (inet6_rsk(req)->pktopts != NULL) 306 if (inet6_rsk(req)->pktopts != NULL)
306 kfree_skb(inet6_rsk(req)->pktopts); 307 kfree_skb(inet6_rsk(req)->pktopts);
307} 308}
@@ -424,7 +425,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
424 if (req == NULL) 425 if (req == NULL)
425 goto drop; 426 goto drop;
426 427
427 dccp_reqsk_init(req, skb); 428 if (dccp_reqsk_init(req, dccp_sk(sk), skb))
429 goto drop_and_free;
428 430
429 dreq = dccp_rsk(req); 431 dreq = dccp_rsk(req);
430 if (dccp_parse_options(sk, dreq, skb)) 432 if (dccp_parse_options(sk, dreq, skb))
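
Both the IPv4 and IPv6 reqsk destructors now purge the per-request feature-negotiation list via dccp_feat_list_purge(), whose body lives in net/dccp/feat.c rather than on this page. Conceptually it must free each list entry plus any SP preference vector the entry owns; a hedged sketch of that idea, where dccp_feat_type() is an assumed feat.c-internal helper classifying SP vs. NN features:

    /* Sketch only -- the real dccp_feat_list_purge() is in net/dccp/feat.c */
    #include <linux/list.h>
    #include <linux/slab.h>
    #include "feat.h"

    static void feat_list_purge_sketch(struct list_head *fn_list)
    {
            struct dccp_feat_entry *entry, *next;

            list_for_each_entry_safe(entry, next, fn_list, node) {
                    if (dccp_feat_type(entry->feat_num) == FEAT_SP)
                            kfree(entry->val.sp.vec);  /* SP preference list */
                    kfree(entry);                      /* the entry itself */
            }
            INIT_LIST_HEAD(fn_list);
    }
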
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index b2804e2d1b8c..f4d9c8f60ede 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -42,16 +42,6 @@ struct inet_timewait_death_row dccp_death_row = {
42 42
43EXPORT_SYMBOL_GPL(dccp_death_row); 43EXPORT_SYMBOL_GPL(dccp_death_row);
44 44
45void dccp_minisock_init(struct dccp_minisock *dmsk)
46{
47 dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
48 dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid;
49 dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid;
50 dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio;
51 dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
52 dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count;
53}
54
55void dccp_time_wait(struct sock *sk, int state, int timeo) 45void dccp_time_wait(struct sock *sk, int state, int timeo)
56{ 46{
57 struct inet_timewait_sock *tw = NULL; 47 struct inet_timewait_sock *tw = NULL;
@@ -112,10 +102,9 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
112 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 102 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
113 103
114 if (newsk != NULL) { 104 if (newsk != NULL) {
115 const struct dccp_request_sock *dreq = dccp_rsk(req); 105 struct dccp_request_sock *dreq = dccp_rsk(req);
116 struct inet_connection_sock *newicsk = inet_csk(newsk); 106 struct inet_connection_sock *newicsk = inet_csk(newsk);
117 struct dccp_sock *newdp = dccp_sk(newsk); 107 struct dccp_sock *newdp = dccp_sk(newsk);
118 struct dccp_minisock *newdmsk = dccp_msk(newsk);
119 108
120 newdp->dccps_role = DCCP_ROLE_SERVER; 109 newdp->dccps_role = DCCP_ROLE_SERVER;
121 newdp->dccps_hc_rx_ackvec = NULL; 110 newdp->dccps_hc_rx_ackvec = NULL;
@@ -125,65 +114,32 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
125 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; 114 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
126 newicsk->icsk_rto = DCCP_TIMEOUT_INIT; 115 newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
127 116
128 if (dccp_feat_clone(sk, newsk)) 117 INIT_LIST_HEAD(&newdp->dccps_featneg);
129 goto out_free;
130
131 if (newdmsk->dccpms_send_ack_vector) {
132 newdp->dccps_hc_rx_ackvec =
133 dccp_ackvec_alloc(GFP_ATOMIC);
134 if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
135 goto out_free;
136 }
137
138 newdp->dccps_hc_rx_ccid =
139 ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
140 newsk, GFP_ATOMIC);
141 newdp->dccps_hc_tx_ccid =
142 ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
143 newsk, GFP_ATOMIC);
144 if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
145 newdp->dccps_hc_tx_ccid == NULL)) {
146 dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
147 ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
148 ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
149out_free:
150 /* It is still raw copy of parent, so invalidate
151 * destructor and make plain sk_free() */
152 newsk->sk_destruct = NULL;
153 sk_free(newsk);
154 return NULL;
155 }
156
157 /* 118 /*
158 * Step 3: Process LISTEN state 119 * Step 3: Process LISTEN state
159 * 120 *
160 * Choose S.ISS (initial seqno) or set from Init Cookies 121 * Choose S.ISS (initial seqno) or set from Init Cookies
161 * Initialize S.GAR := S.ISS 122 * Initialize S.GAR := S.ISS
162 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies 123 * Set S.ISR, S.GSR from packet (or Init Cookies)
124 *
125 * Setting AWL/AWH and SWL/SWH happens as part of the feature
126 * activation below, as these windows all depend on the local
127 * and remote Sequence Window feature values (7.5.2).
163 */ 128 */
164 129 newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
165 /* See dccp_v4_conn_request */ 130 newdp->dccps_gar = newdp->dccps_iss;
166 newdmsk->dccpms_sequence_window = req->rcv_wnd; 131 newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
167
168 newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
169 dccp_update_gss(newsk, dreq->dreq_iss);
170
171 newdp->dccps_isr = dreq->dreq_isr;
172 dccp_update_gsr(newsk, dreq->dreq_isr);
173 132
174 /* 133 /*
175 * SWL and AWL are initially adjusted so that they are not less than 134 * Activate features: initialise CCIDs, sequence windows etc.
176 * the initial Sequence Numbers received and sent, respectively:
177 * SWL := max(GSR + 1 - floor(W/4), ISR),
178 * AWL := max(GSS - W' + 1, ISS).
179 * These adjustments MUST be applied only at the beginning of the
180 * connection.
181 */ 135 */
182 dccp_set_seqno(&newdp->dccps_swl, 136 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
183 max48(newdp->dccps_swl, newdp->dccps_isr)); 137 /* It is still raw copy of parent, so invalidate
184 dccp_set_seqno(&newdp->dccps_awl, 138 * destructor and make plain sk_free() */
185 max48(newdp->dccps_awl, newdp->dccps_iss)); 139 newsk->sk_destruct = NULL;
186 140 sk_free(newsk);
141 return NULL;
142 }
187 dccp_init_xmit_timers(newsk); 143 dccp_init_xmit_timers(newsk);
188 144
189 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); 145 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
@@ -304,14 +260,17 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
304 260
305EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); 261EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
306 262
307void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) 263int dccp_reqsk_init(struct request_sock *req,
264 struct dccp_sock const *dp, struct sk_buff const *skb)
308{ 265{
309 struct dccp_request_sock *dreq = dccp_rsk(req); 266 struct dccp_request_sock *dreq = dccp_rsk(req);
310 267
311 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; 268 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
312 inet_rsk(req)->acked = 0; 269 inet_rsk(req)->acked = 0;
313 req->rcv_wnd = sysctl_dccp_feat_sequence_window;
314 dreq->dreq_timestamp_echo = 0; 270 dreq->dreq_timestamp_echo = 0;
271
272 /* inherit feature negotiation options from listening socket */
273 return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
315} 274}
316 275
317EXPORT_SYMBOL_GPL(dccp_reqsk_init); 276EXPORT_SYMBOL_GPL(dccp_reqsk_init);
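
dccp_feat_clone_list(), declared in the feat.h hunk above, gives the request sock a deep copy of the listener's negotiation list, so that per-connection negotiation cannot mutate the listening socket's state. A sketch of the idea using the feat.h structures; the helper names here are made up, and dccp_feat_type() is again an assumed feat.c-internal classifier:

    #include <linux/errno.h>
    #include <linux/list.h>
    #include <linux/slab.h>
    #include "feat.h"

    static struct dccp_feat_entry *feat_entry_dup(const struct dccp_feat_entry *e)
    {
            struct dccp_feat_entry *new = kmemdup(e, sizeof(*e), GFP_ATOMIC);

            /* SP features own a preference vector which must be copied, too */
            if (new != NULL && dccp_feat_type(e->feat_num) == FEAT_SP &&
                e->val.sp.vec != NULL) {
                    new->val.sp.vec = kmemdup(e->val.sp.vec, e->val.sp.len,
                                              GFP_ATOMIC);
                    if (new->val.sp.vec == NULL) {
                            kfree(new);
                            return NULL;
                    }
            }
            return new;
    }

    static int feat_clone_list_sketch(struct list_head const *from,
                                      struct list_head *to)
    {
            struct dccp_feat_entry *entry, *new;

            INIT_LIST_HEAD(to);
            list_for_each_entry(entry, from, node) {
                    new = feat_entry_dup(entry);
                    if (new == NULL) {
                            dccp_feat_list_purge(to);  /* undo partial copy */
                            return -ENOMEM;
                    }
                    list_add_tail(&new->node, to);
            }
            return 0;
    }
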
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 0809b63cb055..e5a32979d7d7 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,23 +23,20 @@
23#include "dccp.h" 23#include "dccp.h"
24#include "feat.h" 24#include "feat.h"
25 25
26int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; 26u64 dccp_decode_value_var(const u8 *bf, const u8 len)
27int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID;
28int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID;
29int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO;
30int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
31int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT;
32
33static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
34{ 27{
35 u32 value = 0; 28 u64 value = 0;
36 29
30 if (len >= DCCP_OPTVAL_MAXLEN)
31 value += ((u64)*bf++) << 40;
32 if (len > 4)
33 value += ((u64)*bf++) << 32;
37 if (len > 3) 34 if (len > 3)
38 value += *bf++ << 24; 35 value += ((u64)*bf++) << 24;
39 if (len > 2) 36 if (len > 2)
40 value += *bf++ << 16; 37 value += ((u64)*bf++) << 16;
41 if (len > 1) 38 if (len > 1)
42 value += *bf++ << 8; 39 value += ((u64)*bf++) << 8;
43 if (len > 0) 40 if (len > 0)
44 value += *bf; 41 value += *bf;
45 42
@@ -57,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
57 struct dccp_sock *dp = dccp_sk(sk); 54 struct dccp_sock *dp = dccp_sk(sk);
58 const struct dccp_hdr *dh = dccp_hdr(skb); 55 const struct dccp_hdr *dh = dccp_hdr(skb);
59 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; 56 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
60 u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
61 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 57 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
62 unsigned char *opt_ptr = options; 58 unsigned char *opt_ptr = options;
63 const unsigned char *opt_end = (unsigned char *)dh + 59 const unsigned char *opt_end = (unsigned char *)dh +
@@ -99,18 +95,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
99 } 95 }
100 96
101 /* 97 /*
102 * CCID-Specific Options (from RFC 4340, sec. 10.3):
103 *
104 * Option numbers 128 through 191 are for options sent from the
105 * HC-Sender to the HC-Receiver; option numbers 192 through 255
106 * are for options sent from the HC-Receiver to the HC-Sender.
107 *
108 * CCID-specific options are ignored during connection setup, as 98 * CCID-specific options are ignored during connection setup, as
109 * negotiation may still be in progress (see RFC 4340, 10.3). 99 * negotiation may still be in progress (see RFC 4340, 10.3).
110 * The same applies to Ack Vectors, as these depend on the CCID. 100 * The same applies to Ack Vectors, as these depend on the CCID.
111 *
112 */ 101 */
113 if (dreq != NULL && (opt >= 128 || 102 if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
114 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 103 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
115 goto ignore_option; 104 goto ignore_option;
116 105
@@ -131,43 +120,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
131 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), 120 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
132 (unsigned long long)opt_recv->dccpor_ndp); 121 (unsigned long long)opt_recv->dccpor_ndp);
133 break; 122 break;
134 case DCCPO_CHANGE_L: 123 case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
135 /* fall through */ 124 if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
136 case DCCPO_CHANGE_R:
137 if (pkt_type == DCCP_PKT_DATA)
138 break; 125 break;
139 if (len < 2) 126 rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
140 goto out_invalid_option; 127 *value, value + 1, len - 1);
141 rc = dccp_feat_change_recv(sk, opt, *value, value + 1, 128 if (rc)
142 len - 1); 129 goto out_featneg_failed;
143 /*
144 * When there is a change error, change_recv is
145 * responsible for dealing with it. i.e. reply with an
146 * empty confirm.
147 * If the change was mandatory, then we need to die.
148 */
149 if (rc && mandatory)
150 goto out_invalid_option;
151 break;
152 case DCCPO_CONFIRM_L:
153 /* fall through */
154 case DCCPO_CONFIRM_R:
155 if (pkt_type == DCCP_PKT_DATA)
156 break;
157 if (len < 2) /* FIXME this disallows empty confirm */
158 goto out_invalid_option;
159 if (dccp_feat_confirm_recv(sk, opt, *value,
160 value + 1, len - 1))
161 goto out_invalid_option;
162 break;
163 case DCCPO_ACK_VECTOR_0:
164 case DCCPO_ACK_VECTOR_1:
165 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
166 break;
167
168 if (dccp_msk(sk)->dccpms_send_ack_vector &&
169 dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
170 goto out_invalid_option;
171 break; 130 break;
172 case DCCPO_TIMESTAMP: 131 case DCCPO_TIMESTAMP:
173 if (len != 4) 132 if (len != 4)
@@ -195,6 +154,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
195 dccp_role(sk), ntohl(opt_val), 154 dccp_role(sk), ntohl(opt_val),
196 (unsigned long long) 155 (unsigned long long)
197 DCCP_SKB_CB(skb)->dccpd_ack_seq); 156 DCCP_SKB_CB(skb)->dccpd_ack_seq);
157 /* schedule an Ack in case this sender is quiescent */
158 inet_csk_schedule_ack(sk);
198 break; 159 break;
199 case DCCPO_TIMESTAMP_ECHO: 160 case DCCPO_TIMESTAMP_ECHO:
200 if (len != 4 && len != 6 && len != 8) 161 if (len != 4 && len != 6 && len != 8)
@@ -251,23 +212,25 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
251 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 212 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
252 dccp_role(sk), elapsed_time); 213 dccp_role(sk), elapsed_time);
253 break; 214 break;
254 case 128 ... 191: { 215 case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
255 const u16 idx = value - options;
256
257 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 216 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
258 opt, len, idx, 217 pkt_type, opt, value, len))
259 value) != 0)
260 goto out_invalid_option; 218 goto out_invalid_option;
261 }
262 break; 219 break;
263 case 192 ... 255: { 220 case DCCPO_ACK_VECTOR_0:
264 const u16 idx = value - options; 221 case DCCPO_ACK_VECTOR_1:
265 222 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
223 break;
224 /*
225 * Ack vectors are processed by the TX CCID if it is
226 * interested. The RX CCID need not parse Ack Vectors,
227 * since it is only interested in clearing old state.
228 * Fall through.
229 */
230 case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
266 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 231 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
267 opt, len, idx, 232 pkt_type, opt, value, len))
268 value) != 0)
269 goto out_invalid_option; 233 goto out_invalid_option;
270 }
271 break; 234 break;
272 default: 235 default:
273 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " 236 DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -289,8 +252,10 @@ out_nonsensical_length:
289 252
290out_invalid_option: 253out_invalid_option:
291 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); 254 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
292 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; 255 rc = DCCP_RESET_CODE_OPTION_ERROR;
293 DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); 256out_featneg_failed:
257 DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
258 DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
294 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; 259 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
295 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; 260 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
296 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; 261 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
@@ -299,9 +264,12 @@ out_invalid_option:
299 264
300EXPORT_SYMBOL_GPL(dccp_parse_options); 265EXPORT_SYMBOL_GPL(dccp_parse_options);
301 266
302static void dccp_encode_value_var(const u32 value, unsigned char *to, 267void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
303 const unsigned int len)
304{ 268{
269 if (len >= DCCP_OPTVAL_MAXLEN)
270 *to++ = (value & 0xFF0000000000ull) >> 40;
271 if (len > 4)
272 *to++ = (value & 0xFF00000000ull) >> 32;
305 if (len > 3) 273 if (len > 3)
306 *to++ = (value & 0xFF000000) >> 24; 274 *to++ = (value & 0xFF000000) >> 24;
307 if (len > 2) 275 if (len > 2)
@@ -461,92 +429,140 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
461 return 0; 429 return 0;
462} 430}
463 431
464static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, 432static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
465 u8 *val, u8 len)
466{ 433{
467 u8 *to; 434 struct dccp_sock *dp = dccp_sk(sk);
435 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
436 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
437 const u16 buflen = dccp_ackvec_buflen(av);
 438 /* Figure out how many options we need to represent the Ack Vector */
439 const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
440 u16 len = buflen + 2 * nr_opts;
441 u8 i, nonce = 0;
442 const unsigned char *tail, *from;
443 unsigned char *to;
468 444
469 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { 445 if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
470 DCCP_WARN("packet too small for feature %d option!\n", feat); 446 DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
447 dccp_packet_name(dcb->dccpd_type));
471 return -1; 448 return -1;
472 } 449 }
450 /*
 451 * Since Ack Vectors are variable-length, we cannot always predict
 452 * their size. To catch exceptional cases where the space runs out
453 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
454 */
455 if (len > DCCPAV_MIN_OPTLEN &&
456 len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
457 DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
458 "MPS=%u ==> reduce payload size?\n", len, skb->len,
459 dcb->dccpd_opt_len, dp->dccps_mss_cache);
460 dp->dccps_sync_scheduled = 1;
461 return 0;
462 }
463 dcb->dccpd_opt_len += len;
473 464
474 DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; 465 to = skb_push(skb, len);
466 len = buflen;
467 from = av->av_buf + av->av_buf_head;
468 tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
475 469
476 to = skb_push(skb, len + 3); 470 for (i = 0; i < nr_opts; ++i) {
477 *to++ = type; 471 int copylen = len;
478 *to++ = len + 3;
479 *to++ = feat;
480 472
481 if (len) 473 if (len > DCCP_SINGLE_OPT_MAXLEN)
482 memcpy(to, val, len); 474 copylen = DCCP_SINGLE_OPT_MAXLEN;
475
476 /*
477 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
 478 * its type; ack_nonce is the XOR (1-bit sum) of all individual buf_nonce's.
479 */
480 nonce ^= av->av_buf_nonce[i];
481
482 *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
483 *to++ = copylen + 2;
483 484
484 dccp_pr_debug("%s(%s (%d), ...), length %d\n", 485 /* Check if buf_head wraps */
485 dccp_feat_typename(type), 486 if (from + copylen > tail) {
486 dccp_feat_name(feat), feat, len); 487 const u16 tailsize = tail - from;
488
489 memcpy(to, from, tailsize);
490 to += tailsize;
491 len -= tailsize;
492 copylen -= tailsize;
493 from = av->av_buf;
494 }
495
496 memcpy(to, from, copylen);
497 from += copylen;
498 to += copylen;
499 len -= copylen;
500 }
501 /*
502 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
503 */
504 if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
505 return -ENOBUFS;
487 return 0; 506 return 0;
488} 507}
489 508
490static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) 509/**
510 * dccp_insert_option_mandatory - Mandatory option (5.8.2)
511 * Note that since we are using skb_push, this function needs to be called
512 * _after_ inserting the option it is supposed to influence (stack order).
513 */
514int dccp_insert_option_mandatory(struct sk_buff *skb)
491{ 515{
492 struct dccp_sock *dp = dccp_sk(sk); 516 if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
493 struct dccp_minisock *dmsk = dccp_msk(sk); 517 return -1;
494 struct dccp_opt_pend *opt, *next;
495 int change = 0;
496
497 /* confirm any options [NN opts] */
498 list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
499 dccp_insert_feat_opt(skb, opt->dccpop_type,
500 opt->dccpop_feat, opt->dccpop_val,
501 opt->dccpop_len);
502 /* fear empty confirms */
503 if (opt->dccpop_val)
504 kfree(opt->dccpop_val);
505 kfree(opt);
506 }
507 INIT_LIST_HEAD(&dmsk->dccpms_conf);
508
509 /* see which features we need to send */
510 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
511 /* see if we need to send any confirm */
512 if (opt->dccpop_sc) {
513 dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
514 opt->dccpop_feat,
515 opt->dccpop_sc->dccpoc_val,
516 opt->dccpop_sc->dccpoc_len);
517
518 BUG_ON(!opt->dccpop_sc->dccpoc_val);
519 kfree(opt->dccpop_sc->dccpoc_val);
520 kfree(opt->dccpop_sc);
521 opt->dccpop_sc = NULL;
522 }
523 518
524 /* any option not confirmed, re-send it */ 519 DCCP_SKB_CB(skb)->dccpd_opt_len++;
525 if (!opt->dccpop_conf) { 520 *skb_push(skb, 1) = DCCPO_MANDATORY;
526 dccp_insert_feat_opt(skb, opt->dccpop_type, 521 return 0;
527 opt->dccpop_feat, opt->dccpop_val, 522}
528 opt->dccpop_len); 523
529 change++; 524/**
530 } 525 * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
526 * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
527 * @feat: one out of %dccp_feature_numbers
528 * @val: NN value or SP array (preferred element first) to copy
529 * @len: true length of @val in bytes (excluding first element repetition)
530 * @repeat_first: whether to copy the first element of @val twice
531 * The last argument is used to construct Confirm options, where the preferred
532 * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
533 * lists are kept such that the preferred entry is always first, so we only need
534 * to copy twice, and avoid the overhead of cloning into a bigger array.
535 */
536int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
537 u8 *val, u8 len, bool repeat_first)
538{
539 u8 tot_len, *to;
540
541 /* take the `Feature' field and possible repetition into account */
542 if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
543 DCCP_WARN("length %u for feature %u too large\n", len, feat);
544 return -1;
531 } 545 }
532 546
533 /* Retransmit timer. 547 if (unlikely(val == NULL || len == 0))
534 * If this is the master listening sock, we don't set a timer on it. It 548 len = repeat_first = 0;
535 * should be fine because if the dude doesn't receive our RESPONSE 549 tot_len = 3 + repeat_first + len;
536 * [which will contain the CHANGE] he will send another REQUEST which
537 * will "retrnasmit" the change.
538 */
539 if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
540 dccp_pr_debug("reset feat negotiation timer %p\n", sk);
541 550
542 /* XXX don't reset the timer on re-transmissions. I.e. reset it 551 if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
543 * only when sending new stuff i guess. Currently the timer 552 DCCP_WARN("packet too small for feature %d option!\n", feat);
544 * never backs off because on re-transmission it just resets it! 553 return -1;
545 */
546 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
547 inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
548 } 554 }
555 DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
556
557 to = skb_push(skb, tot_len);
558 *to++ = type;
559 *to++ = tot_len;
560 *to++ = feat;
549 561
562 if (repeat_first)
563 *to++ = *val;
564 if (len)
565 memcpy(to, val, len);
550 return 0; 566 return 0;
551} 567}
552 568
@@ -565,19 +581,30 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
565int dccp_insert_options(struct sock *sk, struct sk_buff *skb) 581int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
566{ 582{
567 struct dccp_sock *dp = dccp_sk(sk); 583 struct dccp_sock *dp = dccp_sk(sk);
568 struct dccp_minisock *dmsk = dccp_msk(sk);
569 584
570 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 585 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
571 586
572 if (dmsk->dccpms_send_ndp_count && 587 if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
573 dccp_insert_option_ndp(sk, skb))
574 return -1; 588 return -1;
575 589
576 if (!dccp_packet_without_ack(skb)) { 590 if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
577 if (dmsk->dccpms_send_ack_vector && 591
578 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && 592 /* Feature Negotiation */
579 dccp_insert_option_ackvec(sk, skb)) 593 if (dccp_feat_insert_opts(dp, NULL, skb))
580 return -1; 594 return -1;
595
596 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
597 /*
598 * Obtain RTT sample from Request/Response exchange.
599 * This is currently used in CCID 3 initialisation.
600 */
601 if (dccp_insert_option_timestamp(sk, skb))
602 return -1;
603
604 } else if (dccp_ackvec_pending(sk) &&
605 dccp_insert_option_ackvec(sk, skb)) {
606 return -1;
607 }
581 } 608 }
582 609
583 if (dp->dccps_hc_rx_insert_options) { 610 if (dp->dccps_hc_rx_insert_options) {
@@ -586,21 +613,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
586 dp->dccps_hc_rx_insert_options = 0; 613 dp->dccps_hc_rx_insert_options = 0;
587 } 614 }
588 615
589 /* Feature negotiation */
590 /* Data packets can't do feat negotiation */
591 if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
592 DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
593 dccp_insert_options_feat(sk, skb))
594 return -1;
595
596 /*
597 * Obtain RTT sample from Request/Response exchange.
598 * This is currently used in CCID 3 initialisation.
599 */
600 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST &&
601 dccp_insert_option_timestamp(sk, skb))
602 return -1;
603
604 if (dp->dccps_timestamp_echo != 0 && 616 if (dp->dccps_timestamp_echo != 0 &&
605 dccp_insert_option_timestamp_echo(dp, NULL, skb)) 617 dccp_insert_option_timestamp_echo(dp, NULL, skb))
606 return -1; 618 return -1;
@@ -613,6 +625,9 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
613{ 625{
614 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 626 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
615 627
628 if (dccp_feat_insert_opts(NULL, dreq, skb))
629 return -1;
630
616 if (dreq->dreq_timestamp_echo != 0 && 631 if (dreq->dreq_timestamp_echo != 0 &&
617 dccp_insert_option_timestamp_echo(NULL, dreq, skb)) 632 dccp_insert_option_timestamp_echo(NULL, dreq, skb))
618 return -1; 633 return -1;
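
Since both helpers build options with skb_push(), options appear on the wire in the reverse order of insertion. A hypothetical caller that wants "Mandatory, Change L(CCID, ...)" on the wire therefore inserts the Change first and the Mandatory byte second; the skb, feature number and preference list below are placeholders:

    /* Illustrative call order only -- values are made up */
    static int send_mandatory_change(struct sk_buff *skb)
    {
            u8 ccid_pref[] = { 3, 2 };  /* hypothetical SP preference list */

            /* inserted first, transmitted second */
            if (dccp_insert_fn_opt(skb, DCCPO_CHANGE_L, DCCPF_CCID,
                                   ccid_pref, sizeof(ccid_pref), false))
                    return -1;
            /* inserted second, transmitted first, right before the Change */
            return dccp_insert_option_mandatory(skb);
    }
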
diff --git a/net/dccp/output.c b/net/dccp/output.c
index d06945c7d3df..2532797a8009 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -26,11 +26,13 @@ static inline void dccp_event_ack_sent(struct sock *sk)
26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
27} 27}
28 28
29static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) 29/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
30static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
30{ 31{
31 skb_set_owner_w(skb, sk); 32 skb_set_owner_w(skb, sk);
32 WARN_ON(sk->sk_send_head); 33 WARN_ON(sk->sk_send_head);
33 sk->sk_send_head = skb; 34 sk->sk_send_head = skb;
35 return skb_clone(sk->sk_send_head, gfp_any());
34} 36}
35 37
36/* 38/*
@@ -161,21 +163,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
161 struct inet_connection_sock *icsk = inet_csk(sk); 163 struct inet_connection_sock *icsk = inet_csk(sk);
162 struct dccp_sock *dp = dccp_sk(sk); 164 struct dccp_sock *dp = dccp_sk(sk);
163 u32 ccmps = dccp_determine_ccmps(dp); 165 u32 ccmps = dccp_determine_ccmps(dp);
164 int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 166 u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
165 167
166 /* Account for header lengths and IPv4/v6 option overhead */ 168 /* Account for header lengths and IPv4/v6 option overhead */
167 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + 169 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
168 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); 170 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
169 171
170 /* 172 /*
171 * FIXME: this should come from the CCID infrastructure, where, say, 173 * Leave enough headroom for common DCCP header options.
172 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets 174 * This only considers options which may appear on DCCP-Data packets, as
173 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED 175 * per table 3 in RFC 4340, 5.8. When running out of space for other
174 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to 176 * options (eg. Ack Vector which can take up to 255 bytes), it is better
175 * make it a multiple of 4 177 * to schedule a separate Ack. Thus we leave headroom for the following:
178 * - 1 byte for Slow Receiver (11.6)
179 * - 6 bytes for Timestamp (13.1)
180 * - 10 bytes for Timestamp Echo (13.3)
181 * - 8 bytes for NDP count (7.7, when activated)
182 * - 6 bytes for Data Checksum (9.3)
183 * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
176 */ 184 */
177 185 cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
178 cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; 186 (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
179 187
180 /* And store cached results */ 188 /* And store cached results */
181 icsk->icsk_pmtu_cookie = pmtu; 189 icsk->icsk_pmtu_cookie = pmtu;
@@ -200,95 +208,158 @@ void dccp_write_space(struct sock *sk)
200} 208}
201 209
202/** 210/**
203 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet 211 * dccp_wait_for_ccid - Await CCID send permission
204 * @sk: socket to wait for 212 * @sk: socket to wait for
205 * @skb: current skb to pass on for waiting 213 * @delay: timeout in jiffies
206 * @delay: sleep timeout in milliseconds (> 0) 214 * This is used by CCIDs which need to delay the send time in process context.
207 * This function is called by default when the socket is closed, and
208 * when a non-zero linger time is set on the socket. For consistency
209 */ 215 */
210static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) 216static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
211{ 217{
212 struct dccp_sock *dp = dccp_sk(sk);
213 DEFINE_WAIT(wait); 218 DEFINE_WAIT(wait);
214 unsigned long jiffdelay; 219 long remaining;
215 int rc;
216 220
217 do { 221 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
218 dccp_pr_debug("delayed send by %d msec\n", delay); 222 sk->sk_write_pending++;
219 jiffdelay = msecs_to_jiffies(delay); 223 release_sock(sk);
220 224
221 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 225 remaining = schedule_timeout(delay);
222 226
223 sk->sk_write_pending++; 227 lock_sock(sk);
224 release_sock(sk); 228 sk->sk_write_pending--;
225 schedule_timeout(jiffdelay); 229 finish_wait(sk->sk_sleep, &wait);
226 lock_sock(sk);
227 sk->sk_write_pending--;
228 230
229 if (sk->sk_err) 231 if (signal_pending(current) || sk->sk_err)
230 goto do_error; 232 return -1;
231 if (signal_pending(current)) 233 return remaining;
232 goto do_interrupted; 234}
233 235
234 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 236/**
235 } while ((delay = rc) > 0); 237 * dccp_xmit_packet - Send data packet under control of CCID
236out: 238 * Transmits next-queued payload and informs CCID to account for the packet.
237 finish_wait(sk->sk_sleep, &wait); 239 */
238 return rc; 240static void dccp_xmit_packet(struct sock *sk)
239 241{
240do_error: 242 int err, len;
241 rc = -EPIPE; 243 struct dccp_sock *dp = dccp_sk(sk);
242 goto out; 244 struct sk_buff *skb = dccp_qpolicy_pop(sk);
243do_interrupted: 245
244 rc = -EINTR; 246 if (unlikely(skb == NULL))
245 goto out; 247 return;
248 len = skb->len;
249
250 if (sk->sk_state == DCCP_PARTOPEN) {
251 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
252 /*
253 * See 8.1.5 - Handshake Completion.
254 *
255 * For robustness we resend Confirm options until the client has
256 * entered OPEN. During the initial feature negotiation, the MPS
257 * is smaller than usual, reduced by the Change/Confirm options.
258 */
259 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
260 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
261 dccp_send_ack(sk);
262 dccp_feat_list_purge(&dp->dccps_featneg);
263 }
264
265 inet_csk_schedule_ack(sk);
266 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
267 inet_csk(sk)->icsk_rto,
268 DCCP_RTO_MAX);
269 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
270 } else if (dccp_ack_pending(sk)) {
271 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
272 } else {
273 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
274 }
275
276 err = dccp_transmit_skb(sk, skb);
277 if (err)
278 dccp_pr_debug("transmit_skb() returned err=%d\n", err);
279 /*
280 * Register this one as sent even if an error occurred. To the remote
281 * end a local packet drop is indistinguishable from network loss, i.e.
282 * any local drop will eventually be reported via receiver feedback.
283 */
284 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
285
286 /*
287 * If the CCID needs to transfer additional header options out-of-band
288 * (e.g. Ack Vectors or feature-negotiation options), it activates this
289 * flag to schedule a Sync. The Sync will automatically incorporate all
290 * currently pending header options, thus clearing the backlog.
291 */
292 if (dp->dccps_sync_scheduled)
293 dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
246} 294}
247 295
248void dccp_write_xmit(struct sock *sk, int block) 296/**
297 * dccp_flush_write_queue - Drain queue at end of connection
298 * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
299 * happen that the TX queue is not empty at the end of a connection. We give the
300 * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
301 * returns with a non-empty write queue, it will be purged later.
302 */
303void dccp_flush_write_queue(struct sock *sk, long *time_budget)
249{ 304{
250 struct dccp_sock *dp = dccp_sk(sk); 305 struct dccp_sock *dp = dccp_sk(sk);
251 struct sk_buff *skb; 306 struct sk_buff *skb;
307 long delay, rc;
308
309 while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
310 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
252 311
253 while ((skb = skb_peek(&sk->sk_write_queue))) { 312 switch (ccid_packet_dequeue_eval(rc)) {
254 int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 313 case CCID_PACKET_WILL_DEQUEUE_LATER:
255 314 /*
256 if (err > 0) { 315 * If the CCID determines when to send, the next sending
257 if (!block) { 316 * time is unknown or the CCID may not even send again
258 sk_reset_timer(sk, &dp->dccps_xmit_timer, 317 * (e.g. remote host crashes or lost Ack packets).
259 msecs_to_jiffies(err)+jiffies); 318 */
260 break; 319 DCCP_WARN("CCID did not manage to send all packets\n");
261 } else 320 return;
262 err = dccp_wait_for_ccid(sk, skb, err); 321 case CCID_PACKET_DELAY:
263 if (err && err != -EINTR) 322 delay = msecs_to_jiffies(rc);
264 DCCP_BUG("err=%d after dccp_wait_for_ccid", err); 323 if (delay > *time_budget)
324 return;
325 rc = dccp_wait_for_ccid(sk, delay);
326 if (rc < 0)
327 return;
328 *time_budget -= (delay - rc);
329 /* check again if we can send now */
330 break;
331 case CCID_PACKET_SEND_AT_ONCE:
332 dccp_xmit_packet(sk);
333 break;
334 case CCID_PACKET_ERR:
335 skb_dequeue(&sk->sk_write_queue);
336 kfree_skb(skb);
337 dccp_pr_debug("packet discarded due to err=%ld\n", rc);
265 } 338 }
339 }
340}
266 341
267 skb_dequeue(&sk->sk_write_queue); 342void dccp_write_xmit(struct sock *sk)
268 if (err == 0) { 343{
269 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); 344 struct dccp_sock *dp = dccp_sk(sk);
270 const int len = skb->len; 345 struct sk_buff *skb;
271 346
272 if (sk->sk_state == DCCP_PARTOPEN) { 347 while ((skb = dccp_qpolicy_top(sk))) {
273 /* See 8.1.5. Handshake Completion */ 348 int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
274 inet_csk_schedule_ack(sk); 349
275 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 350 switch (ccid_packet_dequeue_eval(rc)) {
276 inet_csk(sk)->icsk_rto, 351 case CCID_PACKET_WILL_DEQUEUE_LATER:
277 DCCP_RTO_MAX); 352 return;
278 dcb->dccpd_type = DCCP_PKT_DATAACK; 353 case CCID_PACKET_DELAY:
279 } else if (dccp_ack_pending(sk)) 354 sk_reset_timer(sk, &dp->dccps_xmit_timer,
280 dcb->dccpd_type = DCCP_PKT_DATAACK; 355 jiffies + msecs_to_jiffies(rc));
281 else 356 return;
282 dcb->dccpd_type = DCCP_PKT_DATA; 357 case CCID_PACKET_SEND_AT_ONCE:
283 358 dccp_xmit_packet(sk);
284 err = dccp_transmit_skb(sk, skb); 359 break;
285 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); 360 case CCID_PACKET_ERR:
286 if (err) 361 dccp_qpolicy_drop(sk, skb);
287 DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", 362 dccp_pr_debug("packet discarded due to err=%d\n", rc);
288 err);
289 } else {
290 dccp_pr_debug("packet discarded due to err=%d\n", err);
291 kfree_skb(skb);
292 } 363 }
293 } 364 }
294} 365}
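
ccid_packet_dequeue_eval() itself belongs to the ccid.h changes, which this page does not show. Its contract, as used by dccp_write_xmit() and dccp_flush_write_queue() above: a negative CCID return code means drop the packet, zero means send at once, a small positive value is a pacing delay in milliseconds, and dedicated constants defer dequeueing to the CCID. A sketch of such a mapping with locally assumed constants (the real enum values may differ):

    /* Sketch with assumed constants -- the real enum and inline function are
     * part of the net/dccp/ccid.h changes in this patch set. */
    enum ccid_dequeueing_decision {
            CCID_PACKET_SEND_AT_ONCE =       0x00000,  /* "green light" */
            CCID_PACKET_DELAY =              0x10000,  /* retry after rc msecs */
            CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID dequeues itself */
            CCID_PACKET_ERR =                0xF0000,  /* drop this packet */
    };

    static inline int packet_dequeue_eval_sketch(const int return_code)
    {
            if (return_code < 0)            /* hard error reported by the CCID */
                    return CCID_PACKET_ERR;
            if (return_code == 0)           /* permission to send right away */
                    return CCID_PACKET_SEND_AT_ONCE;
            if (return_code <= 0xFFFF)      /* pacing delay in milliseconds */
                    return CCID_PACKET_DELAY;
            return return_code;             /* verdict constants pass through */
    }
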
@@ -339,10 +410,12 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
339 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; 410 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
340 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; 411 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
341 412
342 if (dccp_insert_options_rsk(dreq, skb)) { 413 /* Resolve feature dependencies resulting from choice of CCID */
343 kfree_skb(skb); 414 if (dccp_feat_server_ccid_dependencies(dreq))
344 return NULL; 415 goto response_failed;
345 } 416
417 if (dccp_insert_options_rsk(dreq, skb))
418 goto response_failed;
346 419
347 /* Build and checksum header */ 420 /* Build and checksum header */
348 dh = dccp_zeroed_hdr(skb, dccp_header_size); 421 dh = dccp_zeroed_hdr(skb, dccp_header_size);
@@ -363,6 +436,9 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
363 inet_rsk(req)->acked = 1; 436 inet_rsk(req)->acked = 1;
364 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 437 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
365 return skb; 438 return skb;
439response_failed:
440 kfree_skb(skb);
441 return NULL;
366} 442}
367 443
368EXPORT_SYMBOL_GPL(dccp_make_response); 444EXPORT_SYMBOL_GPL(dccp_make_response);
@@ -447,8 +523,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
447/* 523/*
448 * Do all connect socket setups that can be done AF independent. 524 * Do all connect socket setups that can be done AF independent.
449 */ 525 */
450static inline void dccp_connect_init(struct sock *sk) 526int dccp_connect(struct sock *sk)
451{ 527{
528 struct sk_buff *skb;
452 struct dccp_sock *dp = dccp_sk(sk); 529 struct dccp_sock *dp = dccp_sk(sk);
453 struct dst_entry *dst = __sk_dst_get(sk); 530 struct dst_entry *dst = __sk_dst_get(sk);
454 struct inet_connection_sock *icsk = inet_csk(sk); 531 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -458,19 +535,13 @@ static inline void dccp_connect_init(struct sock *sk)
458 535
459 dccp_sync_mss(sk, dst_mtu(dst)); 536 dccp_sync_mss(sk, dst_mtu(dst));
460 537
538 /* do not connect if feature negotiation setup fails */
539 if (dccp_feat_finalise_settings(dccp_sk(sk)))
540 return -EPROTO;
541
461 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ 542 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
462 dp->dccps_gar = dp->dccps_iss; 543 dp->dccps_gar = dp->dccps_iss;
463 544
464 icsk->icsk_retransmits = 0;
465}
466
467int dccp_connect(struct sock *sk)
468{
469 struct sk_buff *skb;
470 struct inet_connection_sock *icsk = inet_csk(sk);
471
472 dccp_connect_init(sk);
473
474 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 545 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
475 if (unlikely(skb == NULL)) 546 if (unlikely(skb == NULL))
476 return -ENOBUFS; 547 return -ENOBUFS;
@@ -480,11 +551,11 @@ int dccp_connect(struct sock *sk)
480 551
481 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; 552 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
482 553
483 dccp_skb_entail(sk, skb); 554 dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
484 dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
485 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 555 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
486 556
487 /* Timer for repeating the REQUEST until an answer. */ 557 /* Timer for repeating the REQUEST until an answer. */
558 icsk->icsk_retransmits = 0;
488 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 559 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
489 icsk->icsk_rto, DCCP_RTO_MAX); 560 icsk->icsk_rto, DCCP_RTO_MAX);
490 return 0; 561 return 0;
@@ -571,6 +642,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
571 DCCP_SKB_CB(skb)->dccpd_type = pkt_type; 642 DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
572 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; 643 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
573 644
645 /*
646 * Clear the flag in case the Sync was scheduled for out-of-band data,
647 * such as carrying a long Ack Vector.
648 */
649 dccp_sk(sk)->dccps_sync_scheduled = 0;
650
574 dccp_transmit_skb(sk, skb); 651 dccp_transmit_skb(sk, skb);
575} 652}
576 653
@@ -599,9 +676,7 @@ void dccp_send_close(struct sock *sk, const int active)
599 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 676 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
600 677
601 if (active) { 678 if (active) {
602 dccp_write_xmit(sk, 1); 679 skb = dccp_skb_entail(sk, skb);
603 dccp_skb_entail(sk, skb);
604 dccp_transmit_skb(sk, skb_clone(skb, prio));
605 /* 680 /*
606 * Retransmission timer for active-close: RFC 4340, 8.3 requires 681 * Retransmission timer for active-close: RFC 4340, 8.3 requires
607 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ 682 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -614,6 +689,6 @@ void dccp_send_close(struct sock *sk, const int active)
614 */ 689 */
615 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 690 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
616 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); 691 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
617 } else 692 }
618 dccp_transmit_skb(sk, skb); 693 dccp_transmit_skb(sk, skb);
619} 694}
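
Both dccp_connect() and the active-close path above now pass the packet through dccp_skb_entail() and transmit its return value, collapsing the earlier entail-then-clone two-step. Judging from these call sites, the helper (defined further up in output.c, outside this excerpt) is presumably shaped like the following reconstruction:

/* Sketch, reconstructed from the call sites above: queue skb as the
 * retransmit head and return a clone for immediate transmission. */
static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
	skb_set_owner_w(skb, sk);
	WARN_ON(sk->sk_send_head);
	sk->sk_send_head = skb;
	return skb_clone(sk->sk_send_head, gfp_any());
}

This keeps the original skb pinned at sk_send_head for the retransmit timer while the clone goes out on the wire.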
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 81368a7f5379..eaa59d82ab0f 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -46,75 +46,54 @@ static struct {
46 struct kfifo *fifo; 46 struct kfifo *fifo;
47 spinlock_t lock; 47 spinlock_t lock;
48 wait_queue_head_t wait; 48 wait_queue_head_t wait;
49 struct timespec tstart; 49 ktime_t start;
50} dccpw; 50} dccpw;
51 51
52static void printl(const char *fmt, ...) 52static void jdccp_write_xmit(struct sock *sk)
53{ 53{
54 va_list args;
55 int len;
56 struct timespec now;
57 char tbuf[256];
58
59 va_start(args, fmt);
60 getnstimeofday(&now);
61
62 now = timespec_sub(now, dccpw.tstart);
63
64 len = sprintf(tbuf, "%lu.%06lu ",
65 (unsigned long) now.tv_sec,
66 (unsigned long) now.tv_nsec / NSEC_PER_USEC);
67 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
68 va_end(args);
69
70 kfifo_put(dccpw.fifo, tbuf, len);
71 wake_up(&dccpw.wait);
72}
73
74static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
75 struct msghdr *msg, size_t size)
76{
77 const struct dccp_minisock *dmsk = dccp_msk(sk);
78 const struct inet_sock *inet = inet_sk(sk); 54 const struct inet_sock *inet = inet_sk(sk);
79 const struct ccid3_hc_tx_sock *hctx; 55 struct ccid3_hc_tx_sock *hctx = NULL;
56 struct timespec tv;
57 char buf[256];
58 int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk));
80 59
81 if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) 60 if (ccid == DCCPC_CCID3)
82 hctx = ccid3_hc_tx_sk(sk); 61 hctx = ccid3_hc_tx_sk(sk);
83 else
84 hctx = NULL;
85 62
86 if (port == 0 || ntohs(inet->dport) == port || 63 if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) {
87 ntohs(inet->sport) == port) { 64
88 if (hctx) 65 tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start));
89 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " 66 len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d",
90 "%llu %llu %d\n", 67 (unsigned long)tv.tv_sec,
91 NIPQUAD(inet->saddr), ntohs(inet->sport), 68 (unsigned long)tv.tv_nsec,
92 NIPQUAD(inet->daddr), ntohs(inet->dport), size,
93 hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
94 hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc,
95 hctx->ccid3hctx_x_recv >> 6,
96 hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi);
97 else
98 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
99 NIPQUAD(inet->saddr), ntohs(inet->sport), 69 NIPQUAD(inet->saddr), ntohs(inet->sport),
100 NIPQUAD(inet->daddr), ntohs(inet->dport), size); 70 NIPQUAD(inet->daddr), ntohs(inet->dport), ccid);
71
72 if (hctx)
73 len += sprintf(buf + len, " %d %d %d %u %u %u %d",
74 hctx->s, hctx->rtt, hctx->p, hctx->x_calc,
75 (unsigned)(hctx->x_recv >> 6),
76 (unsigned)(hctx->x >> 6), hctx->t_ipi);
77
78 len += sprintf(buf + len, "\n");
79 kfifo_put(dccpw.fifo, buf, len);
80 wake_up(&dccpw.wait);
101 } 81 }
102 82
103 jprobe_return(); 83 jprobe_return();
104 return 0;
105} 84}
106 85
107static struct jprobe dccp_send_probe = { 86static struct jprobe dccp_send_probe = {
108 .kp = { 87 .kp = {
109 .symbol_name = "dccp_sendmsg", 88 .symbol_name = "dccp_write_xmit",
110 }, 89 },
111 .entry = jdccp_sendmsg, 90 .entry = jdccp_write_xmit,
112}; 91};
113 92
114static int dccpprobe_open(struct inode *inode, struct file *file) 93static int dccpprobe_open(struct inode *inode, struct file *file)
115{ 94{
116 kfifo_reset(dccpw.fifo); 95 kfifo_reset(dccpw.fifo);
117 getnstimeofday(&dccpw.tstart); 96 dccpw.start = ktime_get();
118 return 0; 97 return 0;
119} 98}
120 99
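
With the probe point moved from dccp_sendmsg() to dccp_write_xmit(), each traced line now records a nanosecond-resolution send timestamp, the connection 4-tuple and the TX CCID, plus the CCID-3 sender state (s, rtt, p, x_calc, x_recv, x, t_ipi) when applicable. A minimal reader for the procfs interface (a sketch; the /proc/net/dccpprobe path is assumed from the module's usual procname):

#include <stdio.h>

int main(void)
{
	char line[512];
	/* path assumed from the probe module's procfs registration */
	FILE *fp = fopen("/proc/net/dccpprobe", "r");

	if (fp == NULL) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), fp) != NULL)
		fputs(line, stdout);		/* one sample per line */
	fclose(fp);
	return 0;
}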
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d0bd34819761..ecf3be961e11 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -67,6 +67,9 @@ void dccp_set_state(struct sock *sk, const int state)
67 case DCCP_OPEN: 67 case DCCP_OPEN:
68 if (oldstate != DCCP_OPEN) 68 if (oldstate != DCCP_OPEN)
69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB); 69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
70 /* Client retransmits all Confirm options until entering OPEN */
71 if (oldstate == DCCP_PARTOPEN)
72 dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
70 break; 73 break;
71 74
72 case DCCP_CLOSED: 75 case DCCP_CLOSED:
@@ -175,63 +178,25 @@ EXPORT_SYMBOL_GPL(dccp_state_name);
175int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 178int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
176{ 179{
177 struct dccp_sock *dp = dccp_sk(sk); 180 struct dccp_sock *dp = dccp_sk(sk);
178 struct dccp_minisock *dmsk = dccp_msk(sk);
179 struct inet_connection_sock *icsk = inet_csk(sk); 181 struct inet_connection_sock *icsk = inet_csk(sk);
180 182
181 dccp_minisock_init(&dp->dccps_minisock);
182
183 icsk->icsk_rto = DCCP_TIMEOUT_INIT; 183 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
184 icsk->icsk_syn_retries = sysctl_dccp_request_retries; 184 icsk->icsk_syn_retries = sysctl_dccp_request_retries;
185 sk->sk_state = DCCP_CLOSED; 185 sk->sk_state = DCCP_CLOSED;
186 sk->sk_write_space = dccp_write_space; 186 sk->sk_write_space = dccp_write_space;
187 icsk->icsk_sync_mss = dccp_sync_mss; 187 icsk->icsk_sync_mss = dccp_sync_mss;
188 dp->dccps_mss_cache = 536; 188 dp->dccps_mss_cache = TCP_MIN_RCVMSS;
189 dp->dccps_rate_last = jiffies; 189 dp->dccps_rate_last = jiffies;
190 dp->dccps_role = DCCP_ROLE_UNDEFINED; 190 dp->dccps_role = DCCP_ROLE_UNDEFINED;
191 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; 191 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
192 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; 192 dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
193 193
194 dccp_init_xmit_timers(sk); 194 dccp_init_xmit_timers(sk);
195 195
196 /* 196 INIT_LIST_HEAD(&dp->dccps_featneg);
197 * FIXME: We're hardcoding the CCID, and doing this at this point makes 197 /* control socket doesn't need feat nego */
198 * the listening (master) sock get CCID control blocks, which is not 198 if (likely(ctl_sock_initialized))
199 * necessary, but for now, to not mess with the test userspace apps, 199 return dccp_feat_init(sk);
200 * lets leave it here, later the real solution is to do this in a
201 * setsockopt(CCIDs-I-want/accept). -acme
202 */
203 if (likely(ctl_sock_initialized)) {
204 int rc = dccp_feat_init(dmsk);
205
206 if (rc)
207 return rc;
208
209 if (dmsk->dccpms_send_ack_vector) {
210 dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
211 if (dp->dccps_hc_rx_ackvec == NULL)
212 return -ENOMEM;
213 }
214 dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
215 sk, GFP_KERNEL);
216 dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
217 sk, GFP_KERNEL);
218 if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
219 dp->dccps_hc_tx_ccid == NULL)) {
220 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
221 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
222 if (dmsk->dccpms_send_ack_vector) {
223 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
224 dp->dccps_hc_rx_ackvec = NULL;
225 }
226 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
227 return -ENOMEM;
228 }
229 } else {
230 /* control socket doesn't need feat nego */
231 INIT_LIST_HEAD(&dmsk->dccpms_pending);
232 INIT_LIST_HEAD(&dmsk->dccpms_conf);
233 }
234
235 return 0; 200 return 0;
236} 201}
237 202
@@ -240,7 +205,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock);
240void dccp_destroy_sock(struct sock *sk) 205void dccp_destroy_sock(struct sock *sk)
241{ 206{
242 struct dccp_sock *dp = dccp_sk(sk); 207 struct dccp_sock *dp = dccp_sk(sk);
243 struct dccp_minisock *dmsk = dccp_msk(sk);
244 208
245 /* 209 /*
246 * DCCP doesn't use sk_write_queue, just sk_send_head 210 * DCCP doesn't use sk_write_queue, just sk_send_head
@@ -258,7 +222,7 @@ void dccp_destroy_sock(struct sock *sk)
258 kfree(dp->dccps_service_list); 222 kfree(dp->dccps_service_list);
259 dp->dccps_service_list = NULL; 223 dp->dccps_service_list = NULL;
260 224
261 if (dmsk->dccpms_send_ack_vector) { 225 if (dp->dccps_hc_rx_ackvec != NULL) {
262 dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 226 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
263 dp->dccps_hc_rx_ackvec = NULL; 227 dp->dccps_hc_rx_ackvec = NULL;
264 } 228 }
@@ -267,7 +231,7 @@ void dccp_destroy_sock(struct sock *sk)
267 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 231 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
268 232
269 /* clean up feature negotiation state */ 233 /* clean up feature negotiation state */
270 dccp_feat_clean(dmsk); 234 dccp_feat_list_purge(&dp->dccps_featneg);
271} 235}
272 236
273EXPORT_SYMBOL_GPL(dccp_destroy_sock); 237EXPORT_SYMBOL_GPL(dccp_destroy_sock);
@@ -277,6 +241,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
277 struct dccp_sock *dp = dccp_sk(sk); 241 struct dccp_sock *dp = dccp_sk(sk);
278 242
279 dp->dccps_role = DCCP_ROLE_LISTEN; 243 dp->dccps_role = DCCP_ROLE_LISTEN;
244 /* do not start to listen if feature negotiation setup fails */
245 if (dccp_feat_finalise_settings(dp))
246 return -EPROTO;
280 return inet_csk_listen_start(sk, backlog); 247 return inet_csk_listen_start(sk, backlog);
281} 248}
282 249
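
dccp_listen_start() finalises the feature settings as well, so a misconfigured server fails early: listen(2) returns -EPROTO rather than negotiating inconsistent features later. A hedged server-side fragment (port is a placeholder; setting DCCP_SOCKOPT_SERVICE first is omitted for brevity):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in sa;
	int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&sa, 0, sizeof(sa));
	sa.sin_family      = AF_INET;
	sa.sin_addr.s_addr = htonl(INADDR_ANY);
	sa.sin_port        = htons(5001);	/* example port */

	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
	    listen(fd, 5) < 0)	/* EPROTO on inconsistent feature setup */
		perror("bind/listen");
	return 0;
}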
@@ -466,42 +433,70 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
466 return 0; 433 return 0;
467} 434}
468 435
469/* byte 1 is feature. the rest is the preference list */ 436static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
470static int dccp_setsockopt_change(struct sock *sk, int type,
471 struct dccp_so_feat __user *optval)
472{ 437{
473 struct dccp_so_feat opt; 438 u8 *list, len;
474 u8 *val; 439 int i, rc;
475 int rc;
476 440
477 if (copy_from_user(&opt, optval, sizeof(opt))) 441 if (cscov < 0 || cscov > 15)
478 return -EFAULT; 442 return -EINVAL;
479 /* 443 /*
480 * rfc4340: 6.1. Change Options 444 * Populate a list of permissible values, in the range cscov...15. This
445 * is necessary since feature negotiation of single values only works if
446 * both sides happen to choose the same value. Since the list starts
447 * lowest-value first, negotiation will pick the smallest shared value.
481 */ 448 */
482 if (opt.dccpsf_len < 1) 449 if (cscov == 0)
450 return 0;
451 len = 16 - cscov;
452
453 list = kmalloc(len, GFP_KERNEL);
454 if (list == NULL)
455 return -ENOBUFS;
456
457 for (i = 0; i < len; i++)
458 list[i] = cscov++;
459
460 rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
461
462 if (rc == 0) {
463 if (rx)
464 dccp_sk(sk)->dccps_pcrlen = cscov;
465 else
466 dccp_sk(sk)->dccps_pcslen = cscov;
467 }
468 kfree(list);
469 return rc;
470}
471
472static int dccp_setsockopt_ccid(struct sock *sk, int type,
473 char __user *optval, int optlen)
474{
475 u8 *val;
476 int rc = 0;
477
478 if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
483 return -EINVAL; 479 return -EINVAL;
484 480
485 val = kmalloc(opt.dccpsf_len, GFP_KERNEL); 481 val = kmalloc(optlen, GFP_KERNEL);
486 if (!val) 482 if (val == NULL)
487 return -ENOMEM; 483 return -ENOMEM;
488 484
489 if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { 485 if (copy_from_user(val, optval, optlen)) {
490 rc = -EFAULT; 486 kfree(val);
491 goto out_free_val; 487 return -EFAULT;
492 } 488 }
493 489
494 rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, 490 lock_sock(sk);
495 val, opt.dccpsf_len, GFP_KERNEL); 491 if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
496 if (rc) 492 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
497 goto out_free_val;
498 493
499out: 494 if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
500 return rc; 495 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
496 release_sock(sk);
501 497
502out_free_val:
503 kfree(val); 498 kfree(val);
504 goto out; 499 return rc;
505} 500}
506 501
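
dccp_setsockopt_cscov() and dccp_setsockopt_ccid() share the same idea: user space supplies a value (or a byte array of CCID numbers, highest preference first), and the kernel registers it as a server-priority feature list. A hedged fragment showing the CCID variant (assumes the DCCP_SOCKOPT_* and DCCPC_* constants from this tree's <linux/dccp.h>):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/dccp.h>

/* prefer CCID-3, fall back to CCID-2, for both directions */
static int set_preferred_ccids(int fd)
{
	unsigned char ccids[] = { DCCPC_CCID3, DCCPC_CCID2 };

	if (setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_CCID,
		       ccids, sizeof(ccids)) < 0) {
		perror("setsockopt(DCCP_SOCKOPT_CCID)");
		return -1;
	}
	return 0;
}

Using DCCP_SOCKOPT_TX_CCID or DCCP_SOCKOPT_RX_CCID instead restricts the preference list to one direction.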
507static int do_dccp_setsockopt(struct sock *sk, int level, int optname, 502static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -510,7 +505,21 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
510 struct dccp_sock *dp = dccp_sk(sk); 505 struct dccp_sock *dp = dccp_sk(sk);
511 int val, err = 0; 506 int val, err = 0;
512 507
513 if (optlen < sizeof(int)) 508 switch (optname) {
509 case DCCP_SOCKOPT_PACKET_SIZE:
510 DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
511 return 0;
512 case DCCP_SOCKOPT_CHANGE_L:
513 case DCCP_SOCKOPT_CHANGE_R:
514 DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
515 return 0;
516 case DCCP_SOCKOPT_CCID:
517 case DCCP_SOCKOPT_RX_CCID:
518 case DCCP_SOCKOPT_TX_CCID:
519 return dccp_setsockopt_ccid(sk, optname, optval, optlen);
520 }
521
522 if (optlen < (int)sizeof(int))
514 return -EINVAL; 523 return -EINVAL;
515 524
516 if (get_user(val, (int __user *)optval)) 525 if (get_user(val, (int __user *)optval))
@@ -521,53 +530,38 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
521 530
522 lock_sock(sk); 531 lock_sock(sk);
523 switch (optname) { 532 switch (optname) {
524 case DCCP_SOCKOPT_PACKET_SIZE:
525 DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
526 err = 0;
527 break;
528 case DCCP_SOCKOPT_CHANGE_L:
529 if (optlen != sizeof(struct dccp_so_feat))
530 err = -EINVAL;
531 else
532 err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
533 (struct dccp_so_feat __user *)
534 optval);
535 break;
536 case DCCP_SOCKOPT_CHANGE_R:
537 if (optlen != sizeof(struct dccp_so_feat))
538 err = -EINVAL;
539 else
540 err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
541 (struct dccp_so_feat __user *)
542 optval);
543 break;
544 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 533 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
545 if (dp->dccps_role != DCCP_ROLE_SERVER) 534 if (dp->dccps_role != DCCP_ROLE_SERVER)
546 err = -EOPNOTSUPP; 535 err = -EOPNOTSUPP;
547 else 536 else
548 dp->dccps_server_timewait = (val != 0); 537 dp->dccps_server_timewait = (val != 0);
549 break; 538 break;
550 case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ 539 case DCCP_SOCKOPT_SEND_CSCOV:
551 if (val < 0 || val > 15) 540 err = dccp_setsockopt_cscov(sk, val, false);
541 break;
542 case DCCP_SOCKOPT_RECV_CSCOV:
543 err = dccp_setsockopt_cscov(sk, val, true);
544 break;
545 case DCCP_SOCKOPT_QPOLICY_ID:
546 if (sk->sk_state != DCCP_CLOSED)
547 err = -EISCONN;
548 else if (val < 0 || val >= DCCPQ_POLICY_MAX)
552 err = -EINVAL; 549 err = -EINVAL;
553 else 550 else
554 dp->dccps_pcslen = val; 551 dp->dccps_qpolicy = val;
555 break; 552 break;
556 case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ 553 case DCCP_SOCKOPT_QPOLICY_TXQLEN:
557 if (val < 0 || val > 15) 554 if (val < 0)
558 err = -EINVAL; 555 err = -EINVAL;
559 else { 556 else
560 dp->dccps_pcrlen = val; 557 dp->dccps_tx_qlen = val;
561 /* FIXME: add feature negotiation,
562 * ChangeL(MinimumChecksumCoverage, val) */
563 }
564 break; 558 break;
565 default: 559 default:
566 err = -ENOPROTOOPT; 560 err = -ENOPROTOOPT;
567 break; 561 break;
568 } 562 }
569
570 release_sock(sk); 563 release_sock(sk);
564
571 return err; 565 return err;
572} 566}
573 567
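
Since DCCP_SOCKOPT_QPOLICY_ID is rejected with -EISCONN outside the closed state, the dequeueing policy has to be chosen before connecting; the queue length may be adjusted at any time. A hedged fragment combining both options (constants assumed from this tree's <linux/dccp.h>):

#include <sys/socket.h>
#include <linux/dccp.h>

/* select the priority-based policy with a 16-packet TX queue bound;
 * must run while the socket is still closed */
static int use_prio_qpolicy(int fd)
{
	int policy = DCCPQ_POLICY_PRIO;
	int qlen   = 16;	/* example bound; 0 means unbounded */

	if (setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_ID,
		       &policy, sizeof(policy)) < 0)
		return -1;	/* EISCONN if already connected */
	return setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_TXQLEN,
			  &qlen, sizeof(qlen));
}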
@@ -648,6 +642,18 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
648 case DCCP_SOCKOPT_GET_CUR_MPS: 642 case DCCP_SOCKOPT_GET_CUR_MPS:
649 val = dp->dccps_mss_cache; 643 val = dp->dccps_mss_cache;
650 break; 644 break;
645 case DCCP_SOCKOPT_AVAILABLE_CCIDS:
646 return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
647 case DCCP_SOCKOPT_TX_CCID:
648 val = ccid_get_current_tx_ccid(dp);
649 if (val < 0)
650 return -ENOPROTOOPT;
651 break;
652 case DCCP_SOCKOPT_RX_CCID:
653 val = ccid_get_current_rx_ccid(dp);
654 if (val < 0)
655 return -ENOPROTOOPT;
656 break;
651 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 657 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
652 val = dp->dccps_server_timewait; 658 val = dp->dccps_server_timewait;
653 break; 659 break;
@@ -657,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
657 case DCCP_SOCKOPT_RECV_CSCOV: 663 case DCCP_SOCKOPT_RECV_CSCOV:
658 val = dp->dccps_pcrlen; 664 val = dp->dccps_pcrlen;
659 break; 665 break;
666 case DCCP_SOCKOPT_QPOLICY_ID:
667 val = dp->dccps_qpolicy;
668 break;
669 case DCCP_SOCKOPT_QPOLICY_TXQLEN:
670 val = dp->dccps_tx_qlen;
671 break;
660 case 128 ... 191: 672 case 128 ... 191:
661 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, 673 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
662 len, (u32 __user *)optval, optlen); 674 len, (u32 __user *)optval, optlen);
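
The new getsockopt branches make the negotiation result observable: after the handshake, DCCP_SOCKOPT_TX_CCID/RX_CCID report the CCIDs actually in use, while DCCP_SOCKOPT_AVAILABLE_CCIDS lists what the kernel was built with. A hedged query fragment (the 16-byte buffer is an assumption about the number of built-in CCIDs):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/dccp.h>

static void show_ccids(int fd)
{
	unsigned char avail[16];	/* assumed large enough */
	socklen_t len = sizeof(avail), i;
	int ccid;

	if (getsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_AVAILABLE_CCIDS,
		       avail, &len) == 0)
		for (i = 0; i < len; i++)
			printf("built-in CCID: %u\n", avail[i]);

	len = sizeof(ccid);
	if (getsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_TX_CCID,
		       &ccid, &len) == 0)
		printf("TX CCID in use: %d\n", ccid);
}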
@@ -699,6 +711,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
699EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); 711EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
700#endif 712#endif
701 713
714static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
715{
716 struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
717
718 /*
719 * Assign an (opaque) qpolicy priority value to skb->priority.
720 *
721 * We are overloading this skb field for use with the qpolicy subsystem.
722 * The skb->priority is normally used for the SO_PRIORITY option, which
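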
723 * is initialised from sk_priority. Since the assignment of sk_priority
724 * to skb->priority happens later (on layer 3), we overload this field
725 * for use with queueing priorities as long as the skb is on layer 4.
726 * The default priority value (if nothing is set) is 0.
727 */
728 skb->priority = 0;
729
730 for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
731
732 if (!CMSG_OK(msg, cmsg))
733 return -EINVAL;
734
735 if (cmsg->cmsg_level != SOL_DCCP)
736 continue;
737
738 if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
739 !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
740 return -EINVAL;
741
742 switch (cmsg->cmsg_type) {
743 case DCCP_SCM_PRIORITY:
744 if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
745 return -EINVAL;
746 skb->priority = *(__u32 *)CMSG_DATA(cmsg);
747 break;
748 default:
749 return -EINVAL;
750 }
751 }
752 return 0;
753}
754
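
dccp_msghdr_parse() above is the receiving half of the ancillary-data interface: it copies a 32-bit DCCP_SCM_PRIORITY value into skb->priority and rejects any qpolicy cmsg type that the active policy does not advertise in its params mask; the sender-side counterpart is sketched after the dccp_sendmsg() hunk below.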
702int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 755int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
703 size_t len) 756 size_t len)
704{ 757{
@@ -714,8 +767,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
714 767
715 lock_sock(sk); 768 lock_sock(sk);
716 769
717 if (sysctl_dccp_tx_qlen && 770 if (dccp_qpolicy_full(sk)) {
718 (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
719 rc = -EAGAIN; 771 rc = -EAGAIN;
720 goto out_release; 772 goto out_release;
721 } 773 }
@@ -743,8 +795,12 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
743 if (rc != 0) 795 if (rc != 0)
744 goto out_discard; 796 goto out_discard;
745 797
746 skb_queue_tail(&sk->sk_write_queue, skb); 798 rc = dccp_msghdr_parse(msg, skb);
747 dccp_write_xmit(sk,0); 799 if (rc != 0)
800 goto out_discard;
801
802 dccp_qpolicy_push(sk, skb);
803 dccp_write_xmit(sk);
748out_release: 804out_release:
749 release_sock(sk); 805 release_sock(sk);
750 return rc ? : len; 806 return rc ? : len;
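
dccp_sendmsg() now defers both admission control (dccp_qpolicy_full) and enqueueing (dccp_qpolicy_push) to the policy. The matching sender-side call for the priority interface, as a hedged sketch (requires the "prio" policy; under the "simple" policy the cmsg is refused with EINVAL):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/dccp.h>

/* send one datagram with an attached qpolicy priority */
static ssize_t dccp_send_prio(int fd, const void *data, size_t len,
			      uint32_t prio)
{
	struct iovec iov = { .iov_base = (void *)data, .iov_len = len };
	union {		/* union guarantees cmsghdr alignment */
		char raw[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;
	} cbuf;
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = cbuf.raw,
		.msg_controllen = sizeof(cbuf.raw),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_DCCP;
	cmsg->cmsg_type  = DCCP_SCM_PRIORITY;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

	return sendmsg(fd, &msg, 0);
}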
@@ -967,9 +1023,22 @@ void dccp_close(struct sock *sk, long timeout)
967 /* Check zero linger _after_ checking for unread data. */ 1023 /* Check zero linger _after_ checking for unread data. */
968 sk->sk_prot->disconnect(sk, 0); 1024 sk->sk_prot->disconnect(sk, 0);
969 } else if (sk->sk_state != DCCP_CLOSED) { 1025 } else if (sk->sk_state != DCCP_CLOSED) {
1026 /*
1027 * Normal connection termination. May need to wait if there are
1028 * still packets in the TX queue that are delayed by the CCID.
1029 */
1030 dccp_flush_write_queue(sk, &timeout);
970 dccp_terminate_connection(sk); 1031 dccp_terminate_connection(sk);
971 } 1032 }
972 1033
1034 /*
1035 * Flush write queue. This may be necessary in several cases:
1036 * - we have been closed by the peer but still have application data;
1037 * - abortive termination (unread data or zero linger time),
1038 * - normal termination but queue could not be flushed within time limit
1039 */
1040 __skb_queue_purge(&sk->sk_write_queue);
1041
973 sk_stream_wait_close(sk, timeout); 1042 sk_stream_wait_close(sk, timeout);
974 1043
975adjudge_to_death: 1044adjudge_to_death:
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 000000000000..27383f88c75f
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
1/*
2 * net/dccp/qpolicy.c
3 *
4 * Policy-based packet dequeueing interface for DCCP.
5 *
6 * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License v2
10 * as published by the Free Software Foundation.
11 */
12#include "dccp.h"
13
14/*
15 * Simple Dequeueing Policy:
16 * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
17 */
18static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
19{
20 skb_queue_tail(&sk->sk_write_queue, skb);
21}
22
23static bool qpolicy_simple_full(struct sock *sk)
24{
25 return dccp_sk(sk)->dccps_tx_qlen &&
26 sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
27}
28
29static struct sk_buff *qpolicy_simple_top(struct sock *sk)
30{
31 return skb_peek(&sk->sk_write_queue);
32}
33
34/*
35 * Priority-based Dequeueing Policy:
36 * If tx_qlen is different from 0 and the queue has reached its upper bound
37 * of tx_qlen elements, replace older packets lowest-priority-first.
38 */
39static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
40{
41 struct sk_buff *skb, *best = NULL;
42
43 skb_queue_walk(&sk->sk_write_queue, skb)
44 if (best == NULL || skb->priority > best->priority)
45 best = skb;
46 return best;
47}
48
49static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
50{
51 struct sk_buff *skb, *worst = NULL;
52
53 skb_queue_walk(&sk->sk_write_queue, skb)
54 if (worst == NULL || skb->priority < worst->priority)
55 worst = skb;
56 return worst;
57}
58
59static bool qpolicy_prio_full(struct sock *sk)
60{
61 if (qpolicy_simple_full(sk))
62 dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
63 return false;
64}
65
66/**
67 * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
68 * @push: add a new @skb to the write queue
69 * @full: indicates that no more packets will be admitted
70 * @top: peeks at whatever the queueing policy defines as its `top'
71 */
72static struct dccp_qpolicy_operations {
73 void (*push) (struct sock *sk, struct sk_buff *skb);
74 bool (*full) (struct sock *sk);
75 struct sk_buff* (*top) (struct sock *sk);
76 __be32 params;
77
78} qpol_table[DCCPQ_POLICY_MAX] = {
79 [DCCPQ_POLICY_SIMPLE] = {
80 .push = qpolicy_simple_push,
81 .full = qpolicy_simple_full,
82 .top = qpolicy_simple_top,
83 .params = 0,
84 },
85 [DCCPQ_POLICY_PRIO] = {
86 .push = qpolicy_simple_push,
87 .full = qpolicy_prio_full,
88 .top = qpolicy_prio_best_skb,
89 .params = DCCP_SCM_PRIORITY,
90 },
91};
92
93/*
94 * Externally visible interface
95 */
96void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
97{
98 qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
99}
100
101bool dccp_qpolicy_full(struct sock *sk)
102{
103 return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
104}
105
106void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
107{
108 if (skb != NULL) {
109 skb_unlink(skb, &sk->sk_write_queue);
110 kfree_skb(skb);
111 }
112}
113
114struct sk_buff *dccp_qpolicy_top(struct sock *sk)
115{
116 return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
117}
118
119struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
120{
121 struct sk_buff *skb = dccp_qpolicy_top(sk);
122
123 if (skb != NULL) {
124 /* Clear any skb fields that we used internally */
125 skb->priority = 0;
126 skb_unlink(skb, &sk->sk_write_queue);
127 }
128 return skb;
129}
130
131bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
132{
133 /* check if exactly one bit is set */
134 if (!param || (param & (param - 1)))
135 return false;
136 return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
137}
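
The qpol_table above is the entire extension point: a policy consists of push/full/top callbacks plus a mask of the cmsg types it accepts. As a purely hypothetical illustration (not part of this patch), a drop-newest variant could reuse the simple helpers like so, at the cost of a new DCCPQ_POLICY_* ID and a dccp.txt update:

/* Hypothetical policy: when the bound is hit, drop the newest packet
 * instead of the lowest-priority one. Illustration only. */
static bool qpolicy_drop_newest_full(struct sock *sk)
{
	if (qpolicy_simple_full(sk))
		dccp_qpolicy_drop(sk, skb_peek_tail(&sk->sk_write_queue));
	return false;
}

Like qpolicy_prio_full(), it reports "not full" after making room, so the caller in dccp_sendmsg() never sees EAGAIN.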
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 21295993fdb8..a5a1856234e7 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -18,76 +18,72 @@
18#error This file should not be compiled without CONFIG_SYSCTL defined 18#error This file should not be compiled without CONFIG_SYSCTL defined
19#endif 19#endif
20 20
21/* Boundary values */
22static int zero = 0,
23 u8_max = 0xFF;
24static unsigned long seqw_min = 32;
25
21static struct ctl_table dccp_default_table[] = { 26static struct ctl_table dccp_default_table[] = {
22 { 27 {
23 .procname = "seq_window", 28 .procname = "seq_window",
24 .data = &sysctl_dccp_feat_sequence_window, 29 .data = &sysctl_dccp_sequence_window,
25 .maxlen = sizeof(sysctl_dccp_feat_sequence_window), 30 .maxlen = sizeof(sysctl_dccp_sequence_window),
26 .mode = 0644, 31 .mode = 0644,
27 .proc_handler = proc_dointvec, 32 .proc_handler = proc_doulongvec_minmax,
33 .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
28 }, 34 },
29 { 35 {
30 .procname = "rx_ccid", 36 .procname = "rx_ccid",
31 .data = &sysctl_dccp_feat_rx_ccid, 37 .data = &sysctl_dccp_rx_ccid,
32 .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), 38 .maxlen = sizeof(sysctl_dccp_rx_ccid),
33 .mode = 0644, 39 .mode = 0644,
34 .proc_handler = proc_dointvec, 40 .proc_handler = proc_dointvec_minmax,
41 .extra1 = &zero,
42 .extra2 = &u8_max, /* RFC 4340, 10. */
35 }, 43 },
36 { 44 {
37 .procname = "tx_ccid", 45 .procname = "tx_ccid",
38 .data = &sysctl_dccp_feat_tx_ccid, 46 .data = &sysctl_dccp_tx_ccid,
39 .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), 47 .maxlen = sizeof(sysctl_dccp_tx_ccid),
40 .mode = 0644,
41 .proc_handler = proc_dointvec,
42 },
43 {
44 .procname = "ack_ratio",
45 .data = &sysctl_dccp_feat_ack_ratio,
46 .maxlen = sizeof(sysctl_dccp_feat_ack_ratio),
47 .mode = 0644,
48 .proc_handler = proc_dointvec,
49 },
50 {
51 .procname = "send_ackvec",
52 .data = &sysctl_dccp_feat_send_ack_vector,
53 .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector),
54 .mode = 0644,
55 .proc_handler = proc_dointvec,
56 },
57 {
58 .procname = "send_ndp",
59 .data = &sysctl_dccp_feat_send_ndp_count,
60 .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count),
61 .mode = 0644, 48 .mode = 0644,
62 .proc_handler = proc_dointvec, 49 .proc_handler = proc_dointvec_minmax,
50 .extra1 = &zero,
51 .extra2 = &u8_max, /* RFC 4340, 10. */
63 }, 52 },
64 { 53 {
65 .procname = "request_retries", 54 .procname = "request_retries",
66 .data = &sysctl_dccp_request_retries, 55 .data = &sysctl_dccp_request_retries,
67 .maxlen = sizeof(sysctl_dccp_request_retries), 56 .maxlen = sizeof(sysctl_dccp_request_retries),
68 .mode = 0644, 57 .mode = 0644,
69 .proc_handler = proc_dointvec, 58 .proc_handler = proc_dointvec_minmax,
59 .extra1 = &zero,
60 .extra2 = &u8_max,
70 }, 61 },
71 { 62 {
72 .procname = "retries1", 63 .procname = "retries1",
73 .data = &sysctl_dccp_retries1, 64 .data = &sysctl_dccp_retries1,
74 .maxlen = sizeof(sysctl_dccp_retries1), 65 .maxlen = sizeof(sysctl_dccp_retries1),
75 .mode = 0644, 66 .mode = 0644,
76 .proc_handler = proc_dointvec, 67 .proc_handler = proc_dointvec_minmax,
68 .extra1 = &zero,
69 .extra2 = &u8_max,
77 }, 70 },
78 { 71 {
79 .procname = "retries2", 72 .procname = "retries2",
80 .data = &sysctl_dccp_retries2, 73 .data = &sysctl_dccp_retries2,
81 .maxlen = sizeof(sysctl_dccp_retries2), 74 .maxlen = sizeof(sysctl_dccp_retries2),
82 .mode = 0644, 75 .mode = 0644,
83 .proc_handler = proc_dointvec, 76 .proc_handler = proc_dointvec_minmax,
77 .extra1 = &zero,
78 .extra2 = &u8_max,
84 }, 79 },
85 { 80 {
86 .procname = "tx_qlen", 81 .procname = "tx_qlen",
87 .data = &sysctl_dccp_tx_qlen, 82 .data = &sysctl_dccp_tx_qlen,
88 .maxlen = sizeof(sysctl_dccp_tx_qlen), 83 .maxlen = sizeof(sysctl_dccp_tx_qlen),
89 .mode = 0644, 84 .mode = 0644,
90 .proc_handler = proc_dointvec, 85 .proc_handler = proc_dointvec_minmax,
86 .extra1 = &zero,
91 }, 87 },
92 { 88 {
93 .procname = "sync_ratelimit", 89 .procname = "sync_ratelimit",
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 54b3c7e9e016..16359e29e7f5 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -87,17 +87,6 @@ static void dccp_retransmit_timer(struct sock *sk)
87{ 87{
88 struct inet_connection_sock *icsk = inet_csk(sk); 88 struct inet_connection_sock *icsk = inet_csk(sk);
89 89
90 /* retransmit timer is used for feature negotiation throughout
91 * connection. In this case, no packet is re-transmitted, but rather an
92 * ack is generated and pending changes are placed into its options.
93 */
94 if (sk->sk_send_head == NULL) {
95 dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
96 if (sk->sk_state == DCCP_OPEN)
97 dccp_send_ack(sk);
98 goto backoff;
99 }
100
101 /* 90 /*
102 * More than 4MSL (8 minutes) has passed, a RESET(aborted) was 91
103 * sent, no need to retransmit, this sock is dead. 92 * sent, no need to retransmit, this sock is dead.
@@ -126,7 +115,6 @@ static void dccp_retransmit_timer(struct sock *sk)
126 return; 115 return;
127 } 116 }
128 117
129backoff:
130 icsk->icsk_backoff++; 118 icsk->icsk_backoff++;
131 119
132 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); 120 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
@@ -249,32 +237,35 @@ out:
249 sock_put(sk); 237 sock_put(sk);
250} 238}
251 239
252/* Transmit-delay timer: used by the CCIDs to delay actual send time */ 240/**
253static void dccp_write_xmit_timer(unsigned long data) 241 * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
242 * See the comments above %ccid_dequeueing_decision for supported modes.
243 */
244static void dccp_write_xmitlet(unsigned long data)
254{ 245{
255 struct sock *sk = (struct sock *)data; 246 struct sock *sk = (struct sock *)data;
256 struct dccp_sock *dp = dccp_sk(sk);
257 247
258 bh_lock_sock(sk); 248 bh_lock_sock(sk);
259 if (sock_owned_by_user(sk)) 249 if (sock_owned_by_user(sk))
260 sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); 250 sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
261 else 251 else
262 dccp_write_xmit(sk, 0); 252 dccp_write_xmit(sk);
263 bh_unlock_sock(sk); 253 bh_unlock_sock(sk);
264 sock_put(sk);
265} 254}
266 255
267static void dccp_init_write_xmit_timer(struct sock *sk) 256static void dccp_write_xmit_timer(unsigned long data)
268{ 257{
269 struct dccp_sock *dp = dccp_sk(sk); 258 dccp_write_xmitlet(data);
270 259 sock_put((struct sock *)data);
271 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
272 (unsigned long)sk);
273} 260}
274 261
275void dccp_init_xmit_timers(struct sock *sk) 262void dccp_init_xmit_timers(struct sock *sk)
276{ 263{
277 dccp_init_write_xmit_timer(sk); 264 struct dccp_sock *dp = dccp_sk(sk);
265
266 tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
267 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
268 (unsigned long)sk);
278 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 269 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
279 &dccp_keepalive_timer); 270 &dccp_keepalive_timer);
280} 271}
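
The tasklet gives CCIDs a second, interrupt-safe entry into the dequeueing logic: the timer covers "wait t_ipi before the next packet" style delays, while the tasklet serves event-driven wakeups such as an incoming Ack opening the CCID-2 window. From CCID context, re-running the TX queue presumably amounts to a one-liner like this sketch:

/* sketch: kick the TX queue once the CCID is willing to send again,
 * e.g. after incoming Acks have been processed */
static inline void ccid_kick_tx_queue(struct sock *sk)
{
	tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
}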
@@ -290,8 +281,7 @@ u32 dccp_timestamp(void)
290{ 281{
291 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); 282 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
292 283
293 do_div(delta, 10); 284 return div_u64(delta, DCCP_TIME_RESOLUTION);
294 return delta;
295} 285}
296EXPORT_SYMBOL_GPL(dccp_timestamp); 286EXPORT_SYMBOL_GPL(dccp_timestamp);
297 287
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f79a51607292..9da9f19ece8a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk)
811 } 811 }
812} 812}
813 813
814/* Numbers are taken from RFC3390.
815 *
816 * John Heffner states:
817 *
818 * The RFC specifies a window of no more than 4380 bytes
819 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
820 * is a bit misleading because they use a clamp at 4380 bytes
821 * rather than use a multiplier in the relevant range.
822 */
823__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 814__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
824{ 815{
825 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 816 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
826 817
827 if (!cwnd) { 818 if (!cwnd)
828 if (tp->mss_cache > 1460) 819 cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
829 cwnd = 2;
830 else
831 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
832 }
833 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 820 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
834} 821}
835 822
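
The open-coded MSS thresholds are replaced by rfc3390_bytes_to_packets(), introduced elsewhere in this series (presumably in the include/net/tcp.h hunk) so that DCCP's CCID-2 can share it. RFC 3390 sets the initial window to min(4*MSS, max(2*MSS, 4380 bytes)), which reduces to the packet counts below; a sketch consistent with that rule:

/* RFC 3390 initial window in packets for a given MSS:
 *	MSS <= 1095 bytes -> 4 packets
 *	MSS <= 2190 bytes -> 3 packets
 *	larger MSS        -> 2 packets
 */
static inline u32 rfc3390_bytes_to_packets(const u32 bytes)
{
	return bytes <= 1095 ? 4 : (bytes > 2190 ? 2 : 3);
}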