aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2008-09-09 08:09:24 -0400
committerDavid S. Miller <davem@davemloft.net>2008-09-09 08:09:24 -0400
commitded67c0e2f0388458c36e8fd3b0c8be07c53db08 (patch)
treeccb684c18aa9bfd5659a50d65c97cf1f07da117a
parentf8ef6e44474037b1671bb913bc259e048a7d5548 (diff)
parent410e27a49bb98bc7fa3ff5fc05cc313817b9f253 (diff)
Merge branch 'master' of git://eden-feed.erg.abdn.ac.uk/net-next-2.6
-rw-r--r--Documentation/networking/dccp.txt54
-rw-r--r--include/linux/dccp.h122
-rw-r--r--include/net/tcp.h15
-rw-r--r--net/dccp/Kconfig3
-rw-r--r--net/dccp/Makefile5
-rw-r--r--net/dccp/ackvec.c619
-rw-r--r--net/dccp/ackvec.h204
-rw-r--r--net/dccp/ccid.c101
-rw-r--r--net/dccp/ccid.h113
-rw-r--r--net/dccp/ccids/Kconfig30
-rw-r--r--net/dccp/ccids/ccid2.c622
-rw-r--r--net/dccp/ccids/ccid2.h63
-rw-r--r--net/dccp/ccids/ccid3.c762
-rw-r--r--net/dccp/ccids/ccid3.h153
-rw-r--r--net/dccp/ccids/lib/loss_interval.c30
-rw-r--r--net/dccp/ccids/lib/loss_interval.h4
-rw-r--r--net/dccp/ccids/lib/packet_history.c282
-rw-r--r--net/dccp/ccids/lib/packet_history.h78
-rw-r--r--net/dccp/ccids/lib/tfrc.h16
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c29
-rw-r--r--net/dccp/dccp.h104
-rw-r--r--net/dccp/diag.c2
-rw-r--r--net/dccp/feat.c1805
-rw-r--r--net/dccp/feat.h144
-rw-r--r--net/dccp/input.c164
-rw-r--r--net/dccp/ipv4.c4
-rw-r--r--net/dccp/ipv6.c4
-rw-r--r--net/dccp/minisocks.c87
-rw-r--r--net/dccp/options.c341
-rw-r--r--net/dccp/output.c279
-rw-r--r--net/dccp/probe.c75
-rw-r--r--net/dccp/proto.c281
-rw-r--r--net/dccp/qpolicy.c137
-rw-r--r--net/dccp/sysctl.c64
-rw-r--r--net/dccp/timer.c42
-rw-r--r--net/ipv4/tcp_input.c17
36 files changed, 2884 insertions, 3971 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index fcfc12534428..39131a3c78f8 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -45,25 +45,6 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree
45 45
46Socket options 46Socket options
47============== 47==============
48DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes
49a policy ID as argument and can only be set before the connection (i.e. changes
50during an established connection are not supported). Currently, two policies are
51defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special,
52and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows passing a
53u32 priority value as ancillary data to sendmsg(), where higher numbers indicate
54a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to
55be formatted using a cmsg(3) message header filled in as follows:
56 cmsg->cmsg_level = SOL_DCCP;
57 cmsg->cmsg_type = DCCP_SCM_PRIORITY;
58 cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */
59
60DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero
61value is always interpreted as unbounded queue length. If different from zero,
62the interpretation of this parameter depends on the current dequeuing policy
63(see above): the "simple" policy will enforce a fixed queue size by returning
64EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the
65lowest-priority packet first. The default value for this parameter is
66initialised from /proc/sys/net/dccp/default/tx_qlen.
67 48
68DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of 49DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
69service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, 50service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
@@ -76,24 +57,6 @@ can be set before calling bind().
76DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet 57DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
77size (application payload size) in bytes, see RFC 4340, section 14. 58size (application payload size) in bytes, see RFC 4340, section 14.
78 59
79DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
80supported by the endpoint (see include/linux/dccp.h for symbolic constants).
81The caller needs to provide a sufficiently large (> 2) array of type uint8_t.
82
83DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same
84time, combining the operation of the next two socket options. This option is
85preferable over the latter two, since often applications will use the same
86type of CCID for both directions; and mixed use of CCIDs is not currently well
87understood. This socket option takes as argument at least one uint8_t value, or
88an array of uint8_t values, which must match available CCIDS (see above). CCIDs
89must be registered on the socket before calling connect() or listen().
90
91DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets
92the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID.
93Please note that the getsockopt argument type here is `int', not uint8_t.
94
95DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID.
96
97DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold 60DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
98timewait state when closing the connection (RFC 4340, 8.3). The usual case is 61timewait state when closing the connection (RFC 4340, 8.3). The usual case is
99that the closing server sends a CloseReq, whereupon the client holds timewait 62that the closing server sends a CloseReq, whereupon the client holds timewait
@@ -152,16 +115,23 @@ retries2
152 importance for retransmitted acknowledgments and feature negotiation, 115 importance for retransmitted acknowledgments and feature negotiation,
153 data packets are never retransmitted. Analogue of tcp_retries2. 116 data packets are never retransmitted. Analogue of tcp_retries2.
154 117
118send_ndp = 1
119 Whether or not to send NDP count options (sec. 7.7.2).
120
121send_ackvec = 1
122 Whether or not to send Ack Vector options (sec. 11.5).
123
124ack_ratio = 2
125 The default Ack Ratio (sec. 11.3) to use.
126
155tx_ccid = 2 127tx_ccid = 2
156 Default CCID for the sender-receiver half-connection. Depending on the 128 Default CCID for the sender-receiver half-connection.
157 choice of CCID, the Send Ack Vector feature is enabled automatically.
158 129
159rx_ccid = 2 130rx_ccid = 2
160 Default CCID for the receiver-sender half-connection; see tx_ccid. 131 Default CCID for the receiver-sender half-connection.
161 132
162seq_window = 100 133seq_window = 100
163 The initial sequence window (sec. 7.5.2) of the sender. This influences 134 The initial sequence window (sec. 7.5.2).
164 the local ackno validity and the remote seqno validity windows (7.5.1).
165 135
166tx_qlen = 5 136tx_qlen = 5
167 The size of the transmit buffer in packets. A value of 0 corresponds 137 The size of the transmit buffer in packets. A value of 0 corresponds
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 010e2d87ed75..6080449fbec9 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -165,13 +165,9 @@ enum {
165 DCCPO_TIMESTAMP_ECHO = 42, 165 DCCPO_TIMESTAMP_ECHO = 42,
166 DCCPO_ELAPSED_TIME = 43, 166 DCCPO_ELAPSED_TIME = 43,
167 DCCPO_MAX = 45, 167 DCCPO_MAX = 45,
168 DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ 168 DCCPO_MIN_CCID_SPECIFIC = 128,
169 DCCPO_MAX_RX_CCID_SPECIFIC = 191, 169 DCCPO_MAX_CCID_SPECIFIC = 255,
170 DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */
171 DCCPO_MAX_TX_CCID_SPECIFIC = 255,
172}; 170};
173/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */
174#define DCCP_SINGLE_OPT_MAXLEN 253
175 171
176/* DCCP CCIDS */ 172/* DCCP CCIDS */
177enum { 173enum {
@@ -180,36 +176,27 @@ enum {
180}; 176};
181 177
182/* DCCP features (RFC 4340 section 6.4) */ 178/* DCCP features (RFC 4340 section 6.4) */
183enum dccp_feature_numbers { 179enum {
184 DCCPF_RESERVED = 0, 180 DCCPF_RESERVED = 0,
185 DCCPF_CCID = 1, 181 DCCPF_CCID = 1,
186 DCCPF_SHORT_SEQNOS = 2, 182 DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */
187 DCCPF_SEQUENCE_WINDOW = 3, 183 DCCPF_SEQUENCE_WINDOW = 3,
188 DCCPF_ECN_INCAPABLE = 4, 184 DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */
189 DCCPF_ACK_RATIO = 5, 185 DCCPF_ACK_RATIO = 5,
190 DCCPF_SEND_ACK_VECTOR = 6, 186 DCCPF_SEND_ACK_VECTOR = 6,
191 DCCPF_SEND_NDP_COUNT = 7, 187 DCCPF_SEND_NDP_COUNT = 7,
192 DCCPF_MIN_CSUM_COVER = 8, 188 DCCPF_MIN_CSUM_COVER = 8,
193 DCCPF_DATA_CHECKSUM = 9, 189 DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */
194 /* 10-127 reserved */ 190 /* 10-127 reserved */
195 DCCPF_MIN_CCID_SPECIFIC = 128, 191 DCCPF_MIN_CCID_SPECIFIC = 128,
196 DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */
197 DCCPF_MAX_CCID_SPECIFIC = 255, 192 DCCPF_MAX_CCID_SPECIFIC = 255,
198}; 193};
199 194
200/* DCCP socket control message types for cmsg */ 195/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */
201enum dccp_cmsg_type { 196struct dccp_so_feat {
202 DCCP_SCM_PRIORITY = 1, 197 __u8 dccpsf_feat;
203 DCCP_SCM_QPOLICY_MAX = 0xFFFF, 198 __u8 __user *dccpsf_val;
204 /* ^-- Up to here reserved exclusively for qpolicy parameters */ 199 __u8 dccpsf_len;
205 DCCP_SCM_MAX
206};
207
208/* DCCP priorities for outgoing/queued packets */
209enum dccp_packet_dequeueing_policy {
210 DCCPQ_POLICY_SIMPLE,
211 DCCPQ_POLICY_PRIO,
212 DCCPQ_POLICY_MAX
213}; 200};
214 201
215/* DCCP socket options */ 202/* DCCP socket options */
@@ -221,12 +208,6 @@ enum dccp_packet_dequeueing_policy {
221#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 208#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6
222#define DCCP_SOCKOPT_SEND_CSCOV 10 209#define DCCP_SOCKOPT_SEND_CSCOV 10
223#define DCCP_SOCKOPT_RECV_CSCOV 11 210#define DCCP_SOCKOPT_RECV_CSCOV 11
224#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12
225#define DCCP_SOCKOPT_CCID 13
226#define DCCP_SOCKOPT_TX_CCID 14
227#define DCCP_SOCKOPT_RX_CCID 15
228#define DCCP_SOCKOPT_QPOLICY_ID 16
229#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17
230#define DCCP_SOCKOPT_CCID_RX_INFO 128 211#define DCCP_SOCKOPT_CCID_RX_INFO 128
231#define DCCP_SOCKOPT_CCID_TX_INFO 192 212#define DCCP_SOCKOPT_CCID_TX_INFO 192
232 213
@@ -374,13 +355,62 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
374 return __dccp_hdr_len(dccp_hdr(skb)); 355 return __dccp_hdr_len(dccp_hdr(skb));
375} 356}
376 357
358
359/* initial values for each feature */
360#define DCCPF_INITIAL_SEQUENCE_WINDOW 100
361#define DCCPF_INITIAL_ACK_RATIO 2
362#define DCCPF_INITIAL_CCID DCCPC_CCID2
363#define DCCPF_INITIAL_SEND_ACK_VECTOR 1
364/* FIXME: for now we default to 1 but it should really be 0 */
365#define DCCPF_INITIAL_SEND_NDP_COUNT 1
366
367/**
368 * struct dccp_minisock - Minimal DCCP connection representation
369 *
370 * Will be used to pass the state from dccp_request_sock to dccp_sock.
371 *
372 * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
373 * @dccpms_rx_ccid, @dccpms_tx_ccid - Congestion Control Ids (CCIDs) (section 10)
374 * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
375 * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
376 * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3)
377 * @dccpms_pending - List of features being negotiated
378 * @dccpms_conf -
379 */
380struct dccp_minisock {
381 __u64 dccpms_sequence_window;
382 __u8 dccpms_rx_ccid;
383 __u8 dccpms_tx_ccid;
384 __u8 dccpms_send_ack_vector;
385 __u8 dccpms_send_ndp_count;
386 __u8 dccpms_ack_ratio;
387 struct list_head dccpms_pending;
388 struct list_head dccpms_conf;
389};
390
391struct dccp_opt_conf {
392 __u8 *dccpoc_val;
393 __u8 dccpoc_len;
394};
395
396struct dccp_opt_pend {
397 struct list_head dccpop_node;
398 __u8 dccpop_type;
399 __u8 dccpop_feat;
400 __u8 *dccpop_val;
401 __u8 dccpop_len;
402 int dccpop_conf;
403 struct dccp_opt_conf *dccpop_sc;
404};
405
406extern void dccp_minisock_init(struct dccp_minisock *dmsk);
407
377/** 408/**
378 * struct dccp_request_sock - represent DCCP-specific connection request 409 * struct dccp_request_sock - represent DCCP-specific connection request
379 * @dreq_inet_rsk: structure inherited from 410 * @dreq_inet_rsk: structure inherited from
380 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) 411 * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1)
381 * @dreq_isr: initial sequence number received on the Request 412 * @dreq_isr: initial sequence number received on the Request
382 * @dreq_service: service code present on the Request (there is just one) 413 * @dreq_service: service code present on the Request (there is just one)
383 * @dreq_featneg: feature negotiation options for this connection
384 * The following two fields are analogous to the ones in dccp_sock: 414 * The following two fields are analogous to the ones in dccp_sock:
385 * @dreq_timestamp_echo: last received timestamp to echo (13.1) 415 * @dreq_timestamp_echo: last received timestamp to echo (13.1)
386 * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo 416 * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo
@@ -390,7 +420,6 @@ struct dccp_request_sock {
390 __u64 dreq_iss; 420 __u64 dreq_iss;
391 __u64 dreq_isr; 421 __u64 dreq_isr;
392 __be32 dreq_service; 422 __be32 dreq_service;
393 struct list_head dreq_featneg;
394 __u32 dreq_timestamp_echo; 423 __u32 dreq_timestamp_echo;
395 __u32 dreq_timestamp_time; 424 __u32 dreq_timestamp_time;
396}; 425};
@@ -462,28 +491,21 @@ struct dccp_ackvec;
462 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo 491 * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo
463 * @dccps_l_ack_ratio - feature-local Ack Ratio 492 * @dccps_l_ack_ratio - feature-local Ack Ratio
464 * @dccps_r_ack_ratio - feature-remote Ack Ratio 493 * @dccps_r_ack_ratio - feature-remote Ack Ratio
465 * @dccps_l_seq_win - local Sequence Window (influences ack number validity)
466 * @dccps_r_seq_win - remote Sequence Window (influences seq number validity)
467 * @dccps_pcslen - sender partial checksum coverage (via sockopt) 494 * @dccps_pcslen - sender partial checksum coverage (via sockopt)
468 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) 495 * @dccps_pcrlen - receiver partial checksum coverage (via sockopt)
469 * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2)
470 * @dccps_ndp_count - number of Non Data Packets since last data packet 496 * @dccps_ndp_count - number of Non Data Packets since last data packet
471 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) 497 * @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
472 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) 498 * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
473 * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) 499 * @dccps_minisock - associated minisock (accessed via dccp_msk)
474 * @dccps_hc_rx_ackvec - rx half connection ack vector 500 * @dccps_hc_rx_ackvec - rx half connection ack vector
475 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) 501 * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection)
476 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) 502 * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection)
477 * @dccps_options_received - parsed set of retrieved options 503 * @dccps_options_received - parsed set of retrieved options
478 * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy
479 * @dccps_tx_qlen - maximum length of the TX queue
480 * @dccps_role - role of this sock, one of %dccp_role 504 * @dccps_role - role of this sock, one of %dccp_role
481 * @dccps_hc_rx_insert_options - receiver wants to add options when acking 505 * @dccps_hc_rx_insert_options - receiver wants to add options when acking
482 * @dccps_hc_tx_insert_options - sender wants to add options when sending 506 * @dccps_hc_tx_insert_options - sender wants to add options when sending
483 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) 507 * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3)
484 * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" 508 * @dccps_xmit_timer - timer for when CCID is not ready to send
485 * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets
486 * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing)
487 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) 509 * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs)
488 */ 510 */
489struct dccp_sock { 511struct dccp_sock {
@@ -507,26 +529,19 @@ struct dccp_sock {
507 __u32 dccps_timestamp_time; 529 __u32 dccps_timestamp_time;
508 __u16 dccps_l_ack_ratio; 530 __u16 dccps_l_ack_ratio;
509 __u16 dccps_r_ack_ratio; 531 __u16 dccps_r_ack_ratio;
510 __u64 dccps_l_seq_win:48; 532 __u16 dccps_pcslen;
511 __u64 dccps_r_seq_win:48; 533 __u16 dccps_pcrlen;
512 __u8 dccps_pcslen:4;
513 __u8 dccps_pcrlen:4;
514 __u8 dccps_send_ndp_count:1;
515 __u64 dccps_ndp_count:48; 534 __u64 dccps_ndp_count:48;
516 unsigned long dccps_rate_last; 535 unsigned long dccps_rate_last;
517 struct list_head dccps_featneg; 536 struct dccp_minisock dccps_minisock;
518 struct dccp_ackvec *dccps_hc_rx_ackvec; 537 struct dccp_ackvec *dccps_hc_rx_ackvec;
519 struct ccid *dccps_hc_rx_ccid; 538 struct ccid *dccps_hc_rx_ccid;
520 struct ccid *dccps_hc_tx_ccid; 539 struct ccid *dccps_hc_tx_ccid;
521 struct dccp_options_received dccps_options_received; 540 struct dccp_options_received dccps_options_received;
522 __u8 dccps_qpolicy;
523 __u32 dccps_tx_qlen;
524 enum dccp_role dccps_role:2; 541 enum dccp_role dccps_role:2;
525 __u8 dccps_hc_rx_insert_options:1; 542 __u8 dccps_hc_rx_insert_options:1;
526 __u8 dccps_hc_tx_insert_options:1; 543 __u8 dccps_hc_tx_insert_options:1;
527 __u8 dccps_server_timewait:1; 544 __u8 dccps_server_timewait:1;
528 __u8 dccps_sync_scheduled:1;
529 struct tasklet_struct dccps_xmitlet;
530 struct timer_list dccps_xmit_timer; 545 struct timer_list dccps_xmit_timer;
531}; 546};
532 547
@@ -535,6 +550,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk)
535 return (struct dccp_sock *)sk; 550 return (struct dccp_sock *)sk;
536} 551}
537 552
553static inline struct dccp_minisock *dccp_msk(const struct sock *sk)
554{
555 return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock;
556}
557
538static inline const char *dccp_role(const struct sock *sk) 558static inline const char *dccp_role(const struct sock *sk)
539{ 559{
540 switch (dccp_sk(sk)->dccps_role) { 560 switch (dccp_sk(sk)->dccps_role) {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6bc4b8148ca0..8983386356a5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -782,21 +782,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
782/* Use define here intentionally to get WARN_ON location shown at the caller */ 782/* Use define here intentionally to get WARN_ON location shown at the caller */
783#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) 783#define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
784 784
785/*
786 * Convert RFC3390 larger initial windows into an equivalent number of packets.
787 *
788 * John Heffner states:
789 *
790 * The RFC specifies a window of no more than 4380 bytes
791 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
792 * is a bit misleading because they use a clamp at 4380 bytes
793 * rather than a multiplier in the relevant range.
794 */
795static inline u32 rfc3390_bytes_to_packets(const u32 bytes)
796{
797 return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3);
798}
799
800extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); 785extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
801extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); 786extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
802 787
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 206c16ad9c3c..7aa2a7acc7ec 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -25,6 +25,9 @@ config INET_DCCP_DIAG
25 def_tristate y if (IP_DCCP = y && INET_DIAG = y) 25 def_tristate y if (IP_DCCP = y && INET_DIAG = y)
26 def_tristate m 26 def_tristate m
27 27
28config IP_DCCP_ACKVEC
29 bool
30
28source "net/dccp/ccids/Kconfig" 31source "net/dccp/ccids/Kconfig"
29 32
30menu "DCCP Kernel Hacking" 33menu "DCCP Kernel Hacking"
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 0c1c9af2bf7e..f4f8793aafff 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,6 @@
1obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o 1obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
2 2
3dccp-y := ccid.o feat.o input.o minisocks.o options.o \ 3dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
4 qpolicy.o output.o proto.o timer.o ackvec.o
5 4
6dccp_ipv4-y := ipv4.o 5dccp_ipv4-y := ipv4.o
7 6
@@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o
9obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o 8obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
10dccp_ipv6-y := ipv6.o 9dccp_ipv6-y := ipv6.o
11 10
11dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
12
12obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o 13obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
13obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o 14obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
14 15
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 41819848bdda..1e8be246ad15 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,375 +1,445 @@
1/* 1/*
2 * net/dccp/ackvec.c 2 * net/dccp/ackvec.c
3 * 3 *
4 * An implementation of Ack Vectors for the DCCP protocol 4 * An implementation of the DCCP protocol
5 * Copyright (c) 2007 University of Aberdeen, Scotland, UK
6 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> 5 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
7 * 6 *
8 * This program is free software; you can redistribute it and/or modify it 7 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the 8 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; version 2 of the License; 9 * Free Software Foundation; version 2 of the License;
11 */ 10 */
11
12#include "ackvec.h"
12#include "dccp.h" 13#include "dccp.h"
14
15#include <linux/dccp.h>
16#include <linux/init.h>
17#include <linux/errno.h>
13#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/skbuff.h>
14#include <linux/slab.h> 20#include <linux/slab.h>
15 21
22#include <net/sock.h>
23
16static struct kmem_cache *dccp_ackvec_slab; 24static struct kmem_cache *dccp_ackvec_slab;
17static struct kmem_cache *dccp_ackvec_record_slab; 25static struct kmem_cache *dccp_ackvec_record_slab;
18 26
19struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) 27static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
20{ 28{
21 struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); 29 struct dccp_ackvec_record *avr =
30 kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
22 31
23 if (av != NULL) { 32 if (avr != NULL)
24 av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; 33 INIT_LIST_HEAD(&avr->avr_node);
25 INIT_LIST_HEAD(&av->av_records); 34
26 } 35 return avr;
27 return av;
28} 36}
29 37
30static void dccp_ackvec_purge_records(struct dccp_ackvec *av) 38static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
31{ 39{
32 struct dccp_ackvec_record *cur, *next; 40 if (unlikely(avr == NULL))
33 41 return;
34 list_for_each_entry_safe(cur, next, &av->av_records, avr_node) 42 /* Check if deleting a linked record */
35 kmem_cache_free(dccp_ackvec_record_slab, cur); 43 WARN_ON(!list_empty(&avr->avr_node));
36 INIT_LIST_HEAD(&av->av_records); 44 kmem_cache_free(dccp_ackvec_record_slab, avr);
37} 45}
38 46
39void dccp_ackvec_free(struct dccp_ackvec *av) 47static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
48 struct dccp_ackvec_record *avr)
40{ 49{
41 if (likely(av != NULL)) { 50 /*
42 dccp_ackvec_purge_records(av); 51 * AVRs are sorted by seqno. Since we are sending them in order, we
43 kmem_cache_free(dccp_ackvec_slab, av); 52 * just add the AVR at the head of the list.
53 * -sorbo.
54 */
55 if (!list_empty(&av->av_records)) {
56 const struct dccp_ackvec_record *head =
57 list_entry(av->av_records.next,
58 struct dccp_ackvec_record,
59 avr_node);
60 BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
44 } 61 }
62
63 list_add(&avr->avr_node, &av->av_records);
45} 64}
46 65
47/** 66int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
48 * dccp_ackvec_update_records - Record information about sent Ack Vectors
49 * @av: Ack Vector records to update
50 * @seqno: Sequence number of the packet carrying the Ack Vector just sent
51 * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
52 */
53int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
54{ 67{
68 struct dccp_sock *dp = dccp_sk(sk);
69 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
70 /* Figure out how many options do we need to represent the ackvec */
71 const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
72 u16 len = av->av_vec_len + 2 * nr_opts, i;
73 u32 elapsed_time;
74 const unsigned char *tail, *from;
75 unsigned char *to;
55 struct dccp_ackvec_record *avr; 76 struct dccp_ackvec_record *avr;
77 suseconds_t delta;
78
79 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
80 return -1;
81
82 delta = ktime_us_delta(ktime_get_real(), av->av_time);
83 elapsed_time = delta / 10;
56 84
57 avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); 85 if (elapsed_time != 0 &&
86 dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
87 return -1;
88
89 avr = dccp_ackvec_record_new();
58 if (avr == NULL) 90 if (avr == NULL)
59 return -ENOBUFS; 91 return -1;
92
93 DCCP_SKB_CB(skb)->dccpd_opt_len += len;
94
95 to = skb_push(skb, len);
96 len = av->av_vec_len;
97 from = av->av_buf + av->av_buf_head;
98 tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
99
100 for (i = 0; i < nr_opts; ++i) {
101 int copylen = len;
102
103 if (len > DCCP_MAX_ACKVEC_OPT_LEN)
104 copylen = DCCP_MAX_ACKVEC_OPT_LEN;
105
106 *to++ = DCCPO_ACK_VECTOR_0;
107 *to++ = copylen + 2;
108
109 /* Check if buf_head wraps */
110 if (from + copylen > tail) {
111 const u16 tailsize = tail - from;
112
113 memcpy(to, from, tailsize);
114 to += tailsize;
115 len -= tailsize;
116 copylen -= tailsize;
117 from = av->av_buf;
118 }
119
120 memcpy(to, from, copylen);
121 from += copylen;
122 to += copylen;
123 len -= copylen;
124 }
60 125
61 avr->avr_ack_seqno = seqno;
62 avr->avr_ack_ptr = av->av_buf_head;
63 avr->avr_ack_ackno = av->av_buf_ackno;
64 avr->avr_ack_nonce = nonce_sum;
65 avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
66 /*
67 * When the buffer overflows, we keep no more than one record. This is
68 * the simplest way of disambiguating sender-Acks dating from before the
69 * overflow from sender-Acks which refer to after the overflow; a simple
70 * solution is preferable here since we are handling an exception.
71 */
72 if (av->av_overflow)
73 dccp_ackvec_purge_records(av);
74 /* 126 /*
75 * Since GSS is incremented for each packet, the list is automatically 127 * From RFC 4340, A.2:
76 * arranged in descending order of @ack_seqno. 128 *
129 * For each acknowledgement it sends, the HC-Receiver will add an
130 * acknowledgement record. ack_seqno will equal the HC-Receiver
131 * sequence number it used for the ack packet; ack_ptr will equal
132 * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
133 * equal buf_nonce.
77 */ 134 */
78 list_add(&avr->avr_node, &av->av_records); 135 avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
136 avr->avr_ack_ptr = av->av_buf_head;
137 avr->avr_ack_ackno = av->av_buf_ackno;
138 avr->avr_ack_nonce = av->av_buf_nonce;
139 avr->avr_sent_len = av->av_vec_len;
79 140
80 dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", 141 dccp_ackvec_insert_avr(av, avr);
142
143 dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
144 "ack_ackno=%llu\n",
145 dccp_role(sk), avr->avr_sent_len,
81 (unsigned long long)avr->avr_ack_seqno, 146 (unsigned long long)avr->avr_ack_seqno,
82 (unsigned long long)avr->avr_ack_ackno, 147 (unsigned long long)avr->avr_ack_ackno);
83 avr->avr_ack_runlen);
84 return 0; 148 return 0;
85} 149}
86 150
87static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, 151struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
88 const u64 ackno)
89{ 152{
90 struct dccp_ackvec_record *avr; 153 struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
91 /* 154
92 * Exploit that records are inserted in descending order of sequence 155 if (av != NULL) {
93 * number, start with the oldest record first. If @ackno is `before' 156 av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1;
94 * the earliest ack_ackno, the packet is too old to be considered. 157 av->av_buf_ackno = UINT48_MAX + 1;
95 */ 158 av->av_buf_nonce = 0;
96 list_for_each_entry_reverse(avr, av_list, avr_node) { 159 av->av_time = ktime_set(0, 0);
97 if (avr->avr_ack_seqno == ackno) 160 av->av_vec_len = 0;
98 return avr; 161 INIT_LIST_HEAD(&av->av_records);
99 if (before48(ackno, avr->avr_ack_seqno))
100 break;
101 } 162 }
102 return NULL; 163
164 return av;
103} 165}
104 166
105/* 167void dccp_ackvec_free(struct dccp_ackvec *av)
106 * Buffer index and length computation using modulo-buffersize arithmetic.
107 * Note that, as pointers move from right to left, head is `before' tail.
108 */
109static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
110{ 168{
111 return (a + b) % DCCPAV_MAX_ACKVEC_LEN; 169 if (unlikely(av == NULL))
170 return;
171
172 if (!list_empty(&av->av_records)) {
173 struct dccp_ackvec_record *avr, *next;
174
175 list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
176 list_del_init(&avr->avr_node);
177 dccp_ackvec_record_delete(avr);
178 }
179 }
180
181 kmem_cache_free(dccp_ackvec_slab, av);
112} 182}
113 183
114static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) 184static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
185 const u32 index)
115{ 186{
116 return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); 187 return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
117} 188}
118 189
119u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) 190static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
191 const u32 index)
120{ 192{
121 if (unlikely(av->av_overflow)) 193 return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
122 return DCCPAV_MAX_ACKVEC_LEN;
123 return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
124} 194}
125 195
126/** 196/*
127 * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 197 * If several packets are missing, the HC-Receiver may prefer to enter multiple
128 * @av: non-empty buffer to update 198 * bytes with run length 0, rather than a single byte with a larger run length;
129 * @distance: negative or zero distance of @seqno from buf_ackno downward 199 * this simplifies table updates if one of the missing packets arrives.
130 * @seqno: the (old) sequence number whose record is to be updated
131 * @state: state in which packet carrying @seqno was received
132 */ 200 */
133static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, 201static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
134 u64 seqno, enum dccp_ackvec_states state) 202 const unsigned int packets,
203 const unsigned char state)
135{ 204{
136 u16 ptr = av->av_buf_head; 205 unsigned int gap;
206 long new_head;
137 207
138 BUG_ON(distance > 0); 208 if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
139 if (unlikely(dccp_ackvec_is_empty(av))) 209 return -ENOBUFS;
140 return;
141 210
142 do { 211 gap = packets - 1;
143 u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); 212 new_head = av->av_buf_head - packets;
144 213
145 if (distance + runlen >= 0) { 214 if (new_head < 0) {
146 /* 215 if (gap > 0) {
147 * Only update the state if packet has not been received 216 memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
148 * yet. This is OK as per the second table in RFC 4340, 217 gap + new_head + 1);
149 * 11.4.1; i.e. here we are using the following table: 218 gap = -new_head;
150 * RECEIVED
151 * 0 1 3
152 * S +---+---+---+
153 * T 0 | 0 | 0 | 0 |
154 * O +---+---+---+
155 * R 1 | 1 | 1 | 1 |
156 * E +---+---+---+
157 * D 3 | 0 | 1 | 3 |
158 * +---+---+---+
159 * The "Not Received" state was set by reserve_seats().
160 */
161 if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
162 av->av_buf[ptr] = state;
163 else
164 dccp_pr_debug("Not changing %llu state to %u\n",
165 (unsigned long long)seqno, state);
166 break;
167 } 219 }
220 new_head += DCCP_MAX_ACKVEC_LEN;
221 }
168 222
169 distance += runlen + 1; 223 av->av_buf_head = new_head;
170 ptr = __ackvec_idx_add(ptr, 1);
171 224
172 } while (ptr != av->av_buf_tail); 225 if (gap > 0)
173} 226 memset(av->av_buf + av->av_buf_head + 1,
227 DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
174 228
175/* Mark @num entries after buf_head as "Not yet received". */ 229 av->av_buf[av->av_buf_head] = state;
176static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) 230 av->av_vec_len += packets;
177{ 231 return 0;
178 u16 start = __ackvec_idx_add(av->av_buf_head, 1),
179 len = DCCPAV_MAX_ACKVEC_LEN - start;
180
181 /* check for buffer wrap-around */
182 if (num > len) {
183 memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
184 start = 0;
185 num -= len;
186 }
187 if (num)
188 memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
189} 232}
190 233
191/** 234/*
192 * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer 235 * Implements the RFC 4340, Appendix A
193 * @av: container of buffer to update (can be empty or non-empty)
194 * @num_packets: number of packets to register (must be >= 1)
195 * @seqno: sequence number of the first packet in @num_packets
196 * @state: state in which packet carrying @seqno was received
197 */ 236 */
198static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, 237int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
199 u64 seqno, enum dccp_ackvec_states state) 238 const u64 ackno, const u8 state)
200{ 239{
201 u32 num_cells = num_packets; 240 /*
241 * Check at the right places if the buffer is full, if it is, tell the
242 * caller to start dropping packets till the HC-Sender acks our ACK
243 * vectors, when we will free up space in av_buf.
244 *
245 * We may well decide to do buffer compression, etc, but for now lets
246 * just drop.
247 *
248 * From Appendix A.1.1 (`New Packets'):
249 *
250 * Of course, the circular buffer may overflow, either when the
251 * HC-Sender is sending data at a very high rate, when the
252 * HC-Receiver's acknowledgements are not reaching the HC-Sender,
253 * or when the HC-Sender is forgetting to acknowledge those acks
254 * (so the HC-Receiver is unable to clean up old state). In this
255 * case, the HC-Receiver should either compress the buffer (by
256 * increasing run lengths when possible), transfer its state to
257 * a larger buffer, or, as a last resort, drop all received
258 * packets, without processing them whatsoever, until its buffer
259 * shrinks again.
260 */
202 261
203 if (num_packets > DCCPAV_BURST_THRESH) { 262 /* See if this is the first ackno being inserted */
204 u32 lost_packets = num_packets - 1; 263 if (av->av_vec_len == 0) {
264 av->av_buf[av->av_buf_head] = state;
265 av->av_vec_len = 1;
266 } else if (after48(ackno, av->av_buf_ackno)) {
267 const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
205 268
206 DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
207 /* 269 /*
208 * We received 1 packet and have a loss of size "num_packets-1" 270 * Look if the state of this packet is the same as the
209 * which we squeeze into num_cells-1 rather than reserving an 271 * previous ackno and if so if we can bump the head len.
210 * entire byte for each lost packet.
211 * The reason is that the vector grows in O(burst_length); when
212 * it grows too large there will be no room left for the payload. 253 * or when the HC-Sender is forgetting to acknowledge those acks
213 * This is a trade-off: if a few packets out of the burst show
214 * up later, their state will not be changed; it is simply too
215 * costly to reshuffle/reallocate/copy the buffer each time.
216 * Should such problems persist, we will need to switch to a
217 * different underlying data structure.
218 */ 272 */
219 for (num_packets = num_cells = 1; lost_packets; ++num_cells) { 273 if (delta == 1 &&
220 u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); 274 dccp_ackvec_state(av, av->av_buf_head) == state &&
221 275 dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
222 av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); 276 av->av_buf[av->av_buf_head]++;
223 av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; 277 else if (dccp_ackvec_set_buf_head_state(av, delta, state))
278 return -ENOBUFS;
279 } else {
280 /*
281 * A.1.2. Old Packets
282 *
283 * When a packet with Sequence Number S <= buf_ackno
284 * arrives, the HC-Receiver will scan the table for
285 * the byte corresponding to S. (Indexing structures
286 * could reduce the complexity of this scan.)
287 */
288 u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
289 u32 index = av->av_buf_head;
224 290
225 lost_packets -= len; 291 while (1) {
292 const u8 len = dccp_ackvec_len(av, index);
293 const u8 av_state = dccp_ackvec_state(av, index);
294 /*
295 * valid packets not yet in av_buf have a reserved
296 * entry, with a len equal to 0.
297 */
298 if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
299 len == 0 && delta == 0) { /* Found our
300 reserved seat! */
301 dccp_pr_debug("Found %llu reserved seat!\n",
302 (unsigned long long)ackno);
303 av->av_buf[index] = state;
304 goto out;
305 }
306 /* len == 0 means one packet */
307 if (delta < len + 1)
308 goto out_duplicate;
309
310 delta -= len + 1;
311 if (++index == DCCP_MAX_ACKVEC_LEN)
312 index = 0;
226 } 313 }
227 } 314 }
228 315
229 if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { 316 av->av_buf_ackno = ackno;
230 DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); 317 av->av_time = ktime_get_real();
231 av->av_overflow = true; 318out:
232 } 319 return 0;
233
234 av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
235 if (av->av_overflow)
236 av->av_buf_tail = av->av_buf_head;
237
238 av->av_buf[av->av_buf_head] = state;
239 av->av_buf_ackno = seqno;
240 320
241 if (num_packets > 1) 321out_duplicate:
242 dccp_ackvec_reserve_seats(av, num_packets - 1); 322 /* Duplicate packet */
323 dccp_pr_debug("Received a dup or already considered lost "
324 "packet: %llu\n", (unsigned long long)ackno);
325 return -EILSEQ;
243} 326}
244 327
245/** 328static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
246 * dccp_ackvec_input - Register incoming packet in the buffer 329 struct dccp_ackvec_record *avr)
247 */
248void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
249{ 330{
250 u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 331 struct dccp_ackvec_record *next;
251 enum dccp_ackvec_states state = DCCPAV_RECEIVED;
252 332
253 if (dccp_ackvec_is_empty(av)) { 333 /* sort out vector length */
254 dccp_ackvec_add_new(av, 1, seqno, state); 334 if (av->av_buf_head <= avr->avr_ack_ptr)
255 av->av_tail_ackno = seqno; 335 av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
336 else
337 av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
338 av->av_buf_head + avr->avr_ack_ptr;
256 339
257 } else { 340 /* free records */
258 s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); 341 list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
259 u8 *current_head = av->av_buf + av->av_buf_head; 342 list_del_init(&avr->avr_node);
260 343 dccp_ackvec_record_delete(avr);
261 if (num_packets == 1 && 344 }
262 dccp_ackvec_state(current_head) == state && 345}
263 dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
264 346
265 *current_head += 1; 347void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
266 av->av_buf_ackno = seqno; 348 const u64 ackno)
349{
350 struct dccp_ackvec_record *avr;
267 351
268 } else if (num_packets > 0) { 352 /*
269 dccp_ackvec_add_new(av, num_packets, seqno, state); 353 * If we traverse backwards, it should be faster when we have large
270 } else { 354 * windows. We will be receiving ACKs for stuff we sent a while back
271 dccp_ackvec_update_old(av, num_packets, seqno, state); 355 * -sorbo.
272 } 356 */
357 list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
358 if (ackno == avr->avr_ack_seqno) {
359 dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
360 "ack_ackno=%llu, ACKED!\n",
361 dccp_role(sk), 1,
362 (unsigned long long)avr->avr_ack_seqno,
363 (unsigned long long)avr->avr_ack_ackno);
364 dccp_ackvec_throw_record(av, avr);
365 break;
366 } else if (avr->avr_ack_seqno > ackno)
367 break; /* old news */
273 } 368 }
274} 369}
275 370
276/** 371static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
277 * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection 372 struct sock *sk, u64 *ackno,
278 * This routine is called when the peer acknowledges the receipt of Ack Vectors 373 const unsigned char len,
279 * up to and including @ackno. While based on section A.3 of RFC 4340, here 374 const unsigned char *vector)
280 * are additional precautions to prevent corrupted buffer state. In particular, 375{
281 * we use tail_ackno to identify outdated records; it always marks the earliest 376 unsigned char i;
282 * packet of group (2) in 11.4.2. 377 struct dccp_ackvec_record *avr;
283 */
284void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
285 {
286 struct dccp_ackvec_record *avr, *next;
287 u8 runlen_now, eff_runlen;
288 s64 delta;
289 378
290 avr = dccp_ackvec_lookup(&av->av_records, ackno); 379 /* Check if we actually sent an ACK vector */
291 if (avr == NULL) 380 if (list_empty(&av->av_records))
292 return; 381 return;
293 /*
294 * Deal with outdated acknowledgments: this arises when e.g. there are
295 * several old records and the acks from the peer come in slowly. In
296 * that case we may still have records that pre-date tail_ackno.
297 */
298 delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
299 if (delta < 0)
300 goto free_records;
301 /*
302 * Deal with overlapping Ack Vectors: don't subtract more than the
303 * number of packets between tail_ackno and ack_ackno.
304 */
305 eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
306 382
307 runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); 383 i = len;
308 /* 384 /*
309 * The run length of Ack Vector cells does not decrease over time. If 385 * XXX
310 * the run length is the same as at the time the Ack Vector was sent, we 386 * I think it might be more efficient to work backwards. See comment on
311 * free the ack_ptr cell. That cell can however not be freed if the run 387 * rcv_ackno. -sorbo.
312 * length has increased: in this case we need to move the tail pointer
313 * backwards (towards higher indices), to its next-oldest neighbour.
314 */ 388 */
315 if (runlen_now > eff_runlen) { 389 avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
390 while (i--) {
391 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
392 u64 ackno_end_rl;
316 393
317 av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; 394 dccp_set_seqno(&ackno_end_rl, *ackno - rl);
318 av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
319 395
320 /* This move may not have cleared the overflow flag. */
321 if (av->av_overflow)
322 av->av_overflow = (av->av_buf_head == av->av_buf_tail);
323 } else {
324 av->av_buf_tail = avr->avr_ack_ptr;
325 /* 396 /*
326 * We have made sure that avr points to a valid cell within the 397 * If our AVR sequence number is greater than the ack, go
327 * buffer. This cell is either older than head, or equals head 398 * forward in the AVR list until it is not so.
328 * (empty buffer): in both cases we no longer have any overflow.
329 */ 399 */
330 av->av_overflow = 0; 400 list_for_each_entry_from(avr, &av->av_records, avr_node) {
331 } 401 if (!after48(avr->avr_ack_seqno, *ackno))
332 402 goto found;
333 /* 403 }
334 * The peer has acknowledged up to and including ack_ackno. Hence the 404 /* End of the av_records list, not found, exit */
335 * first packet in group (2) of 11.4.2 is the successor of ack_ackno. 405 break;
336 */ 406found:
337 av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); 407 if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
408 const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
409 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
410 dccp_pr_debug("%s ACK vector 0, len=%d, "
411 "ack_seqno=%llu, ack_ackno=%llu, "
412 "ACKED!\n",
413 dccp_role(sk), len,
414 (unsigned long long)
415 avr->avr_ack_seqno,
416 (unsigned long long)
417 avr->avr_ack_ackno);
418 dccp_ackvec_throw_record(av, avr);
419 break;
420 }
421 /*
422 * If it wasn't received, continue scanning... we might
423 * find another one.
424 */
425 }
338 426
339free_records: 427 dccp_set_seqno(ackno, ackno_end_rl - 1);
340 list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { 428 ++vector;
341 list_del(&avr->avr_node);
342 kmem_cache_free(dccp_ackvec_record_slab, avr);
343 } 429 }
344} 430}
345 431
346/* 432int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
347 * Routines to keep track of Ack Vectors received in an skb 433 u64 *ackno, const u8 opt, const u8 *value, const u8 len)
348 */
349int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
350{ 434{
351 struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); 435 if (len > DCCP_MAX_ACKVEC_OPT_LEN)
352 436 return -1;
353 if (new == NULL)
354 return -ENOBUFS;
355 new->vec = vec;
356 new->len = len;
357 new->nonce = nonce;
358 437
359 list_add_tail(&new->node, head); 438 /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
439 dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
440 ackno, len, value);
360 return 0; 441 return 0;
361} 442}
362EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
363
364void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
365{
366 struct dccp_ackvec_parsed *cur, *next;
367
368 list_for_each_entry_safe(cur, next, parsed_chunks, node)
369 kfree(cur);
370 INIT_LIST_HEAD(parsed_chunks);
371}
372EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
373 443
374int __init dccp_ackvec_init(void) 444int __init dccp_ackvec_init(void)
375{ 445{
@@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void)
379 if (dccp_ackvec_slab == NULL) 449 if (dccp_ackvec_slab == NULL)
380 goto out_err; 450 goto out_err;
381 451
382 dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", 452 dccp_ackvec_record_slab =
383 sizeof(struct dccp_ackvec_record), 453 kmem_cache_create("dccp_ackvec_record",
384 0, SLAB_HWCACHE_ALIGN, NULL); 454 sizeof(struct dccp_ackvec_record),
455 0, SLAB_HWCACHE_ALIGN, NULL);
385 if (dccp_ackvec_record_slab == NULL) 456 if (dccp_ackvec_record_slab == NULL)
386 goto out_destroy_slab; 457 goto out_destroy_slab;
387 458
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 6cdca79a99f7..bcb64fb4acef 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,134 +3,156 @@
3/* 3/*
4 * net/dccp/ackvec.h 4 * net/dccp/ackvec.h
5 * 5 *
6 * An implementation of Ack Vectors for the DCCP protocol 6 * An implementation of the DCCP protocol
7 * Copyright (c) 2007 University of Aberdeen, Scotland, UK
8 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> 7 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
8 *
9 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as 10 * under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation. 11 * published by the Free Software Foundation.
12 */ 12 */
13 13
14#include <linux/dccp.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
15#include <linux/ktime.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/types.h> 17#include <linux/types.h>
18 18
19/* 19/* Read about the ECN nonce to see why it is 253 */
20 * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, 20#define DCCP_MAX_ACKVEC_OPT_LEN 253
21 * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 21/* We can spread an ack vector across multiple options */
22 * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives 22#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
23 * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
24 * The maximum value is bounded by the u16 types for indices and functions.
25 */
26#define DCCPAV_NUM_ACKVECS 2
27#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
28
29/* Estimated minimum average Ack Vector length - used for updating MPS */
30#define DCCPAV_MIN_OPTLEN 16
31
32/* Threshold for coping with large bursts of losses */
33#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
34
35enum dccp_ackvec_states {
36 DCCPAV_RECEIVED = 0x00,
37 DCCPAV_ECN_MARKED = 0x40,
38 DCCPAV_RESERVED = 0x80,
39 DCCPAV_NOT_RECEIVED = 0xC0
40};
41#define DCCPAV_MAX_RUNLEN 0x3F
42 23
43static inline u8 dccp_ackvec_runlen(const u8 *cell) 24#define DCCP_ACKVEC_STATE_RECEIVED 0
44{ 25#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
45 return *cell & DCCPAV_MAX_RUNLEN; 26#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
46}
47 27
48static inline u8 dccp_ackvec_state(const u8 *cell) 28#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
49{ 29#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
50 return *cell & ~DCCPAV_MAX_RUNLEN;
51}
52 30
53/** struct dccp_ackvec - Ack Vector main data structure 31/** struct dccp_ackvec - ack vector
32 *
33 * This data structure is the one defined in RFC 4340, Appendix A.
54 * 34 *
55 * This implements a fixed-size circular buffer within an array and is largely 35 * @av_buf_head - circular buffer head
56 * based on Appendix A of RFC 4340. 36 * @av_buf_tail - circular buffer tail
37 * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
38 * buffer (i.e. %av_buf_head)
39 * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
40 * by the buffer with State 0
57 * 41 *
58 * @av_buf: circular buffer storage area 42 * Additionally, the HC-Receiver must keep some information about the
59 * @av_buf_head: head index; begin of live portion in @av_buf 43 * Ack Vectors it has recently sent. For each packet sent carrying an
60 * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf 44 * Ack Vector, it remembers four variables:
61 * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf 45 *
62 * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf 46 * @av_records - list of dccp_ackvec_record
63 * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to 47 * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
64 * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf 48 *
65 * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound 49 * @av_time - the time in usecs
66 * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) 50 * @av_buf - circular buffer of acknowledgeable packets
67 */ 51 */
68struct dccp_ackvec { 52struct dccp_ackvec {
69 u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; 53 u64 av_buf_ackno;
70 u16 av_buf_head;
71 u16 av_buf_tail;
72 u64 av_buf_ackno:48;
73 u64 av_tail_ackno:48;
74 bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
75 u8 av_overflow:1;
76 struct list_head av_records; 54 struct list_head av_records;
55 ktime_t av_time;
56 u16 av_buf_head;
57 u16 av_vec_len;
58 u8 av_buf_nonce;
59 u8 av_ack_nonce;
60 u8 av_buf[DCCP_MAX_ACKVEC_LEN];
77}; 61};
78 62
79/** struct dccp_ackvec_record - Records information about sent Ack Vectors 63/** struct dccp_ackvec_record - ack vector record
80 * 64 *
81 * These list entries define the additional information which the HC-Receiver 65 * ACK vector record as defined in Appendix A of spec.
82 * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
83 * 66 *
84 * @avr_node: the list node in @av_records 67 * The list is sorted by avr_ack_seqno
85 * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
86 * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
87 * @avr_ack_ptr: pointer into @av_buf where this record starts
88 * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
89 * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
90 * 68 *
91 * The list as a whole is sorted in descending order by @avr_ack_seqno. 69 * @avr_node - node in av_records
70 * @avr_ack_seqno - sequence number of the packet this record was sent on
71 * @avr_ack_ackno - sequence number being acknowledged
72 * @avr_ack_ptr - pointer into av_buf where this record starts
73 * @avr_ack_nonce - av_ack_nonce at the time this record was sent
74 * @avr_sent_len - length of the record in av_buf
92 */ 75 */
93struct dccp_ackvec_record { 76struct dccp_ackvec_record {
94 struct list_head avr_node; 77 struct list_head avr_node;
95 u64 avr_ack_seqno:48; 78 u64 avr_ack_seqno;
96 u64 avr_ack_ackno:48; 79 u64 avr_ack_ackno;
97 u16 avr_ack_ptr; 80 u16 avr_ack_ptr;
98 u8 avr_ack_runlen; 81 u16 avr_sent_len;
99 u8 avr_ack_nonce:1; 82 u8 avr_ack_nonce;
100}; 83};
101 84
102extern int dccp_ackvec_init(void); 85struct sock;
86struct sk_buff;
87
88#ifdef CONFIG_IP_DCCP_ACKVEC
89extern int dccp_ackvec_init(void);
103extern void dccp_ackvec_exit(void); 90extern void dccp_ackvec_exit(void);
104 91
105extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); 92extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
106extern void dccp_ackvec_free(struct dccp_ackvec *av); 93extern void dccp_ackvec_free(struct dccp_ackvec *av);
107 94
108extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); 95extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
109extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); 96 const u64 ackno, const u8 state);
110extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); 97
111extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); 98extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
99 struct sock *sk, const u64 ackno);
100extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
101 u64 *ackno, const u8 opt,
102 const u8 *value, const u8 len);
112 103
113static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) 104extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
105
106static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
107{
108 return av->av_vec_len;
109}
110#else /* CONFIG_IP_DCCP_ACKVEC */
111static inline int dccp_ackvec_init(void)
114{ 112{
115 return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; 113 return 0;
116} 114}
117 115
118/** 116static inline void dccp_ackvec_exit(void)
119 * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb 117{
120 * @vec: start of vector (offset into skb) 118}
121 * @len: length of @vec 119
122 * @nonce: whether @vec had an ECN nonce of 0 or 1 120static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
123 * @node: FIFO - arranged in descending order of ack_ackno 121{
124 * This structure is used by CCIDs to access Ack Vectors in a received skb. 122 return NULL;
125 */ 123}
126struct dccp_ackvec_parsed { 124
127 u8 *vec, 125static inline void dccp_ackvec_free(struct dccp_ackvec *av)
128 len, 126{
129 nonce:1; 127}
130 struct list_head node; 128
131}; 129static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
130 const u64 ackno, const u8 state)
131{
132 return -1;
133}
132 134
133extern int dccp_ackvec_parsed_add(struct list_head *head, 135static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
134 u8 *vec, u8 len, u8 nonce); 136 struct sock *sk, const u64 ackno)
135extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); 137{
138}
139
140static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
141 const u64 *ackno, const u8 opt,
142 const u8 *value, const u8 len)
143{
144 return -1;
145}
146
147static inline int dccp_insert_option_ackvec(const struct sock *sk,
148 const struct sk_buff *skb)
149{
150 return -1;
151}
152
153static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
154{
155 return 0;
156}
157#endif /* CONFIG_IP_DCCP_ACKVEC */
136#endif /* _ACKVEC_H */ 158#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index e3fb52b4f5c6..4809753d12ae 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,13 +13,6 @@
13 13
14#include "ccid.h" 14#include "ccid.h"
15 15
16static u8 builtin_ccids[] = {
17 DCCPC_CCID2, /* CCID2 is supported by default */
18#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
19 DCCPC_CCID3,
20#endif
21};
22
23static struct ccid_operations *ccids[CCID_MAX]; 16static struct ccid_operations *ccids[CCID_MAX];
24#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 17#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
25static atomic_t ccids_lockct = ATOMIC_INIT(0); 18static atomic_t ccids_lockct = ATOMIC_INIT(0);
@@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
93 } 86 }
94} 87}
95 88
96/* check that up to @array_len members in @ccid_array are supported */
97bool ccid_support_check(u8 const *ccid_array, u8 array_len)
98{
99 u8 i, j, found;
100
101 for (i = 0, found = 0; i < array_len; i++, found = 0) {
102 for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
103 found = (ccid_array[i] == builtin_ccids[j]);
104 if (!found)
105 return false;
106 }
107 return true;
108}
109
110/**
111 * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array
112 * @ccid_array: pointer to copy into
113 * @array_len: value to return length into
114 * This function allocates memory - caller must see that it is freed after use.
115 */
116int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
117{
118 *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
119 if (*ccid_array == NULL)
120 return -ENOBUFS;
121 *array_len = ARRAY_SIZE(builtin_ccids);
122 return 0;
123}
124
125int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
126 char __user *optval, int __user *optlen)
127{
128 if (len < sizeof(builtin_ccids))
129 return -EINVAL;
130
131 if (put_user(sizeof(builtin_ccids), optlen) ||
132 copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
133 return -EFAULT;
134 return 0;
135}
136
137int ccid_register(struct ccid_operations *ccid_ops) 89int ccid_register(struct ccid_operations *ccid_ops)
138{ 90{
139 int err = -ENOBUFS; 91 int err = -ENOBUFS;
@@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops)
196 148
197EXPORT_SYMBOL_GPL(ccid_unregister); 149EXPORT_SYMBOL_GPL(ccid_unregister);
198 150
199/**
200 * ccid_request_module - Pre-load CCID module for later use
201 * This should be called only from process context (e.g. during connection
202 * setup) and is necessary for later calls to ccid_new (typically in software
203 * interrupt), so that it has the modules available when they are needed.
204 */
205static int ccid_request_module(u8 id)
206{
207 if (!in_atomic()) {
208 ccids_read_lock();
209 if (ccids[id] == NULL) {
210 ccids_read_unlock();
211 return request_module("net-dccp-ccid-%d", id);
212 }
213 ccids_read_unlock();
214 }
215 return 0;
216}
217
218int ccid_request_modules(u8 const *ccid_array, u8 array_len)
219{
220#ifdef CONFIG_KMOD
221 while (array_len--)
222 if (ccid_request_module(ccid_array[array_len]))
223 return -1;
224#endif
225 return 0;
226}
227
228struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) 151struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
229{ 152{
230 struct ccid_operations *ccid_ops; 153 struct ccid_operations *ccid_ops;
231 struct ccid *ccid = NULL; 154 struct ccid *ccid = NULL;
232 155
233 ccids_read_lock(); 156 ccids_read_lock();
157#ifdef CONFIG_KMOD
158 if (ccids[id] == NULL) {
159 /* We only try to load if in process context */
160 ccids_read_unlock();
161 if (gfp & GFP_ATOMIC)
162 goto out;
163 request_module("net-dccp-ccid-%d", id);
164 ccids_read_lock();
165 }
166#endif
234 ccid_ops = ccids[id]; 167 ccid_ops = ccids[id];
235 if (ccid_ops == NULL) 168 if (ccid_ops == NULL)
236 goto out_unlock; 169 goto out_unlock;
@@ -272,6 +205,20 @@ out_module_put:
272 205
273EXPORT_SYMBOL_GPL(ccid_new); 206EXPORT_SYMBOL_GPL(ccid_new);
274 207
208struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
209{
210 return ccid_new(id, sk, 1, gfp);
211}
212
213EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
214
215struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp)
216{
217 return ccid_new(id, sk, 0, gfp);
218}
219
220EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
221
275static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) 222static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
276{ 223{
277 struct ccid_operations *ccid_ops; 224 struct ccid_operations *ccid_ops;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index d27054ba2159..fdeae7b57319 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -60,18 +60,22 @@ struct ccid_operations {
60 void (*ccid_hc_tx_exit)(struct sock *sk); 60 void (*ccid_hc_tx_exit)(struct sock *sk);
61 void (*ccid_hc_rx_packet_recv)(struct sock *sk, 61 void (*ccid_hc_rx_packet_recv)(struct sock *sk,
62 struct sk_buff *skb); 62 struct sk_buff *skb);
63 int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, 63 int (*ccid_hc_rx_parse_options)(struct sock *sk,
64 u8 opt, u8 *val, u8 len); 64 unsigned char option,
65 unsigned char len, u16 idx,
66 unsigned char* value);
65 int (*ccid_hc_rx_insert_options)(struct sock *sk, 67 int (*ccid_hc_rx_insert_options)(struct sock *sk,
66 struct sk_buff *skb); 68 struct sk_buff *skb);
67 void (*ccid_hc_tx_packet_recv)(struct sock *sk, 69 void (*ccid_hc_tx_packet_recv)(struct sock *sk,
68 struct sk_buff *skb); 70 struct sk_buff *skb);
69 int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, 71 int (*ccid_hc_tx_parse_options)(struct sock *sk,
70 u8 opt, u8 *val, u8 len); 72 unsigned char option,
73 unsigned char len, u16 idx,
74 unsigned char* value);
71 int (*ccid_hc_tx_send_packet)(struct sock *sk, 75 int (*ccid_hc_tx_send_packet)(struct sock *sk,
72 struct sk_buff *skb); 76 struct sk_buff *skb);
73 void (*ccid_hc_tx_packet_sent)(struct sock *sk, 77 void (*ccid_hc_tx_packet_sent)(struct sock *sk,
74 unsigned int len); 78 int more, unsigned int len);
75 void (*ccid_hc_rx_get_info)(struct sock *sk, 79 void (*ccid_hc_rx_get_info)(struct sock *sk,
76 struct tcp_info *info); 80 struct tcp_info *info);
77 void (*ccid_hc_tx_get_info)(struct sock *sk, 81 void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid)
99 return (void *)ccid->ccid_priv; 103 return (void *)ccid->ccid_priv;
100} 104}
101 105
102extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
103extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
104extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
105 char __user *, int __user *);
106
107extern int ccid_request_modules(u8 const *ccid_array, u8 array_len);
108extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, 106extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
109 gfp_t gfp); 107 gfp_t gfp);
110 108
111static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) 109extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
112{ 110 gfp_t gfp);
113 struct ccid *ccid = dp->dccps_hc_rx_ccid; 111extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
114 112 gfp_t gfp);
115 if (ccid == NULL || ccid->ccid_ops == NULL)
116 return -1;
117 return ccid->ccid_ops->ccid_id;
118}
119
120static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
121{
122 struct ccid *ccid = dp->dccps_hc_tx_ccid;
123
124 if (ccid == NULL || ccid->ccid_ops == NULL)
125 return -1;
126 return ccid->ccid_ops->ccid_id;
127}
128 113
129extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); 114extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
130extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); 115extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
131 116
132/*
133 * Congestion control of queued data packets via CCID decision.
134 *
135 * The TX CCID performs its congestion-control by indicating whether and when a
136 * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
137 * The following modes are supported via the symbolic constants below:
138 * - timer-based pacing (CCID returns a delay value in milliseconds);
139 * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
140 */
141
142enum ccid_dequeueing_decision {
143 CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
144 CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
145 CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
146 CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
147 CCID_PACKET_ERR = 0xF0000, /* error condition */
148};
149
150static inline int ccid_packet_dequeue_eval(const int return_code)
151{
152 if (return_code < 0)
153 return CCID_PACKET_ERR;
154 if (return_code == 0)
155 return CCID_PACKET_SEND_AT_ONCE;
156 if (return_code <= CCID_PACKET_DELAY_MAX)
157 return CCID_PACKET_DELAY;
158 return return_code;
159}
160
161static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, 117static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
162 struct sk_buff *skb) 118 struct sk_buff *skb)
163{ 119{
120 int rc = 0;
164 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) 121 if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
165 return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); 122 rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
166 return CCID_PACKET_SEND_AT_ONCE; 123 return rc;
167} 124}
168 125
169static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, 126static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
170 unsigned int len) 127 int more, unsigned int len)
171{ 128{
172 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) 129 if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
173 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); 130 ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
174} 131}
175 132
176static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, 133static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
187 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); 144 ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
188} 145}
189 146
190/**
191 * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
192 * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
193 * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
194 * @val: value of @opt
195 * @len: length of @val in bytes
196 */
197static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, 147static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
198 u8 pkt, u8 opt, u8 *val, u8 len) 148 unsigned char option,
149 unsigned char len, u16 idx,
150 unsigned char* value)
199{ 151{
200 if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) 152 int rc = 0;
201 return 0; 153 if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
202 return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); 154 rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
155 value);
156 return rc;
203} 157}
204 158
205/**
206 * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
207 * Arguments are analogous to ccid_hc_tx_parse_options()
208 */
209static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, 159static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
210 u8 pkt, u8 opt, u8 *val, u8 len) 160 unsigned char option,
161 unsigned char len, u16 idx,
162 unsigned char* value)
211{ 163{
212 if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) 164 int rc = 0;
213 return 0; 165 if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
214 return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); 166 rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
167 return rc;
215} 168}
216 169
217static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, 170static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index fb168be2cb43..12275943eab8 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,8 +1,10 @@
1menu "DCCP CCIDs Configuration (EXPERIMENTAL)" 1menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
2 depends on EXPERIMENTAL
2 3
3config IP_DCCP_CCID2 4config IP_DCCP_CCID2
4 tristate "CCID2 (TCP-Like)" 5 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)"
5 def_tristate IP_DCCP 6 def_tristate IP_DCCP
7 select IP_DCCP_ACKVEC
6 ---help--- 8 ---help---
7 CCID 2, TCP-like Congestion Control, denotes Additive Increase, 9 CCID 2, TCP-like Congestion Control, denotes Additive Increase,
8 Multiplicative Decrease (AIMD) congestion control with behavior 10 Multiplicative Decrease (AIMD) congestion control with behavior
@@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG
34 If in doubt, say N. 36 If in doubt, say N.
35 37
36config IP_DCCP_CCID3 38config IP_DCCP_CCID3
37 tristate "CCID3 (TCP-Friendly)" 39 tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)"
38 def_tristate IP_DCCP 40 def_tristate IP_DCCP
39 select IP_DCCP_TFRC_LIB 41 select IP_DCCP_TFRC_LIB
40 ---help--- 42 ---help---
@@ -62,9 +64,9 @@ config IP_DCCP_CCID3
62 64
63 If in doubt, say M. 65 If in doubt, say M.
64 66
65if IP_DCCP_CCID3
66config IP_DCCP_CCID3_DEBUG 67config IP_DCCP_CCID3_DEBUG
67 bool "CCID3 debugging messages" 68 bool "CCID3 debugging messages"
69 depends on IP_DCCP_CCID3
68 ---help--- 70 ---help---
69 Enable CCID3-specific debugging messages. 71 Enable CCID3-specific debugging messages.
70 72
@@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG
74 76
75 If in doubt, say N. 77 If in doubt, say N.
76 78
77choice
78 prompt "Select method for measuring the packet size s"
79 default IP_DCCP_CCID3_MEASURE_S_AS_MPS
80
81config IP_DCCP_CCID3_MEASURE_S_AS_MPS
82 bool "Always use MPS in place of s"
83 ---help---
84 This use is recommended as it is consistent with the initialisation
85 of X and suggested when s varies (rfc3448bis, (1) in section 4.1).
86config IP_DCCP_CCID3_MEASURE_S_AS_AVG
87 bool "Use moving average"
88 ---help---
89 An alternative way of tracking s, also supported by rfc3448bis.
90 This used to be the default for CCID-3 in previous kernels.
91config IP_DCCP_CCID3_MEASURE_S_AS_MAX
92 bool "Track the maximum payload length"
93 ---help---
94 An experimental method based on tracking the maximum packet size.
95endchoice
96
97config IP_DCCP_CCID3_RTO 79config IP_DCCP_CCID3_RTO
98 int "Use higher bound for nofeedback timer" 80 int "Use higher bound for nofeedback timer"
99 default 100 81 default 100
82 depends on IP_DCCP_CCID3 && EXPERIMENTAL
100 ---help--- 83 ---help---
101 Use higher lower bound for nofeedback timer expiration. 84 Use higher lower bound for nofeedback timer expiration.
102 85
@@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO
123 The purpose of the nofeedback timer is to slow DCCP down when there 106 The purpose of the nofeedback timer is to slow DCCP down when there
124 is serious network congestion: experimenting with larger values should 107 is serious network congestion: experimenting with larger values should
125 therefore not be performed on WANs. 108 therefore not be performed on WANs.
126endif # IP_DCCP_CCID3
127 109
128config IP_DCCP_TFRC_LIB 110config IP_DCCP_TFRC_LIB
129 tristate 111 tristate
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index fa713227c66f..9a430734530c 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,7 +25,7 @@
25/* 25/*
26 * This implementation should follow RFC 4341 26 * This implementation should follow RFC 4341
27 */ 27 */
28#include "../feat.h" 28
29#include "../ccid.h" 29#include "../ccid.h"
30#include "../dccp.h" 30#include "../dccp.h"
31#include "ccid2.h" 31#include "ccid2.h"
@@ -34,8 +34,51 @@
34#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 34#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
35static int ccid2_debug; 35static int ccid2_debug;
36#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) 36#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
37
38static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
39{
40 int len = 0;
41 int pipe = 0;
42 struct ccid2_seq *seqp = hctx->ccid2hctx_seqh;
43
44 /* there is data in the chain */
45 if (seqp != hctx->ccid2hctx_seqt) {
46 seqp = seqp->ccid2s_prev;
47 len++;
48 if (!seqp->ccid2s_acked)
49 pipe++;
50
51 while (seqp != hctx->ccid2hctx_seqt) {
52 struct ccid2_seq *prev = seqp->ccid2s_prev;
53
54 len++;
55 if (!prev->ccid2s_acked)
56 pipe++;
57
58 /* packets are sent sequentially */
59 BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
60 prev->ccid2s_seq ) >= 0);
61 BUG_ON(time_before(seqp->ccid2s_sent,
62 prev->ccid2s_sent));
63
64 seqp = prev;
65 }
66 }
67
68 BUG_ON(pipe != hctx->ccid2hctx_pipe);
69 ccid2_pr_debug("len of chain=%d\n", len);
70
71 do {
72 seqp = seqp->ccid2s_prev;
73 len++;
74 } while (seqp != hctx->ccid2hctx_seqh);
75
76 ccid2_pr_debug("total len=%d\n", len);
77 BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN);
78}
37#else 79#else
38#define ccid2_pr_debug(format, a...) 80#define ccid2_pr_debug(format, a...)
81#define ccid2_hc_tx_check_sanity(hctx)
39#endif 82#endif
40 83
41static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) 84static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
44 int i; 87 int i;
45 88
46 /* check if we have space to preserve the pointer to the buffer */ 89 /* check if we have space to preserve the pointer to the buffer */
47 if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) 90 if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) /
91 sizeof(struct ccid2_seq*)))
48 return -ENOMEM; 92 return -ENOMEM;
49 93
50 /* allocate buffer and initialize linked list */ 94 /* allocate buffer and initialize linked list */
@@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
60 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 104 seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
61 105
62 /* This is the first allocation. Initiate the head and tail. */ 106 /* This is the first allocation. Initiate the head and tail. */
63 if (hctx->seqbufc == 0) 107 if (hctx->ccid2hctx_seqbufc == 0)
64 hctx->seqh = hctx->seqt = seqp; 108 hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp;
65 else { 109 else {
66 /* link the existing list with the one we just created */ 110 /* link the existing list with the one we just created */
67 hctx->seqh->ccid2s_next = seqp; 111 hctx->ccid2hctx_seqh->ccid2s_next = seqp;
68 seqp->ccid2s_prev = hctx->seqh; 112 seqp->ccid2s_prev = hctx->ccid2hctx_seqh;
69 113
70 hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; 114 hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
71 seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; 115 seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt;
72 } 116 }
73 117
74 /* store the original pointer to the buffer so we can free it */ 118 /* store the original pointer to the buffer so we can free it */
75 hctx->seqbuf[hctx->seqbufc] = seqp; 119 hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp;
76 hctx->seqbufc++; 120 hctx->ccid2hctx_seqbufc++;
77 121
78 return 0; 122 return 0;
79} 123}
80 124
81static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 125static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
82{ 126{
83 if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) 127 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
84 return CCID_PACKET_WILL_DEQUEUE_LATER; 128
85 return CCID_PACKET_SEND_AT_ONCE; 129 if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd)
130 return 0;
131
132 return 1; /* XXX CCID should dequeue when ready instead of polling */
86} 133}
87 134
88static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) 135static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
89{ 136{
90 struct dccp_sock *dp = dccp_sk(sk); 137 struct dccp_sock *dp = dccp_sk(sk);
91 u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); 138 u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2);
92 139
93 /* 140 /*
94 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from 141 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
@@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
100 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); 147 DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
101 val = max_ratio; 148 val = max_ratio;
102 } 149 }
103 if (val > DCCPF_ACK_RATIO_MAX) 150 if (val > 0xFFFF) /* RFC 4340, 11.3 */
104 val = DCCPF_ACK_RATIO_MAX; 151 val = 0xFFFF;
105 152
106 if (val == dp->dccps_l_ack_ratio) 153 if (val == dp->dccps_l_ack_ratio)
107 return; 154 return;
@@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
110 dp->dccps_l_ack_ratio = val; 157 dp->dccps_l_ack_ratio = val;
111} 158}
112 159
160static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
161{
162 ccid2_pr_debug("change SRTT to %ld\n", val);
163 hctx->ccid2hctx_srtt = val;
164}
165
166static void ccid2_start_rto_timer(struct sock *sk);
167
113static void ccid2_hc_tx_rto_expire(unsigned long data) 168static void ccid2_hc_tx_rto_expire(unsigned long data)
114{ 169{
115 struct sock *sk = (struct sock *)data; 170 struct sock *sk = (struct sock *)data;
116 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 171 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
117 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); 172 long s;
118 173
119 bh_lock_sock(sk); 174 bh_lock_sock(sk);
120 if (sock_owned_by_user(sk)) { 175 if (sock_owned_by_user(sk)) {
121 sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); 176 sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
177 jiffies + HZ / 5);
122 goto out; 178 goto out;
123 } 179 }
124 180
125 ccid2_pr_debug("RTO_EXPIRE\n"); 181 ccid2_pr_debug("RTO_EXPIRE\n");
126 182
183 ccid2_hc_tx_check_sanity(hctx);
184
127 /* back-off timer */ 185 /* back-off timer */
128 hctx->rto <<= 1; 186 hctx->ccid2hctx_rto <<= 1;
129 if (hctx->rto > DCCP_RTO_MAX) 187
130 hctx->rto = DCCP_RTO_MAX; 188 s = hctx->ccid2hctx_rto / HZ;
189 if (s > 60)
190 hctx->ccid2hctx_rto = 60 * HZ;
191
192 ccid2_start_rto_timer(sk);
131 193
132 /* adjust pipe, cwnd etc */ 194 /* adjust pipe, cwnd etc */
133 hctx->ssthresh = hctx->cwnd / 2; 195 hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2;
134 if (hctx->ssthresh < 2) 196 if (hctx->ccid2hctx_ssthresh < 2)
135 hctx->ssthresh = 2; 197 hctx->ccid2hctx_ssthresh = 2;
136 hctx->cwnd = 1; 198 hctx->ccid2hctx_cwnd = 1;
137 hctx->pipe = 0; 199 hctx->ccid2hctx_pipe = 0;
138 200
139 /* clear state about stuff we sent */ 201 /* clear state about stuff we sent */
140 hctx->seqt = hctx->seqh; 202 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
141 hctx->packets_acked = 0; 203 hctx->ccid2hctx_packets_acked = 0;
142 204
143 /* clear ack ratio state. */ 205 /* clear ack ratio state. */
144 hctx->rpseq = 0; 206 hctx->ccid2hctx_rpseq = 0;
145 hctx->rpdupack = -1; 207 hctx->ccid2hctx_rpdupack = -1;
146 ccid2_change_l_ack_ratio(sk, 1); 208 ccid2_change_l_ack_ratio(sk, 1);
147 209 ccid2_hc_tx_check_sanity(hctx);
148 /* if we were blocked before, we may now send cwnd=1 packet */
149 if (sender_was_blocked)
150 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
151 /* restart backed-off timer */
152 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
153out: 210out:
154 bh_unlock_sock(sk); 211 bh_unlock_sock(sk);
155 sock_put(sk); 212 sock_put(sk);
156} 213}
157 214
158static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) 215static void ccid2_start_rto_timer(struct sock *sk)
216{
217 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
218
219 ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
220
221 BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
222 sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
223 jiffies + hctx->ccid2hctx_rto);
224}
225
226static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
159{ 227{
160 struct dccp_sock *dp = dccp_sk(sk); 228 struct dccp_sock *dp = dccp_sk(sk);
161 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 229 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
162 struct ccid2_seq *next; 230 struct ccid2_seq *next;
163 231
164 hctx->pipe++; 232 hctx->ccid2hctx_pipe++;
165 233
166 hctx->seqh->ccid2s_seq = dp->dccps_gss; 234 hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss;
167 hctx->seqh->ccid2s_acked = 0; 235 hctx->ccid2hctx_seqh->ccid2s_acked = 0;
168 hctx->seqh->ccid2s_sent = jiffies; 236 hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
169 237
170 next = hctx->seqh->ccid2s_next; 238 next = hctx->ccid2hctx_seqh->ccid2s_next;
171 /* check if we need to alloc more space */ 239 /* check if we need to alloc more space */
172 if (next == hctx->seqt) { 240 if (next == hctx->ccid2hctx_seqt) {
173 if (ccid2_hc_tx_alloc_seq(hctx)) { 241 if (ccid2_hc_tx_alloc_seq(hctx)) {
174 DCCP_CRIT("packet history - out of memory!"); 242 DCCP_CRIT("packet history - out of memory!");
175 /* FIXME: find a more graceful way to bail out */ 243 /* FIXME: find a more graceful way to bail out */
176 return; 244 return;
177 } 245 }
178 next = hctx->seqh->ccid2s_next; 246 next = hctx->ccid2hctx_seqh->ccid2s_next;
179 BUG_ON(next == hctx->seqt); 247 BUG_ON(next == hctx->ccid2hctx_seqt);
180 } 248 }
181 hctx->seqh = next; 249 hctx->ccid2hctx_seqh = next;
182 250
183 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); 251 ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
252 hctx->ccid2hctx_pipe);
184 253
185 /* 254 /*
186 * FIXME: The code below is broken and the variables have been removed 255 * FIXME: The code below is broken and the variables have been removed
@@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
203 */ 272 */
204#if 0 273#if 0
205 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ 274 /* Ack Ratio. Need to maintain a concept of how many windows we sent */
206 hctx->arsent++; 275 hctx->ccid2hctx_arsent++;
207 /* We had an ack loss in this window... */ 276 /* We had an ack loss in this window... */
208 if (hctx->ackloss) { 277 if (hctx->ccid2hctx_ackloss) {
209 if (hctx->arsent >= hctx->cwnd) { 278 if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
210 hctx->arsent = 0; 279 hctx->ccid2hctx_arsent = 0;
211 hctx->ackloss = 0; 280 hctx->ccid2hctx_ackloss = 0;
212 } 281 }
213 } else { 282 } else {
214 /* No acks lost up to now... */ 283 /* No acks lost up to now... */
@@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
218 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - 287 int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
219 dp->dccps_l_ack_ratio; 288 dp->dccps_l_ack_ratio;
220 289
221 denom = hctx->cwnd * hctx->cwnd / denom; 290 denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
222 291
223 if (hctx->arsent >= denom) { 292 if (hctx->ccid2hctx_arsent >= denom) {
224 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); 293 ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
225 hctx->arsent = 0; 294 hctx->ccid2hctx_arsent = 0;
226 } 295 }
227 } else { 296 } else {
228 /* we can't increase ack ratio further [1] */ 297 /* we can't increase ack ratio further [1] */
229 hctx->arsent = 0; /* or maybe set it to cwnd*/ 298 hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
230 } 299 }
231 } 300 }
232#endif 301#endif
233 302
234 /* setup RTO timer */ 303 /* setup RTO timer */
235 if (!timer_pending(&hctx->rtotimer)) 304 if (!timer_pending(&hctx->ccid2hctx_rtotimer))
236 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); 305 ccid2_start_rto_timer(sk);
237 306
238#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 307#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
239 do { 308 do {
240 struct ccid2_seq *seqp = hctx->seqt; 309 struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
241 310
242 while (seqp != hctx->seqh) { 311 while (seqp != hctx->ccid2hctx_seqh) {
243 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", 312 ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
244 (unsigned long long)seqp->ccid2s_seq, 313 (unsigned long long)seqp->ccid2s_seq,
245 seqp->ccid2s_acked, seqp->ccid2s_sent); 314 seqp->ccid2s_acked, seqp->ccid2s_sent);
@@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
247 } 316 }
248 } while (0); 317 } while (0);
249 ccid2_pr_debug("=========\n"); 318 ccid2_pr_debug("=========\n");
319 ccid2_hc_tx_check_sanity(hctx);
250#endif 320#endif
251} 321}
252 322
253/** 323/* XXX Lame code duplication!
254 * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm 324 * returns -1 if none was found.
255 * This code is almost identical with TCP's tcp_rtt_estimator(), since 325 * else returns the next offset to use in the function call.
256 * - it has a higher sampling frequency (recommended by RFC 1323),
257 * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
258 * - it is simple (cf. more complex proposals such as Eifel timer or research
259 * which suggests that the gain should be set according to window size),
260 * - in tests it was found to work well with CCID2 [gerrit].
261 */ 326 */
262static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) 327static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
328 unsigned char **vec, unsigned char *veclen)
263{ 329{
264 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 330 const struct dccp_hdr *dh = dccp_hdr(skb);
265 long m = mrtt ? : 1; 331 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
266 332 unsigned char *opt_ptr;
267 if (hctx->srtt == 0) { 333 const unsigned char *opt_end = (unsigned char *)dh +
268 /* First measurement m */ 334 (dh->dccph_doff * 4);
269 hctx->srtt = m << 3; 335 unsigned char opt, len;
270 hctx->mdev = m << 1; 336 unsigned char *value;
271 337
272 hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); 338 BUG_ON(offset < 0);
273 hctx->rttvar = hctx->mdev_max; 339 options += offset;
274 hctx->rtt_seq = dccp_sk(sk)->dccps_gss; 340 opt_ptr = options;
275 } else { 341 if (opt_ptr >= opt_end)
276 /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ 342 return -1;
277 m -= (hctx->srtt >> 3); 343
278 hctx->srtt += m; 344 while (opt_ptr != opt_end) {
279 345 opt = *opt_ptr++;
280 /* Similarly, update scaled mdev with regard to |m| */ 346 len = 0;
281 if (m < 0) { 347 value = NULL;
282 m = -m; 348
283 m -= (hctx->mdev >> 2); 349 /* Check if this isn't a single byte option */
350 if (opt > DCCPO_MAX_RESERVED) {
351 if (opt_ptr == opt_end)
352 goto out_invalid_option;
353
354 len = *opt_ptr++;
355 if (len < 3)
356 goto out_invalid_option;
284 /* 357 /*
285 * This neutralises RTO increase when RTT < SRTT - mdev 358 * Remove the type and len fields, leaving
286 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control 359 * just the value size
287 * in Linux TCP", USENIX 2002, pp. 49-62).
288 */ 360 */
289 if (m > 0) 361 len -= 2;
290 m >>= 3; 362 value = opt_ptr;
291 } else { 363 opt_ptr += len;
292 m -= (hctx->mdev >> 2);
293 }
294 hctx->mdev += m;
295 364
296 if (hctx->mdev > hctx->mdev_max) { 365 if (opt_ptr > opt_end)
297 hctx->mdev_max = hctx->mdev; 366 goto out_invalid_option;
298 if (hctx->mdev_max > hctx->rttvar)
299 hctx->rttvar = hctx->mdev_max;
300 } 367 }
301 368
302 /* 369 switch (opt) {
303 * Decay RTTVAR at most once per flight, exploiting that 370 case DCCPO_ACK_VECTOR_0:
304 * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) 371 case DCCPO_ACK_VECTOR_1:
305 * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) 372 *vec = value;
306 * GAR is a useful bound for FlightSize = pipe, AWL is probably 373 *veclen = len;
307 * too low as it over-estimates pipe. 374 return offset + (opt_ptr - options);
308 */
309 if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
310 if (hctx->mdev_max < hctx->rttvar)
311 hctx->rttvar -= (hctx->rttvar -
312 hctx->mdev_max) >> 2;
313 hctx->rtt_seq = dccp_sk(sk)->dccps_gss;
314 hctx->mdev_max = TCP_RTO_MIN;
315 } 375 }
316 } 376 }
317 377
318 /* 378 return -1;
319 * Set RTO from SRTT and RTTVAR
320 * Clock granularity is ignored since the minimum error for RTTVAR is
321 * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
322 * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
323 * does not retransmit data, DCCP does not require TCP's recommended
324 * minimum timeout of one second".
325 */
326 hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
327 379
328 if (hctx->rto > DCCP_RTO_MAX) 380out_invalid_option:
329 hctx->rto = DCCP_RTO_MAX; 381 DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
382 return -1;
330} 383}
331 384
332static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, 385static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
333 unsigned int *maxincr)
334{ 386{
335 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 387 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
336 388
337 if (hctx->cwnd < hctx->ssthresh) { 389 sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer);
338 if (*maxincr > 0 && ++hctx->packets_acked == 2) { 390 ccid2_pr_debug("deleted RTO timer\n");
339 hctx->cwnd += 1;
340 *maxincr -= 1;
341 hctx->packets_acked = 0;
342 }
343 } else if (++hctx->packets_acked >= hctx->cwnd) {
344 hctx->cwnd += 1;
345 hctx->packets_acked = 0;
346 }
347 /*
348 * FIXME: RTT is sampled several times per acknowledgment (for each
349 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
350 * This causes the RTT to be over-estimated, since the older entries
351 * in the Ack Vector have earlier sending times.
352 * The cleanest solution is to not use the ccid2s_sent field at all
353 * and instead use DCCP timestamps - need to be resolved at some time.
354 */
355 ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
356} 391}
357 392
358static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) 393static inline void ccid2_new_ack(struct sock *sk,
394 struct ccid2_seq *seqp,
395 unsigned int *maxincr)
359{ 396{
360 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 397 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
361 398
362 if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { 399 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
363 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); 400 if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) {
364 return; 401 hctx->ccid2hctx_cwnd += 1;
402 *maxincr -= 1;
403 hctx->ccid2hctx_packets_acked = 0;
404 }
405 } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) {
406 hctx->ccid2hctx_cwnd += 1;
407 hctx->ccid2hctx_packets_acked = 0;
365 } 408 }
366 409
367 hctx->last_cong = jiffies; 410 /* update RTO */
411 if (hctx->ccid2hctx_srtt == -1 ||
412 time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) {
413 unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
414 int s;
415
416 /* first measurement */
417 if (hctx->ccid2hctx_srtt == -1) {
418 ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
419 r, jiffies,
420 (unsigned long long)seqp->ccid2s_seq);
421 ccid2_change_srtt(hctx, r);
422 hctx->ccid2hctx_rttvar = r >> 1;
423 } else {
424 /* RTTVAR */
425 long tmp = hctx->ccid2hctx_srtt - r;
426 long srtt;
427
428 if (tmp < 0)
429 tmp *= -1;
430
431 tmp >>= 2;
432 hctx->ccid2hctx_rttvar *= 3;
433 hctx->ccid2hctx_rttvar >>= 2;
434 hctx->ccid2hctx_rttvar += tmp;
435
436 /* SRTT */
437 srtt = hctx->ccid2hctx_srtt;
438 srtt *= 7;
439 srtt >>= 3;
440 tmp = r >> 3;
441 srtt += tmp;
442 ccid2_change_srtt(hctx, srtt);
443 }
444 s = hctx->ccid2hctx_rttvar << 2;
445 /* clock granularity is 1 when based on jiffies */
446 if (!s)
447 s = 1;
448 hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
449
450 /* must be at least a second */
451 s = hctx->ccid2hctx_rto / HZ;
452 /* DCCP doesn't require this [but I like it cuz my code sux] */
453#if 1
454 if (s < 1)
455 hctx->ccid2hctx_rto = HZ;
456#endif
457 /* max 60 seconds */
458 if (s > 60)
459 hctx->ccid2hctx_rto = HZ * 60;
368 460
369 hctx->cwnd = hctx->cwnd / 2 ? : 1U; 461 hctx->ccid2hctx_lastrtt = jiffies;
370 hctx->ssthresh = max(hctx->cwnd, 2U);
371 462
372 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ 463 ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
373 if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) 464 hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
374 ccid2_change_l_ack_ratio(sk, hctx->cwnd); 465 hctx->ccid2hctx_rto, HZ, r);
466 }
467
468 /* we got a new ack, so re-start RTO timer */
469 ccid2_hc_tx_kill_rto_timer(sk);
470 ccid2_start_rto_timer(sk);
375} 471}
376 472
377static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, 473static void ccid2_hc_tx_dec_pipe(struct sock *sk)
378 u8 option, u8 *optval, u8 optlen)
379{ 474{
380 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 475 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
381 476
382 switch (option) { 477 if (hctx->ccid2hctx_pipe == 0)
383 case DCCPO_ACK_VECTOR_0: 478 DCCP_BUG("pipe == 0");
384 case DCCPO_ACK_VECTOR_1: 479 else
385 return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, 480 hctx->ccid2hctx_pipe--;
386 option - DCCPO_ACK_VECTOR_0); 481
482 if (hctx->ccid2hctx_pipe == 0)
483 ccid2_hc_tx_kill_rto_timer(sk);
484}
485
486static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
487{
488 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
489
490 if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) {
491 ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
492 return;
387 } 493 }
388 return 0; 494
495 hctx->ccid2hctx_last_cong = jiffies;
496
497 hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U;
498 hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U);
499
500 /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
501 if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd)
502 ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd);
389} 503}
390 504
391static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 505static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
392{ 506{
393 struct dccp_sock *dp = dccp_sk(sk); 507 struct dccp_sock *dp = dccp_sk(sk);
394 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 508 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
395 const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
396 struct dccp_ackvec_parsed *avp;
397 u64 ackno, seqno; 509 u64 ackno, seqno;
398 struct ccid2_seq *seqp; 510 struct ccid2_seq *seqp;
511 unsigned char *vector;
512 unsigned char veclen;
513 int offset = 0;
399 int done = 0; 514 int done = 0;
400 unsigned int maxincr = 0; 515 unsigned int maxincr = 0;
401 516
517 ccid2_hc_tx_check_sanity(hctx);
402 /* check reverse path congestion */ 518 /* check reverse path congestion */
403 seqno = DCCP_SKB_CB(skb)->dccpd_seq; 519 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
404 520
@@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
407 * -sorbo. 523 * -sorbo.
408 */ 524 */
409 /* need to bootstrap */ 525 /* need to bootstrap */
410 if (hctx->rpdupack == -1) { 526 if (hctx->ccid2hctx_rpdupack == -1) {
411 hctx->rpdupack = 0; 527 hctx->ccid2hctx_rpdupack = 0;
412 hctx->rpseq = seqno; 528 hctx->ccid2hctx_rpseq = seqno;
413 } else { 529 } else {
414 /* check if packet is consecutive */ 530 /* check if packet is consecutive */
415 if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) 531 if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1)
416 hctx->rpseq = seqno; 532 hctx->ccid2hctx_rpseq = seqno;
417 /* it's a later packet */ 533 /* it's a later packet */
418 else if (after48(seqno, hctx->rpseq)) { 534 else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
419 hctx->rpdupack++; 535 hctx->ccid2hctx_rpdupack++;
420 536
421 /* check if we got enough dupacks */ 537 /* check if we got enough dupacks */
422 if (hctx->rpdupack >= NUMDUPACK) { 538 if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) {
423 hctx->rpdupack = -1; /* XXX lame */ 539 hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
424 hctx->rpseq = 0; 540 hctx->ccid2hctx_rpseq = 0;
425 541
426 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); 542 ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
427 } 543 }
@@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
429 } 545 }
430 546
431 /* check forward path congestion */ 547 /* check forward path congestion */
432 if (dccp_packet_without_ack(skb)) 548 /* still didn't send out new data packets */
549 if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
433 return; 550 return;
434 551
435 /* still didn't send out new data packets */ 552 switch (DCCP_SKB_CB(skb)->dccpd_type) {
436 if (hctx->seqh == hctx->seqt) 553 case DCCP_PKT_ACK:
437 goto done; 554 case DCCP_PKT_DATAACK:
555 break;
556 default:
557 return;
558 }
438 559
439 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; 560 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
440 if (after48(ackno, hctx->high_ack)) 561 if (after48(ackno, hctx->ccid2hctx_high_ack))
441 hctx->high_ack = ackno; 562 hctx->ccid2hctx_high_ack = ackno;
442 563
443 seqp = hctx->seqt; 564 seqp = hctx->ccid2hctx_seqt;
444 while (before48(seqp->ccid2s_seq, ackno)) { 565 while (before48(seqp->ccid2s_seq, ackno)) {
445 seqp = seqp->ccid2s_next; 566 seqp = seqp->ccid2s_next;
446 if (seqp == hctx->seqh) { 567 if (seqp == hctx->ccid2hctx_seqh) {
447 seqp = hctx->seqh->ccid2s_prev; 568 seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
448 break; 569 break;
449 } 570 }
450 } 571 }
@@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
454 * packets per acknowledgement. Rounding up avoids that cwnd is not 575 * packets per acknowledgement. Rounding up avoids that cwnd is not
455 * advanced when Ack Ratio is 1 and gives a slight edge otherwise. 576 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
456 */ 577 */
457 if (hctx->cwnd < hctx->ssthresh) 578 if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh)
458 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); 579 maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
459 580
460 /* go through all ack vectors */ 581 /* go through all ack vectors */
461 list_for_each_entry(avp, &hctx->av_chunks, node) { 582 while ((offset = ccid2_ackvector(sk, skb, offset,
583 &vector, &veclen)) != -1) {
462 /* go through this ack vector */ 584 /* go through this ack vector */
463 for (; avp->len--; avp->vec++) { 585 while (veclen--) {
464 u64 ackno_end_rl = SUB48(ackno, 586 const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
465 dccp_ackvec_runlen(avp->vec)); 587 u64 ackno_end_rl = SUB48(ackno, rl);
466 588
467 ccid2_pr_debug("ackvec %llu |%u,%u|\n", 589 ccid2_pr_debug("ackvec start:%llu end:%llu\n",
468 (unsigned long long)ackno, 590 (unsigned long long)ackno,
469 dccp_ackvec_state(avp->vec) >> 6, 591 (unsigned long long)ackno_end_rl);
470 dccp_ackvec_runlen(avp->vec));
471 /* if the seqno we are analyzing is larger than the 592 /* if the seqno we are analyzing is larger than the
472 * current ackno, then move towards the tail of our 593 * current ackno, then move towards the tail of our
473 * seqnos. 594 * seqnos.
474 */ 595 */
475 while (after48(seqp->ccid2s_seq, ackno)) { 596 while (after48(seqp->ccid2s_seq, ackno)) {
476 if (seqp == hctx->seqt) { 597 if (seqp == hctx->ccid2hctx_seqt) {
477 done = 1; 598 done = 1;
478 break; 599 break;
479 } 600 }
@@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
486 * run length 607 * run length
487 */ 608 */
488 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { 609 while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
489 const u8 state = dccp_ackvec_state(avp->vec); 610 const u8 state = *vector &
611 DCCP_ACKVEC_STATE_MASK;
490 612
491 /* new packet received or marked */ 613 /* new packet received or marked */
492 if (state != DCCPAV_NOT_RECEIVED && 614 if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
493 !seqp->ccid2s_acked) { 615 !seqp->ccid2s_acked) {
494 if (state == DCCPAV_ECN_MARKED) 616 if (state ==
617 DCCP_ACKVEC_STATE_ECN_MARKED) {
495 ccid2_congestion_event(sk, 618 ccid2_congestion_event(sk,
496 seqp); 619 seqp);
497 else 620 } else
498 ccid2_new_ack(sk, seqp, 621 ccid2_new_ack(sk, seqp,
499 &maxincr); 622 &maxincr);
500 623
501 seqp->ccid2s_acked = 1; 624 seqp->ccid2s_acked = 1;
502 ccid2_pr_debug("Got ack for %llu\n", 625 ccid2_pr_debug("Got ack for %llu\n",
503 (unsigned long long)seqp->ccid2s_seq); 626 (unsigned long long)seqp->ccid2s_seq);
504 hctx->pipe--; 627 ccid2_hc_tx_dec_pipe(sk);
505 } 628 }
506 if (seqp == hctx->seqt) { 629 if (seqp == hctx->ccid2hctx_seqt) {
507 done = 1; 630 done = 1;
508 break; 631 break;
509 } 632 }
@@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
513 break; 636 break;
514 637
515 ackno = SUB48(ackno_end_rl, 1); 638 ackno = SUB48(ackno_end_rl, 1);
639 vector++;
516 } 640 }
517 if (done) 641 if (done)
518 break; 642 break;
@@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
521 /* The state about what is acked should be correct now 645 /* The state about what is acked should be correct now
522 * Check for NUMDUPACK 646 * Check for NUMDUPACK
523 */ 647 */
524 seqp = hctx->seqt; 648 seqp = hctx->ccid2hctx_seqt;
525 while (before48(seqp->ccid2s_seq, hctx->high_ack)) { 649 while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) {
526 seqp = seqp->ccid2s_next; 650 seqp = seqp->ccid2s_next;
527 if (seqp == hctx->seqh) { 651 if (seqp == hctx->ccid2hctx_seqh) {
528 seqp = hctx->seqh->ccid2s_prev; 652 seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
529 break; 653 break;
530 } 654 }
531 } 655 }
@@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
536 if (done == NUMDUPACK) 660 if (done == NUMDUPACK)
537 break; 661 break;
538 } 662 }
539 if (seqp == hctx->seqt) 663 if (seqp == hctx->ccid2hctx_seqt)
540 break; 664 break;
541 seqp = seqp->ccid2s_prev; 665 seqp = seqp->ccid2s_prev;
542 } 666 }
@@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
557 * one ack vector. 681 * one ack vector.
558 */ 682 */
559 ccid2_congestion_event(sk, seqp); 683 ccid2_congestion_event(sk, seqp);
560 hctx->pipe--; 684 ccid2_hc_tx_dec_pipe(sk);
561 } 685 }
562 if (seqp == hctx->seqt) 686 if (seqp == hctx->ccid2hctx_seqt)
563 break; 687 break;
564 seqp = seqp->ccid2s_prev; 688 seqp = seqp->ccid2s_prev;
565 } 689 }
566 690
567 hctx->seqt = last_acked; 691 hctx->ccid2hctx_seqt = last_acked;
568 } 692 }
569 693
570 /* trim acked packets in tail */ 694 /* trim acked packets in tail */
571 while (hctx->seqt != hctx->seqh) { 695 while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
572 if (!hctx->seqt->ccid2s_acked) 696 if (!hctx->ccid2hctx_seqt->ccid2s_acked)
573 break; 697 break;
574 698
575 hctx->seqt = hctx->seqt->ccid2s_next; 699 hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
576 } 700 }
577 701
578 /* restart RTO timer if not all outstanding data has been acked */ 702 ccid2_hc_tx_check_sanity(hctx);
579 if (hctx->pipe == 0)
580 sk_stop_timer(sk, &hctx->rtotimer);
581 else
582 sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
583done:
584 /* check if incoming Acks allow pending packets to be sent */
585 if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
586 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
587 dccp_ackvec_parsed_cleanup(&hctx->av_chunks);
588} 703}
589 704
590static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) 705static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
594 u32 max_ratio; 709 u32 max_ratio;
595 710
596 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ 711 /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
597 hctx->ssthresh = ~0U; 712 hctx->ccid2hctx_ssthresh = ~0U;
598 713
599 /* Use larger initial windows (RFC 3390, rfc2581bis) */ 714 /*
600 hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); 715 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
716 * packets for new connections, following the rules from [RFC3390]".
717 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
718 */
719 hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
601 720
602 /* Make sure that Ack Ratio is enabled and within bounds. */ 721 /* Make sure that Ack Ratio is enabled and within bounds. */
603 max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); 722 max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2);
604 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) 723 if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
605 dp->dccps_l_ack_ratio = max_ratio; 724 dp->dccps_l_ack_ratio = max_ratio;
606 725
@@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
608 if (ccid2_hc_tx_alloc_seq(hctx)) 727 if (ccid2_hc_tx_alloc_seq(hctx))
609 return -ENOMEM; 728 return -ENOMEM;
610 729
611 hctx->rto = DCCP_TIMEOUT_INIT; 730 hctx->ccid2hctx_rto = 3 * HZ;
612 hctx->rpdupack = -1; 731 ccid2_change_srtt(hctx, -1);
613 hctx->last_cong = jiffies; 732 hctx->ccid2hctx_rttvar = -1;
614 setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); 733 hctx->ccid2hctx_rpdupack = -1;
615 INIT_LIST_HEAD(&hctx->av_chunks); 734 hctx->ccid2hctx_last_cong = jiffies;
735 setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire,
736 (unsigned long)sk);
737
738 ccid2_hc_tx_check_sanity(hctx);
616 return 0; 739 return 0;
617} 740}
618 741
@@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk)
621 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); 744 struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
622 int i; 745 int i;
623 746
624 sk_stop_timer(sk, &hctx->rtotimer); 747 ccid2_hc_tx_kill_rto_timer(sk);
625 748
626 for (i = 0; i < hctx->seqbufc; i++) 749 for (i = 0; i < hctx->ccid2hctx_seqbufc; i++)
627 kfree(hctx->seqbuf[i]); 750 kfree(hctx->ccid2hctx_seqbuf[i]);
628 hctx->seqbufc = 0; 751 hctx->ccid2hctx_seqbufc = 0;
629} 752}
630 753
631static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 754static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
636 switch (DCCP_SKB_CB(skb)->dccpd_type) { 759 switch (DCCP_SKB_CB(skb)->dccpd_type) {
637 case DCCP_PKT_DATA: 760 case DCCP_PKT_DATA:
638 case DCCP_PKT_DATAACK: 761 case DCCP_PKT_DATAACK:
639 hcrx->data++; 762 hcrx->ccid2hcrx_data++;
640 if (hcrx->data >= dp->dccps_r_ack_ratio) { 763 if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
641 dccp_send_ack(sk); 764 dccp_send_ack(sk);
642 hcrx->data = 0; 765 hcrx->ccid2hcrx_data = 0;
643 } 766 }
644 break; 767 break;
645 } 768 }
646} 769}
647 770
648static struct ccid_operations ccid2 = { 771static struct ccid_operations ccid2 = {
649 .ccid_id = DCCPC_CCID2, 772 .ccid_id = DCCPC_CCID2,
650 .ccid_name = "TCP-like", 773 .ccid_name = "TCP-like",
651 .ccid_owner = THIS_MODULE, 774 .ccid_owner = THIS_MODULE,
652 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), 775 .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
653 .ccid_hc_tx_init = ccid2_hc_tx_init, 776 .ccid_hc_tx_init = ccid2_hc_tx_init,
654 .ccid_hc_tx_exit = ccid2_hc_tx_exit, 777 .ccid_hc_tx_exit = ccid2_hc_tx_exit,
655 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, 778 .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
656 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, 779 .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
657 .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, 780 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
658 .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, 781 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
659 .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), 782 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
660 .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
661}; 783};
662 784
663#ifdef CONFIG_IP_DCCP_CCID2_DEBUG 785#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 8b7a2dee2f6d..2c94ca029010 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -42,49 +42,34 @@ struct ccid2_seq {
42 42
43/** struct ccid2_hc_tx_sock - CCID2 TX half connection 43/** struct ccid2_hc_tx_sock - CCID2 TX half connection
44 * 44 *
45 * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 45 * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
46 * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) 46 * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465)
47 * @srtt: smoothed RTT estimate, scaled by 2^3 47 * @ccid2hctx_lastrtt -time RTT was last measured
48 * @mdev: smoothed RTT variation, scaled by 2^2 48 * @ccid2hctx_rpseq - last consecutive seqno
49 * @mdev_max: maximum of @mdev during one flight 49 * @ccid2hctx_rpdupack - dupacks since rpseq
50 * @rttvar: moving average/maximum of @mdev_max 50*/
51 * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
52 * @rtt_seq: to decay RTTVAR at most once per flight
53 * @rpseq: last consecutive seqno
54 * @rpdupack: dupacks since rpseq
55 * @av_chunks: list of Ack Vectors received on current skb
56 */
57struct ccid2_hc_tx_sock { 51struct ccid2_hc_tx_sock {
58 u32 cwnd; 52 u32 ccid2hctx_cwnd;
59 u32 ssthresh; 53 u32 ccid2hctx_ssthresh;
60 u32 pipe; 54 u32 ccid2hctx_pipe;
61 u32 packets_acked; 55 u32 ccid2hctx_packets_acked;
62 struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; 56 struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX];
63 int seqbufc; 57 int ccid2hctx_seqbufc;
64 struct ccid2_seq *seqh; 58 struct ccid2_seq *ccid2hctx_seqh;
65 struct ccid2_seq *seqt; 59 struct ccid2_seq *ccid2hctx_seqt;
66 /* RTT measurement: variables/principles are the same as in TCP */ 60 long ccid2hctx_rto;
67 u32 srtt, 61 long ccid2hctx_srtt;
68 mdev, 62 long ccid2hctx_rttvar;
69 mdev_max, 63 unsigned long ccid2hctx_lastrtt;
70 rttvar, 64 struct timer_list ccid2hctx_rtotimer;
71 rto; 65 u64 ccid2hctx_rpseq;
72 u64 rtt_seq:48; 66 int ccid2hctx_rpdupack;
73 struct timer_list rtotimer; 67 unsigned long ccid2hctx_last_cong;
74 u64 rpseq; 68 u64 ccid2hctx_high_ack;
75 int rpdupack;
76 unsigned long last_cong;
77 u64 high_ack;
78 struct list_head av_chunks;
79}; 69};
80 70
81static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx)
82{
83 return (hctx->pipe >= hctx->cwnd);
84}
85
86struct ccid2_hc_rx_sock { 71struct ccid2_hc_rx_sock {
87 int data; 72 int ccid2hcrx_data;
88}; 73};
89 74
90static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) 75static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 06cfdad84a6a..3b8bd7ca6761 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -49,41 +49,75 @@ static int ccid3_debug;
49/* 49/*
50 * Transmitter Half-Connection Routines 50 * Transmitter Half-Connection Routines
51 */ 51 */
52/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ 52#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
53static int do_osc_prev = true; 53static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
54{
55 static char *ccid3_state_names[] = {
56 [TFRC_SSTATE_NO_SENT] = "NO_SENT",
57 [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
58 [TFRC_SSTATE_FBACK] = "FBACK",
59 [TFRC_SSTATE_TERM] = "TERM",
60 };
61
62 return ccid3_state_names[state];
63}
64#endif
65
66static void ccid3_hc_tx_set_state(struct sock *sk,
67 enum ccid3_hc_tx_states state)
68{
69 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
70 enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
71
72 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
73 dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
74 ccid3_tx_state_name(state));
75 WARN_ON(state == oldstate);
76 hctx->ccid3hctx_state = state;
77}
54 78
55/* 79/*
56 * Compute the initial sending rate X_init in the manner of RFC 3390: 80 * Compute the initial sending rate X_init in the manner of RFC 3390:
57 * 81 *
58 * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT 82 * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
59 * 83 *
84 * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
85 * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
60 * For consistency with other parts of the code, X_init is scaled by 2^6. 86 * For consistency with other parts of the code, X_init is scaled by 2^6.
61 */ 87 */
62static inline u64 rfc3390_initial_rate(struct sock *sk) 88static inline u64 rfc3390_initial_rate(struct sock *sk)
63{ 89{
64 const u32 mps = dccp_sk(sk)->dccps_mss_cache, 90 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
65 w_init = clamp(4380U, 2 * mps, 4 * mps); 91 const __u32 w_init = clamp_t(__u32, 4380U,
92 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s);
66 93
67 return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); 94 return scaled_div(w_init << 6, hctx->ccid3hctx_rtt);
68} 95}
69 96
70/** 97/*
71 * ccid3_update_send_interval - Calculate new t_ipi = s / X 98 * Recalculate t_ipi and delta (should be called whenever X changes)
72 * This respects the granularity of X (64 * bytes/second) and enforces the
73 * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342.
74 */ 99 */
75static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) 100static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx)
76{ 101{
77 if (unlikely(hctx->x <= hctx->s)) 102 /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
78 hctx->x = hctx->s; 103 hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6,
79 hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); 104 hctx->ccid3hctx_x);
105
106 /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
107 hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
108 TFRC_OPSYS_HALF_TIME_GRAN);
109
110 ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n",
111 hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta,
112 hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6));
113
80} 114}
81 115
82static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) 116static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
83{ 117{
84 u32 delta = ktime_us_delta(now, hctx->t_last_win_count); 118 u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count);
85 119
86 return delta / hctx->rtt; 120 return delta / hctx->ccid3hctx_rtt;
87} 121}
88 122
89/** 123/**
@@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now)
99static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) 133static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
100{ 134{
101 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 135 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
102 u64 min_rate = 2 * hctx->x_recv; 136 __u64 min_rate = 2 * hctx->ccid3hctx_x_recv;
103 const u64 old_x = hctx->x; 137 const __u64 old_x = hctx->ccid3hctx_x;
104 ktime_t now = stamp ? *stamp : ktime_get_real(); 138 ktime_t now = stamp ? *stamp : ktime_get_real();
105 139
106 /* 140 /*
@@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
111 */ 145 */
112 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { 146 if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) {
113 min_rate = rfc3390_initial_rate(sk); 147 min_rate = rfc3390_initial_rate(sk);
114 min_rate = max(min_rate, 2 * hctx->x_recv); 148 min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv);
115 } 149 }
116 150
117 if (hctx->p > 0) { 151 if (hctx->ccid3hctx_p > 0) {
118 152
119 hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); 153 hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6,
154 min_rate);
155 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
156 (((__u64)hctx->ccid3hctx_s) << 6) /
157 TFRC_T_MBI);
120 158
121 } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { 159 } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld)
160 - (s64)hctx->ccid3hctx_rtt >= 0) {
122 161
123 hctx->x = min(2 * hctx->x, min_rate); 162 hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate);
124 hctx->x = max(hctx->x, 163 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x,
125 scaled_div(((u64)hctx->s) << 6, hctx->rtt)); 164 scaled_div(((__u64)hctx->ccid3hctx_s) << 6,
126 hctx->t_ld = now; 165 hctx->ccid3hctx_rtt));
166 hctx->ccid3hctx_t_ld = now;
127 } 167 }
128 168
129 if (hctx->x != old_x) { 169 if (hctx->ccid3hctx_x != old_x) {
130 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " 170 ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
131 "X_recv=%u\n", (unsigned)(old_x >> 6), 171 "X_recv=%u\n", (unsigned)(old_x >> 6),
132 (unsigned)(hctx->x >> 6), hctx->x_calc, 172 (unsigned)(hctx->ccid3hctx_x >> 6),
133 (unsigned)(hctx->x_recv >> 6)); 173 hctx->ccid3hctx_x_calc,
174 (unsigned)(hctx->ccid3hctx_x_recv >> 6));
134 175
135 ccid3_update_send_interval(hctx); 176 ccid3_update_send_interval(hctx);
136 } 177 }
137} 178}
138 179
139/* 180/*
140 * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) 181 * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
141 * @new_len: DCCP payload size in bytes (not used by all methods) 182 * @len: DCCP packet payload size in bytes
142 */ 183 */
143static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) 184static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len)
144{ 185{
145#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) 186 const u16 old_s = hctx->ccid3hctx_s;
146 return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); 187
147#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) 188 hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9);
148 return max(ccid3_hc_tx_sk(sk)->s, new_len); 189
149#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ 190 if (hctx->ccid3hctx_s != old_s)
150 return dccp_sk(sk)->dccps_mss_cache; 191 ccid3_update_send_interval(hctx);
151#endif
152} 192}
153 193
154/* 194/*
@@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len)
158static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, 198static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx,
159 ktime_t now) 199 ktime_t now)
160{ 200{
161 u32 delta = ktime_us_delta(now, hctx->t_last_win_count), 201 u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count),
162 quarter_rtts = (4 * delta) / hctx->rtt; 202 quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt;
163 203
164 if (quarter_rtts > 0) { 204 if (quarter_rtts > 0) {
165 hctx->t_last_win_count = now; 205 hctx->ccid3hctx_t_last_win_count = now;
166 hctx->last_win_count += min(quarter_rtts, 5U); 206 hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U);
167 hctx->last_win_count &= 0xF; /* mod 16 */ 207 hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */
168 } 208 }
169} 209}
170 210
@@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
181 goto restart_timer; 221 goto restart_timer;
182 } 222 }
183 223
184 ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, 224 ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk,
185 hctx->feedback ? "" : "out"); 225 ccid3_tx_state_name(hctx->ccid3hctx_state));
186 226
187 /* Ignore and do not restart after leaving the established state */ 227 if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK)
188 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) 228 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
229 else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
189 goto out; 230 goto out;
190 231
191 /* Reset feedback state to "no feedback received" */
192 hctx->feedback = false;
193
194 /* 232 /*
195 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 233 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
196 * RTO is 0 if and only if no feedback has been received yet.
197 */ 234 */
198 if (hctx->t_rto == 0 || hctx->p == 0) { 235 if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */
236 hctx->ccid3hctx_p == 0) {
199 237
200 /* halve send rate directly */ 238 /* halve send rate directly */
201 hctx->x /= 2; 239 hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2,
240 (((__u64)hctx->ccid3hctx_s) << 6) /
241 TFRC_T_MBI);
202 ccid3_update_send_interval(hctx); 242 ccid3_update_send_interval(hctx);
203
204 } else { 243 } else {
205 /* 244 /*
206 * Modify the cached value of X_recv 245 * Modify the cached value of X_recv
@@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
212 * 251 *
213 * Note that X_recv is scaled by 2^6 while X_calc is not 252 * Note that X_recv is scaled by 2^6 while X_calc is not
214 */ 253 */
215 BUG_ON(hctx->p && !hctx->x_calc); 254 BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
216 255
217 if (hctx->x_calc > (hctx->x_recv >> 5)) 256 if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))
218 hctx->x_recv /= 2; 257 hctx->ccid3hctx_x_recv =
258 max(hctx->ccid3hctx_x_recv / 2,
259 (((__u64)hctx->ccid3hctx_s) << 6) /
260 (2 * TFRC_T_MBI));
219 else { 261 else {
220 hctx->x_recv = hctx->x_calc; 262 hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc;
221 hctx->x_recv <<= 4; 263 hctx->ccid3hctx_x_recv <<= 4;
222 } 264 }
223 ccid3_hc_tx_update_x(sk, NULL); 265 ccid3_hc_tx_update_x(sk, NULL);
224 } 266 }
225 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", 267 ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
226 (unsigned long long)hctx->x); 268 (unsigned long long)hctx->ccid3hctx_x);
227 269
228 /* 270 /*
229 * Set new timeout for the nofeedback timer. 271 * Set new timeout for the nofeedback timer.
230 * See comments in packet_recv() regarding the value of t_RTO. 272 * See comments in packet_recv() regarding the value of t_RTO.
231 */ 273 */
232 if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ 274 if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */
233 t_nfb = TFRC_INITIAL_TIMEOUT; 275 t_nfb = TFRC_INITIAL_TIMEOUT;
234 else 276 else
235 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 277 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
236 278
237restart_timer: 279restart_timer:
238 sk_reset_timer(sk, &hctx->no_feedback_timer, 280 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
239 jiffies + usecs_to_jiffies(t_nfb)); 281 jiffies + usecs_to_jiffies(t_nfb));
240out: 282out:
241 bh_unlock_sock(sk); 283 bh_unlock_sock(sk);
242 sock_put(sk); 284 sock_put(sk);
243} 285}
244 286
245/** 287/*
246 * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets 288 * returns
247 * @skb: next packet candidate to send on @sk 289 * > 0: delay (in msecs) that should pass before actually sending
248 * This function uses the convention of ccid_packet_dequeue_eval() and 290 * = 0: can send immediately
249 * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. 291 * < 0: error condition; do not send packet
250 */ 292 */
251static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) 293static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
252{ 294{
@@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
263 if (unlikely(skb->len == 0)) 305 if (unlikely(skb->len == 0))
264 return -EBADMSG; 306 return -EBADMSG;
265 307
266 if (hctx->s == 0) { 308 switch (hctx->ccid3hctx_state) {
267 sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + 309 case TFRC_SSTATE_NO_SENT:
310 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
311 (jiffies +
268 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); 312 usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
269 hctx->last_win_count = 0; 313 hctx->ccid3hctx_last_win_count = 0;
270 hctx->t_last_win_count = now; 314 hctx->ccid3hctx_t_last_win_count = now;
271 315
272 /* Set t_0 for initial packet */ 316 /* Set t_0 for initial packet */
273 hctx->t_nom = now; 317 hctx->ccid3hctx_t_nom = now;
318
319 hctx->ccid3hctx_s = skb->len;
274 320
275 /* 321 /*
276 * Use initial RTT sample when available: recommended by erratum 322 * Use initial RTT sample when available: recommended by erratum
@@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
279 */ 325 */
280 if (dp->dccps_syn_rtt) { 326 if (dp->dccps_syn_rtt) {
281 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); 327 ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
282 hctx->rtt = dp->dccps_syn_rtt; 328 hctx->ccid3hctx_rtt = dp->dccps_syn_rtt;
283 hctx->x = rfc3390_initial_rate(sk); 329 hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
284 hctx->t_ld = now; 330 hctx->ccid3hctx_t_ld = now;
285 } else { 331 } else {
286 /* 332 /*
287 * Sender does not have RTT sample: 333 * Sender does not have RTT sample:
@@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
289 * is needed in several parts (e.g. window counter); 335 * is needed in several parts (e.g. window counter);
290 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. 336 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
291 */ 337 */
292 hctx->rtt = DCCP_FALLBACK_RTT; 338 hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT;
293 hctx->x = dp->dccps_mss_cache; 339 hctx->ccid3hctx_x = hctx->ccid3hctx_s;
294 hctx->x <<= 6; 340 hctx->ccid3hctx_x <<= 6;
295 } 341 }
296
297 /* Compute t_ipi = s / X */
298 hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len);
299 ccid3_update_send_interval(hctx); 342 ccid3_update_send_interval(hctx);
300 343
301 /* Seed value for Oscillation Prevention (sec. 4.5) */ 344 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
302 hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); 345 break;
303 346 case TFRC_SSTATE_NO_FBACK:
304 } else { 347 case TFRC_SSTATE_FBACK:
305 delay = ktime_us_delta(hctx->t_nom, now); 348 delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now);
306 ccid3_pr_debug("delay=%ld\n", (long)delay); 349 ccid3_pr_debug("delay=%ld\n", (long)delay);
307 /* 350 /*
308 * Scheduling of packet transmissions [RFC 3448, 4.6] 351 * Scheduling of packet transmissions [RFC 3448, 4.6]
@@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
312 * else 355 * else
313 * // send the packet in (t_nom - t_now) milliseconds. 356 * // send the packet in (t_nom - t_now) milliseconds.
314 */ 357 */
315 if (delay >= TFRC_T_DELTA) 358 if (delay - (s64)hctx->ccid3hctx_delta >= 1000)
316 return (u32)delay / USEC_PER_MSEC; 359 return (u32)delay / 1000L;
317 360
318 ccid3_hc_tx_update_win_count(hctx, now); 361 ccid3_hc_tx_update_win_count(hctx, now);
362 break;
363 case TFRC_SSTATE_TERM:
364 DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
365 return -EINVAL;
319 } 366 }
320 367
321 /* prepare to send now (add options etc.) */ 368 /* prepare to send now (add options etc.) */
322 dp->dccps_hc_tx_insert_options = 1; 369 dp->dccps_hc_tx_insert_options = 1;
323 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; 370 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
324 371
325 /* set the nominal send time for the next following packet */ 372 /* set the nominal send time for the next following packet */
326 hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); 373 hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom,
327 return CCID_PACKET_SEND_AT_ONCE; 374 hctx->ccid3hctx_t_ipi);
375 return 0;
328} 376}
329 377
330static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) 378static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
379 unsigned int len)
331{ 380{
332 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 381 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
333 382
334 /* Changes to s will become effective the next time X is computed */ 383 ccid3_hc_tx_update_s(hctx, len);
335 hctx->s = ccid3_hc_tx_measure_packet_size(sk, len);
336 384
337 if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) 385 if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss))
338 DCCP_CRIT("packet history - out of memory!"); 386 DCCP_CRIT("packet history - out of memory!");
339} 387}
340 388
341static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) 389static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
342{ 390{
343 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 391 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
344 struct tfrc_tx_hist_entry *acked; 392 struct ccid3_options_received *opt_recv;
345 ktime_t now; 393 ktime_t now;
346 unsigned long t_nfb; 394 unsigned long t_nfb;
347 u32 r_sample; 395 u32 pinv, r_sample;
348 396
349 /* we are only interested in ACKs */ 397 /* we are only interested in ACKs */
350 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || 398 if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
351 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) 399 DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
352 return; 400 return;
353 /* 401 /* ... and only in the established state */
354 * Locate the acknowledged packet in the TX history. 402 if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK &&
355 * 403 hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
356 * Returning "entry not found" here can for instance happen when 404 return;
357 * - the host has not sent out anything (e.g. a passive server), 405
358 * - the Ack is outdated (packet with higher Ack number was received), 406 opt_recv = &hctx->ccid3hctx_options_received;
359 * - it is a bogus Ack (for a packet not sent on this connection). 407 now = ktime_get_real();
360 */ 408
361 acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); 409 /* Estimate RTT from history if ACK number is valid */
362 if (acked == NULL) 410 r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist,
411 DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
412 if (r_sample == 0) {
413 DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
414 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
415 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
363 return; 416 return;
364 /* For the sake of RTT sampling, ignore/remove all older entries */ 417 }
365 tfrc_tx_hist_purge(&acked->next);
366 418
367 /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ 419 /* Update receive rate in units of 64 * bytes/second */
368 now = ktime_get_real(); 420 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
369 r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); 421 hctx->ccid3hctx_x_recv <<= 6;
370 hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9);
371 422
423 /* Update loss event rate (which is scaled by 1e6) */
424 pinv = opt_recv->ccid3or_loss_event_rate;
425 if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
426 hctx->ccid3hctx_p = 0;
427 else /* can not exceed 100% */
428 hctx->ccid3hctx_p = scaled_div(1, pinv);
429 /*
430 * Validate new RTT sample and update moving average
431 */
432 r_sample = dccp_sample_rtt(sk, r_sample);
433 hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9);
372 /* 434 /*
373 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 435 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
374 */ 436 */
375 if (!hctx->feedback) { 437 if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
376 hctx->feedback = true; 438 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
377 439
378 if (hctx->t_rto == 0) { 440 if (hctx->ccid3hctx_t_rto == 0) {
379 /* 441 /*
380 * Initial feedback packet: Larger Initial Windows (4.2) 442 * Initial feedback packet: Larger Initial Windows (4.2)
381 */ 443 */
382 hctx->x = rfc3390_initial_rate(sk); 444 hctx->ccid3hctx_x = rfc3390_initial_rate(sk);
383 hctx->t_ld = now; 445 hctx->ccid3hctx_t_ld = now;
384 446
385 ccid3_update_send_interval(hctx); 447 ccid3_update_send_interval(hctx);
386 448
387 goto done_computing_x; 449 goto done_computing_x;
388 } else if (hctx->p == 0) { 450 } else if (hctx->ccid3hctx_p == 0) {
389 /* 451 /*
390 * First feedback after nofeedback timer expiry (4.3) 452 * First feedback after nofeedback timer expiry (4.3)
391 */ 453 */
@@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
394 } 456 }
395 457
396 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ 458 /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
397 if (hctx->p > 0) 459 if (hctx->ccid3hctx_p > 0)
398 hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); 460 hctx->ccid3hctx_x_calc =
461 tfrc_calc_x(hctx->ccid3hctx_s,
462 hctx->ccid3hctx_rtt,
463 hctx->ccid3hctx_p);
399 ccid3_hc_tx_update_x(sk, &now); 464 ccid3_hc_tx_update_x(sk, &now);
400 465
401done_computing_x: 466done_computing_x:
402 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " 467 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
403 "p=%u, X_calc=%u, X_recv=%u, X=%u\n", 468 "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
404 dccp_role(sk), sk, hctx->rtt, r_sample, 469 dccp_role(sk),
405 hctx->s, hctx->p, hctx->x_calc, 470 sk, hctx->ccid3hctx_rtt, r_sample,
406 (unsigned)(hctx->x_recv >> 6), 471 hctx->ccid3hctx_s, hctx->ccid3hctx_p,
407 (unsigned)(hctx->x >> 6)); 472 hctx->ccid3hctx_x_calc,
408 /* 473 (unsigned)(hctx->ccid3hctx_x_recv >> 6),
409 * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to 474 (unsigned)(hctx->ccid3hctx_x >> 6));
410 * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This
411 * can be useful if few connections share a link, avoiding that buffer
412 * fill levels (RTT) oscillate as a result of frequent adjustments to X.
413 * A useful presentation with background information is in
414 * Joerg Widmer, "Equation-Based Congestion Control",
415 * MSc Thesis, University of Mannheim, Germany, 2000
416 * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation").
417 */
418 if (do_osc_prev) {
419 r_sample = tfrc_scaled_sqrt(r_sample);
420 /*
421 * The modulation can work in both ways: increase/decrease t_ipi
422 * according to long-term increases/decreases of the RTT. The
423 * former is a useful measure, since it works against queue
424 * build-up. The latter temporarily increases the sending rate,
425 * so that buffers fill up more quickly. This in turn causes
426 * the RTT to increase, so that either later reduction becomes
427 * necessary or the RTT stays at a very high level. Decreasing
428 * t_ipi is therefore not supported.
429 * Furthermore, during the initial slow-start phase the RTT
430 * naturally increases, where using the algorithm would cause
431 * delays. Hence it is disabled during the initial slow-start.
432 */
433 if (r_sample > hctx->r_sqmean && hctx->p > 0)
434 hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample,
435 hctx->r_sqmean);
436 hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI);
437 /* update R_sqmean _after_ computing the modulation factor */
438 hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9);
439 }
440 475
441 /* unschedule no feedback timer */ 476 /* unschedule no feedback timer */
442 sk_stop_timer(sk, &hctx->no_feedback_timer); 477 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
443 478
444 /* 479 /*
445 * As we have calculated new ipi, delta, t_nom it is possible 480 * As we have calculated new ipi, delta, t_nom it is possible
@@ -453,66 +488,95 @@ done_computing_x:
453 * This can help avoid triggering the nofeedback timer too 488 * This can help avoid triggering the nofeedback timer too
454 * often ('spinning') on LANs with small RTTs. 489 * often ('spinning') on LANs with small RTTs.
455 */ 490 */
456 hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * 491 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
457 (USEC_PER_SEC / 1000))); 492 (CONFIG_IP_DCCP_CCID3_RTO *
493 (USEC_PER_SEC / 1000)));
458 /* 494 /*
459 * Schedule no feedback timer to expire in 495 * Schedule no feedback timer to expire in
460 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) 496 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
461 */ 497 */
462 t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); 498 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
463 499
464 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " 500 ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
465 "expire in %lu jiffies (%luus)\n", 501 "expire in %lu jiffies (%luus)\n",
466 dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); 502 dccp_role(sk),
503 sk, usecs_to_jiffies(t_nfb), t_nfb);
467 504
468 sk_reset_timer(sk, &hctx->no_feedback_timer, 505 sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
469 jiffies + usecs_to_jiffies(t_nfb)); 506 jiffies + usecs_to_jiffies(t_nfb));
470} 507}
471 508
472static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, 509static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
473 u8 option, u8 *optval, u8 optlen) 510 unsigned char len, u16 idx,
511 unsigned char *value)
474{ 512{
513 int rc = 0;
514 const struct dccp_sock *dp = dccp_sk(sk);
475 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 515 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
516 struct ccid3_options_received *opt_recv;
476 __be32 opt_val; 517 __be32 opt_val;
477 518
478 switch (option) { 519 opt_recv = &hctx->ccid3hctx_options_received;
479 case TFRC_OPT_RECEIVE_RATE:
480 case TFRC_OPT_LOSS_EVENT_RATE:
481 /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
482 if (packet_type == DCCP_PKT_DATA)
483 break;
484 if (unlikely(optlen != 4)) {
485 DCCP_WARN("%s(%p), invalid len %d for %u\n",
486 dccp_role(sk), sk, optlen, option);
487 return -EINVAL;
488 }
489 opt_val = ntohl(get_unaligned((__be32 *)optval));
490 520
491 if (option == TFRC_OPT_RECEIVE_RATE) { 521 if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
492 /* Receive Rate is kept in units of 64 bytes/second */ 522 opt_recv->ccid3or_seqno = dp->dccps_gsr;
493 hctx->x_recv = opt_val; 523 opt_recv->ccid3or_loss_event_rate = ~0;
494 hctx->x_recv <<= 6; 524 opt_recv->ccid3or_loss_intervals_idx = 0;
525 opt_recv->ccid3or_loss_intervals_len = 0;
526 opt_recv->ccid3or_receive_rate = 0;
527 }
495 528
496 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", 529 switch (option) {
497 dccp_role(sk), sk, opt_val); 530 case TFRC_OPT_LOSS_EVENT_RATE:
531 if (unlikely(len != 4)) {
532 DCCP_WARN("%s(%p), invalid len %d "
533 "for TFRC_OPT_LOSS_EVENT_RATE\n",
534 dccp_role(sk), sk, len);
535 rc = -EINVAL;
498 } else { 536 } else {
499 /* Update the fixpoint Loss Event Rate fraction */ 537 opt_val = get_unaligned((__be32 *)value);
500 hctx->p = tfrc_invert_loss_event_rate(opt_val); 538 opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
501
502 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", 539 ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
503 dccp_role(sk), sk, opt_val); 540 dccp_role(sk), sk,
541 opt_recv->ccid3or_loss_event_rate);
504 } 542 }
543 break;
544 case TFRC_OPT_LOSS_INTERVALS:
545 opt_recv->ccid3or_loss_intervals_idx = idx;
546 opt_recv->ccid3or_loss_intervals_len = len;
547 ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
548 dccp_role(sk), sk,
549 opt_recv->ccid3or_loss_intervals_idx,
550 opt_recv->ccid3or_loss_intervals_len);
551 break;
552 case TFRC_OPT_RECEIVE_RATE:
553 if (unlikely(len != 4)) {
554 DCCP_WARN("%s(%p), invalid len %d "
555 "for TFRC_OPT_RECEIVE_RATE\n",
556 dccp_role(sk), sk, len);
557 rc = -EINVAL;
558 } else {
559 opt_val = get_unaligned((__be32 *)value);
560 opt_recv->ccid3or_receive_rate = ntohl(opt_val);
561 ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
562 dccp_role(sk), sk,
563 opt_recv->ccid3or_receive_rate);
564 }
565 break;
505 } 566 }
506 return 0; 567
568 return rc;
507} 569}
508 570
509static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) 571static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
510{ 572{
511 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); 573 struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
512 574
513 hctx->hist = NULL; 575 hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
514 setup_timer(&hctx->no_feedback_timer, 576 hctx->ccid3hctx_hist = NULL;
515 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); 577 setup_timer(&hctx->ccid3hctx_no_feedback_timer,
578 ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
579
516 return 0; 580 return 0;
517} 581}
518 582
@@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk)
520{ 584{
521 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 585 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
522 586
523 sk_stop_timer(sk, &hctx->no_feedback_timer); 587 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
524 tfrc_tx_hist_purge(&hctx->hist); 588 sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
589
590 tfrc_tx_hist_purge(&hctx->ccid3hctx_hist);
525} 591}
526 592
527static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) 593static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
528{ 594{
529 info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; 595 struct ccid3_hc_tx_sock *hctx;
530 info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; 596
597 /* Listen socks doesn't have a private CCID block */
598 if (sk->sk_state == DCCP_LISTEN)
599 return;
600
601 hctx = ccid3_hc_tx_sk(sk);
602 info->tcpi_rto = hctx->ccid3hctx_t_rto;
603 info->tcpi_rtt = hctx->ccid3hctx_rtt;
531} 604}
532 605
533static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, 606static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
534 u32 __user *optval, int __user *optlen) 607 u32 __user *optval, int __user *optlen)
535{ 608{
536 const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 609 const struct ccid3_hc_tx_sock *hctx;
537 struct tfrc_tx_info tfrc;
538 const void *val; 610 const void *val;
539 611
612 /* Listen socks doesn't have a private CCID block */
613 if (sk->sk_state == DCCP_LISTEN)
614 return -EINVAL;
615
616 hctx = ccid3_hc_tx_sk(sk);
540 switch (optname) { 617 switch (optname) {
541 case DCCP_SOCKOPT_CCID_TX_INFO: 618 case DCCP_SOCKOPT_CCID_TX_INFO:
542 if (len < sizeof(tfrc)) 619 if (len < sizeof(hctx->ccid3hctx_tfrc))
543 return -EINVAL; 620 return -EINVAL;
544 tfrc.tfrctx_x = hctx->x; 621 len = sizeof(hctx->ccid3hctx_tfrc);
545 tfrc.tfrctx_x_recv = hctx->x_recv; 622 val = &hctx->ccid3hctx_tfrc;
546 tfrc.tfrctx_x_calc = hctx->x_calc;
547 tfrc.tfrctx_rtt = hctx->rtt;
548 tfrc.tfrctx_p = hctx->p;
549 tfrc.tfrctx_rto = hctx->t_rto;
550 tfrc.tfrctx_ipi = hctx->t_ipi;
551 len = sizeof(tfrc);
552 val = &tfrc;
553 break; 623 break;
554 default: 624 default:
555 return -ENOPROTOOPT; 625 return -ENOPROTOOPT;
@@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
564/* 634/*
565 * Receiver Half-Connection Routines 635 * Receiver Half-Connection Routines
566 */ 636 */
637
638/* CCID3 feedback types */
639enum ccid3_fback_type {
640 CCID3_FBACK_NONE = 0,
641 CCID3_FBACK_INITIAL,
642 CCID3_FBACK_PERIODIC,
643 CCID3_FBACK_PARAM_CHANGE
644};
645
646#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
647static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
648{
649 static char *ccid3_rx_state_names[] = {
650 [TFRC_RSTATE_NO_DATA] = "NO_DATA",
651 [TFRC_RSTATE_DATA] = "DATA",
652 [TFRC_RSTATE_TERM] = "TERM",
653 };
654
655 return ccid3_rx_state_names[state];
656}
657#endif
658
659static void ccid3_hc_rx_set_state(struct sock *sk,
660 enum ccid3_hc_rx_states state)
661{
662 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
663 enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
664
665 ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
666 dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
667 ccid3_rx_state_name(state));
668 WARN_ON(state == oldstate);
669 hcrx->ccid3hcrx_state = state;
670}
671
567static void ccid3_hc_rx_send_feedback(struct sock *sk, 672static void ccid3_hc_rx_send_feedback(struct sock *sk,
568 const struct sk_buff *skb, 673 const struct sk_buff *skb,
569 enum ccid3_fback_type fbtype) 674 enum ccid3_fback_type fbtype)
570{ 675{
571 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 676 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
677 struct dccp_sock *dp = dccp_sk(sk);
678 ktime_t now;
679 s64 delta = 0;
680
681 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM))
682 return;
683
684 now = ktime_get_real();
572 685
573 switch (fbtype) { 686 switch (fbtype) {
574 case CCID3_FBACK_INITIAL: 687 case CCID3_FBACK_INITIAL:
575 hcrx->x_recv = 0; 688 hcrx->ccid3hcrx_x_recv = 0;
576 hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ 689 hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */
577 break; 690 break;
578 case CCID3_FBACK_PARAM_CHANGE: 691 case CCID3_FBACK_PARAM_CHANGE:
579 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) {
580 /*
581 * rfc3448bis-06, 6.3.1: First packet(s) lost or marked
582 * FIXME: in rfc3448bis the receiver returns X_recv=0
583 * here as it normally would in the first feedback packet.
584 * However this is not possible yet, since the code still
585 * uses RFC 3448, i.e.
586 * If (p > 0)
587 * Calculate X_calc using the TCP throughput equation.
588 * X = max(min(X_calc, 2*X_recv), s/t_mbi);
589 * would bring X down to s/t_mbi. That is why we return
590 * X_recv according to rfc3448bis-06 for the moment.
591 */
592 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist),
593 rtt = tfrc_rx_hist_rtt(&hcrx->hist);
594
595 hcrx->x_recv = scaled_div32(s, 2 * rtt);
596 break;
597 }
598 /* 692 /*
599 * When parameters change (new loss or p > p_prev), we do not 693 * When parameters change (new loss or p > p_prev), we do not
600 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so 694 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
601 * always check whether at least RTT time units were covered. 695 * need to reuse the previous value of X_recv. However, when
696 * X_recv was 0 (due to early loss), this would kill X down to
697 * s/t_mbi (i.e. one packet in 64 seconds).
698 * To avoid such drastic reduction, we approximate X_recv as
699 * the number of bytes since last feedback.
700 * This is a safe fallback, since X is bounded above by X_calc.
602 */ 701 */
603 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 702 if (hcrx->ccid3hcrx_x_recv > 0)
604 break; 703 break;
704 /* fall through */
605 case CCID3_FBACK_PERIODIC: 705 case CCID3_FBACK_PERIODIC:
606 /* 706 delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback);
607 * Step (2) of rfc3448bis-06, 6.2: 707 if (delta <= 0)
608 * - if no data packets have been received, just restart timer 708 DCCP_BUG("delta (%ld) <= 0", (long)delta);
609 * - if data packets have been received, re-compute X_recv 709 else
610 */ 710 hcrx->ccid3hcrx_x_recv =
611 if (hcrx->hist.bytes_recvd == 0) 711 scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
612 goto prepare_for_next_time;
613 hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv);
614 break; 712 break;
615 default: 713 default:
616 return; 714 return;
617 } 715 }
618 716
619 ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); 717 ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
718 hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv);
620 719
621 dccp_sk(sk)->dccps_hc_rx_insert_options = 1; 720 hcrx->ccid3hcrx_tstamp_last_feedback = now;
622 dccp_send_ack(sk); 721 hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval;
722 hcrx->ccid3hcrx_bytes_recv = 0;
623 723
624prepare_for_next_time: 724 dp->dccps_hc_rx_insert_options = 1;
625 tfrc_rx_hist_restart_byte_counter(&hcrx->hist); 725 dccp_send_ack(sk);
626 hcrx->last_counter = dccp_hdr(skb)->dccph_ccval;
627 hcrx->feedback = fbtype;
628} 726}
629 727
630static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) 728static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
631{ 729{
632 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 730 const struct ccid3_hc_rx_sock *hcrx;
633 __be32 x_recv, pinv; 731 __be32 x_recv, pinv;
634 732
635 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) 733 if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
636 return 0; 734 return 0;
637 735
736 hcrx = ccid3_hc_rx_sk(sk);
737
638 if (dccp_packet_without_ack(skb)) 738 if (dccp_packet_without_ack(skb))
639 return 0; 739 return 0;
640 740
641 x_recv = htonl(hcrx->x_recv); 741 x_recv = htonl(hcrx->ccid3hcrx_x_recv);
642 pinv = htonl(hcrx->p_inverse); 742 pinv = htonl(hcrx->ccid3hcrx_pinv);
643 743
644 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, 744 if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
645 &pinv, sizeof(pinv)) || 745 &pinv, sizeof(pinv)) ||
@@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
662static u32 ccid3_first_li(struct sock *sk) 762static u32 ccid3_first_li(struct sock *sk)
663{ 763{
664 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 764 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
665 u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), 765 u32 x_recv, p, delta;
666 rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p;
667 u64 fval; 766 u64 fval;
668 767
669 /* 768 if (hcrx->ccid3hcrx_rtt == 0) {
670 * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p 769 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
671 * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p 770 hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT;
672 * is about 20.64%. This yields an interval length of 4.84 (rounded up). 771 }
673 */
674 if (unlikely(hcrx->feedback == CCID3_FBACK_NONE))
675 return 5;
676 772
677 x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); 773 delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback));
678 if (x_recv == 0) 774 x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta);
679 goto failed; 775 if (x_recv == 0) { /* would also trigger divide-by-zero */
776 DCCP_WARN("X_recv==0\n");
777 if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) {
778 DCCP_BUG("stored value of X_recv is zero");
779 return ~0U;
780 }
781 }
680 782
681 fval = scaled_div32(scaled_div(s, rtt), x_recv); 783 fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt);
784 fval = scaled_div32(fval, x_recv);
682 p = tfrc_calc_x_reverse_lookup(fval); 785 p = tfrc_calc_x_reverse_lookup(fval);
683 786
684 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " 787 ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
685 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); 788 "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
686 789
687 if (p > 0) 790 return p == 0 ? ~0U : scaled_div(1, p);
688 return scaled_div(1, p);
689failed:
690 return UINT_MAX;
691} 791}
692 792
693static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) 793static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
694{ 794{
695 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 795 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
796 enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
696 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; 797 const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
697 const bool is_data_packet = dccp_data_packet(skb); 798 const bool is_data_packet = dccp_data_packet(skb);
698 799
800 if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) {
801 if (is_data_packet) {
802 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
803 do_feedback = CCID3_FBACK_INITIAL;
804 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
805 hcrx->ccid3hcrx_s = payload;
806 /*
807 * Not necessary to update ccid3hcrx_bytes_recv here,
808 * since X_recv = 0 for the first feedback packet (cf.
809 * RFC 3448, 6.3) -- gerrit
810 */
811 }
812 goto update_records;
813 }
814
815 if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb))
816 return; /* done receiving */
817
818 if (is_data_packet) {
819 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
820 /*
821 * Update moving-average of s and the sum of received payload bytes
822 */
823 hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9);
824 hcrx->ccid3hcrx_bytes_recv += payload;
825 }
826
699 /* 827 /*
700 * Perform loss detection and handle pending losses 828 * Perform loss detection and handle pending losses
701 */ 829 */
702 if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, 830 if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist,
703 skb, ndp, ccid3_first_li, sk)) 831 skb, ndp, ccid3_first_li, sk)) {
704 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); 832 do_feedback = CCID3_FBACK_PARAM_CHANGE;
833 goto done_receiving;
834 }
835
836 if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist))
837 return; /* done receiving */
838
705 /* 839 /*
706 * Feedback for first non-empty data packet (RFC 3448, 6.3) 840 * Handle data packets: RTT sampling and monitoring p
707 */ 841 */
708 else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) 842 if (unlikely(!is_data_packet))
709 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); 843 goto update_records;
844
845 if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) {
846 const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb);
847 /*
848 * Empty loss history: no loss so far, hence p stays 0.
849 * Sample RTT values, since an RTT estimate is required for the
850 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
851 */
852 if (sample != 0)
853 hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9);
854
855 } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) {
856 /*
857 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
858 * has decreased (resp. p has increased), send feedback now.
859 */
860 do_feedback = CCID3_FBACK_PARAM_CHANGE;
861 }
862
710 /* 863 /*
711 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 864 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
712 */ 865 */
713 else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && 866 if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3)
714 SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) 867 do_feedback = CCID3_FBACK_PERIODIC;
715 ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); 868
869update_records:
870 tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp);
871
872done_receiving:
873 if (do_feedback)
874 ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
716} 875}
717 876
718static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) 877static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
719{ 878{
720 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); 879 struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
721 880
722 tfrc_lh_init(&hcrx->li_hist); 881 hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
723 return tfrc_rx_hist_init(&hcrx->hist, sk); 882 tfrc_lh_init(&hcrx->ccid3hcrx_li_hist);
883 return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist);
724} 884}
725 885
726static void ccid3_hc_rx_exit(struct sock *sk) 886static void ccid3_hc_rx_exit(struct sock *sk)
727{ 887{
728 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 888 struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
729 889
730 tfrc_rx_hist_purge(&hcrx->hist); 890 ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
731 tfrc_lh_cleanup(&hcrx->li_hist); 891
892 tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist);
893 tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist);
732} 894}
733 895
734static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) 896static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
735{ 897{
898 const struct ccid3_hc_rx_sock *hcrx;
899
900 /* Listen socks doesn't have a private CCID block */
901 if (sk->sk_state == DCCP_LISTEN)
902 return;
903
904 hcrx = ccid3_hc_rx_sk(sk);
905 info->tcpi_ca_state = hcrx->ccid3hcrx_state;
736 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 906 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
737 info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); 907 info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
738} 908}
739 909
740static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, 910static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
741 u32 __user *optval, int __user *optlen) 911 u32 __user *optval, int __user *optlen)
742{ 912{
743 const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); 913 const struct ccid3_hc_rx_sock *hcrx;
744 struct tfrc_rx_info rx_info; 914 struct tfrc_rx_info rx_info;
745 const void *val; 915 const void *val;
746 916
917 /* Listen socks doesn't have a private CCID block */
918 if (sk->sk_state == DCCP_LISTEN)
919 return -EINVAL;
920
921 hcrx = ccid3_hc_rx_sk(sk);
747 switch (optname) { 922 switch (optname) {
748 case DCCP_SOCKOPT_CCID_RX_INFO: 923 case DCCP_SOCKOPT_CCID_RX_INFO:
749 if (len < sizeof(rx_info)) 924 if (len < sizeof(rx_info))
750 return -EINVAL; 925 return -EINVAL;
751 rx_info.tfrcrx_x_recv = hcrx->x_recv; 926 rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv;
752 rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); 927 rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt;
753 rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); 928 rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U :
929 scaled_div(1, hcrx->ccid3hcrx_pinv);
754 len = sizeof(rx_info); 930 len = sizeof(rx_info);
755 val = &rx_info; 931 val = &rx_info;
756 break; 932 break;
@@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = {
786 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, 962 .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
787}; 963};
788 964
789module_param(do_osc_prev, bool, 0644);
790MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)");
791
792#ifdef CONFIG_IP_DCCP_CCID3_DEBUG 965#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
793module_param(ccid3_debug, bool, 0644); 966module_param(ccid3_debug, bool, 0644);
794MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); 967MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
@@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
796 969
797static __init int ccid3_module_init(void) 970static __init int ccid3_module_init(void)
798{ 971{
799 struct timespec tp;
800
801 /*
802 * Without a fine-grained clock resolution, RTTs/X_recv are not sampled
803 * correctly and feedback is sent either too early or too late.
804 */
805 hrtimer_get_res(CLOCK_MONOTONIC, &tp);
806 if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) {
807 printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec"
808 " resolution - check your clocksource.\n", __func__,
809 tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION);
810 return -ESOCKTNOSUPPORT;
811 }
812 return ccid_register(&ccid3); 972 return ccid_register(&ccid3);
813} 973}
814module_init(ccid3_module_init); 974module_init(ccid3_module_init);
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index af6e1bf937d9..49ca32bd7e79 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -47,22 +47,11 @@
47/* Two seconds as per RFC 3448 4.2 */ 47/* Two seconds as per RFC 3448 4.2 */
48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) 48#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
49 49
50/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ 50/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
51#define TFRC_T_MBI (64 * USEC_PER_SEC) 51#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
52 52
53/* 53/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
54 * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are 54#define TFRC_T_MBI 64
55 * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
56 * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
57 * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
58 * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
59 */
60#if (HZ >= 500)
61# define TFRC_T_DELTA USEC_PER_MSEC
62#else
63# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
64#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
65#endif
66 55
67enum ccid3_options { 56enum ccid3_options {
68 TFRC_OPT_LOSS_EVENT_RATE = 192, 57 TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -70,43 +59,62 @@ enum ccid3_options {
70 TFRC_OPT_RECEIVE_RATE = 194, 59 TFRC_OPT_RECEIVE_RATE = 194,
71}; 60};
72 61
62struct ccid3_options_received {
63 u64 ccid3or_seqno:48,
64 ccid3or_loss_intervals_idx:16;
65 u16 ccid3or_loss_intervals_len;
66 u32 ccid3or_loss_event_rate;
67 u32 ccid3or_receive_rate;
68};
69
70/* TFRC sender states */
71enum ccid3_hc_tx_states {
72 TFRC_SSTATE_NO_SENT = 1,
73 TFRC_SSTATE_NO_FBACK,
74 TFRC_SSTATE_FBACK,
75 TFRC_SSTATE_TERM,
76};
77
73/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket 78/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
74 * 79 *
75 * @x - Current sending rate in 64 * bytes per second 80 * @ccid3hctx_x - Current sending rate in 64 * bytes per second
76 * @x_recv - Receive rate in 64 * bytes per second 81 * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second
77 * @x_calc - Calculated rate in bytes per second 82 * @ccid3hctx_x_calc - Calculated rate in bytes per second
78 * @rtt - Estimate of current round trip time in usecs 83 * @ccid3hctx_rtt - Estimate of current round trip time in usecs
79 * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) 84 * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
80 * @p - Current loss event rate (0-1) scaled by 1000000 85 * @ccid3hctx_s - Packet size in bytes
81 * @s - Packet size in bytes 86 * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs
82 * @t_rto - Nofeedback Timer setting in usecs 87 * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
83 * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs 88 * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states
84 * @feedback - Whether feedback has been received or not 89 * @ccid3hctx_last_win_count - Last window counter sent
85 * @last_win_count - Last window counter sent 90 * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
86 * @t_last_win_count - Timestamp of earliest packet with 91 * with last_win_count value sent
87 * last_win_count value sent 92 * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
88 * @no_feedback_timer - Handle to no feedback timer 93 * @ccid3hctx_t_ld - Time last doubled during slow start
89 * @t_ld - Time last doubled during slow start 94 * @ccid3hctx_t_nom - Nominal send time of next packet
90 * @t_nom - Nominal send time of next packet 95 * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs
91 * @hist - Packet history 96 * @ccid3hctx_hist - Packet history
97 * @ccid3hctx_options_received - Parsed set of retrieved options
92 */ 98 */
93struct ccid3_hc_tx_sock { 99struct ccid3_hc_tx_sock {
94 u64 x; 100 struct tfrc_tx_info ccid3hctx_tfrc;
95 u64 x_recv; 101#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
96 u32 x_calc; 102#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
97 u32 rtt; 103#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
98 u16 r_sqmean; 104#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
99 u32 p; 105#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
100 u32 t_rto; 106#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
101 u32 t_ipi; 107#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
102 u16 s; 108 u16 ccid3hctx_s;
103 bool feedback:1; 109 enum ccid3_hc_tx_states ccid3hctx_state:8;
104 u8 last_win_count; 110 u8 ccid3hctx_last_win_count;
105 ktime_t t_last_win_count; 111 ktime_t ccid3hctx_t_last_win_count;
106 struct timer_list no_feedback_timer; 112 struct timer_list ccid3hctx_no_feedback_timer;
107 ktime_t t_ld; 113 ktime_t ccid3hctx_t_ld;
108 ktime_t t_nom; 114 ktime_t ccid3hctx_t_nom;
109 struct tfrc_tx_hist_entry *hist; 115 u32 ccid3hctx_delta;
116 struct tfrc_tx_hist_entry *ccid3hctx_hist;
117 struct ccid3_options_received ccid3hctx_options_received;
110}; 118};
111 119
112static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) 120static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
116 return hctx; 124 return hctx;
117} 125}
118 126
119 127/* TFRC receiver states */
120enum ccid3_fback_type { 128enum ccid3_hc_rx_states {
121 CCID3_FBACK_NONE = 0, 129 TFRC_RSTATE_NO_DATA = 1,
122 CCID3_FBACK_INITIAL, 130 TFRC_RSTATE_DATA,
123 CCID3_FBACK_PERIODIC, 131 TFRC_RSTATE_TERM = 127,
124 CCID3_FBACK_PARAM_CHANGE
125}; 132};
126 133
127/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket 134/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
128 * 135 *
129 * @last_counter - Tracks window counter (RFC 4342, 8.1) 136 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3)
130 * @feedback - The type of the feedback last sent 137 * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard)
131 * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) 138 * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4)
132 * @tstamp_last_feedback - Time at which last feedback was sent 139 * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1)
133 * @hist - Packet history (loss detection + RTT sampling) 140 * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states
134 * @li_hist - Loss Interval database 141 * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes
135 * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) 142 * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
143 * @ccid3hcrx_rtt - Receiver estimate of RTT
144 * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent
145 * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent
146 * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling)
147 * @ccid3hcrx_li_hist - Loss Interval database
148 * @ccid3hcrx_s - Received packet size in bytes
149 * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
136 */ 150 */
137struct ccid3_hc_rx_sock { 151struct ccid3_hc_rx_sock {
138 u8 last_counter:4; 152 u8 ccid3hcrx_last_counter:4;
139 enum ccid3_fback_type feedback:4; 153 enum ccid3_hc_rx_states ccid3hcrx_state:8;
140 u32 x_recv; 154 u32 ccid3hcrx_bytes_recv;
141 ktime_t tstamp_last_feedback; 155 u32 ccid3hcrx_x_recv;
142 struct tfrc_rx_hist hist; 156 u32 ccid3hcrx_rtt;
143 struct tfrc_loss_hist li_hist; 157 ktime_t ccid3hcrx_tstamp_last_feedback;
144#define p_inverse li_hist.i_mean 158 struct tfrc_rx_hist ccid3hcrx_hist;
159 struct tfrc_loss_hist ccid3hcrx_li_hist;
160 u16 ccid3hcrx_s;
161#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean
145}; 162};
146 163
147static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) 164static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index b1ae8f8259e5..5b3ce0688c5c 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
86 86
87/** 87/**
88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 88 * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
89 * This updates I_mean as the sequence numbers increase. As a consequence, the 89 * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
90 * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1)
91 * decreases, and thus there is no need to send renewed feedback.
92 */ 90 */
93void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) 91u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
94{ 92{
95 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); 93 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
94 u32 old_i_mean = lh->i_mean;
96 s64 len; 95 s64 len;
97 96
98 if (cur == NULL) /* not initialised */ 97 if (cur == NULL) /* not initialised */
99 return; 98 return 0;
100
101 /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */
102 if (!dccp_data_packet(skb))
103 return;
104 99
105 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; 100 len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
106 101
107 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ 102 if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
108 return; 103 return 0;
109 104
110 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) 105 if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
111 /* 106 /*
@@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
119 cur->li_is_closed = 1; 114 cur->li_is_closed = 1;
120 115
121 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ 116 if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
122 return; 117 return 0;
123 118
124 cur->li_length = len; 119 cur->li_length = len;
125 tfrc_lh_calc_i_mean(lh); 120 tfrc_lh_calc_i_mean(lh);
121
122 return (lh->i_mean < old_i_mean);
126} 123}
124EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean);
127 125
128/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ 126/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
129static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, 127static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
@@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
140 * @sk: Used by @calc_first_li in caller-specific way (subtyping) 138 * @sk: Used by @calc_first_li in caller-specific way (subtyping)
141 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 139 * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
142 */ 140 */
143bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, 141int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
144 u32 (*calc_first_li)(struct sock *), struct sock *sk) 142 u32 (*calc_first_li)(struct sock *), struct sock *sk)
145{ 143{
146 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; 144 struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
147 145
148 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) 146 if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
149 return false; 147 return 0;
150 148
151 new = tfrc_lh_demand_next(lh); 149 new = tfrc_lh_demand_next(lh);
152 if (unlikely(new == NULL)) { 150 if (unlikely(new == NULL)) {
153 DCCP_CRIT("Cannot allocate/add loss record."); 151 DCCP_CRIT("Cannot allocate/add loss record.");
154 return false; 152 return 0;
155 } 153 }
156 154
157 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; 155 new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
@@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
169 167
170 tfrc_lh_calc_i_mean(lh); 168 tfrc_lh_calc_i_mean(lh);
171 } 169 }
172 return true; 170 return 1;
173} 171}
174EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); 172EXPORT_SYMBOL_GPL(tfrc_lh_interval_add);
175 173
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index d08a226db43e..246018a3b269 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
67 67
68struct tfrc_rx_hist; 68struct tfrc_rx_hist;
69 69
70extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, 70extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
71 u32 (*first_li)(struct sock *), struct sock *); 71 u32 (*first_li)(struct sock *), struct sock *);
72extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); 72extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); 73extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
74 74
75#endif /* _DCCP_LI_HIST_ */ 75#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index cce9f03bda3e..6cc108afdc3b 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -40,6 +40,18 @@
40#include "packet_history.h" 40#include "packet_history.h"
41#include "../../dccp.h" 41#include "../../dccp.h"
42 42
43/**
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
43/* 55/*
44 * Transmitter History Routines 56 * Transmitter History Routines
45 */ 57 */
@@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void)
61 } 73 }
62} 74}
63 75
76static struct tfrc_tx_hist_entry *
77 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
78{
79 while (head != NULL && head->seqno != seqno)
80 head = head->next;
81
82 return head;
83}
84
64int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) 85int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
65{ 86{
66 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); 87 struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
90} 111}
91EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); 112EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge);
92 113
114u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
115 const ktime_t now)
116{
117 u32 rtt = 0;
118 struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
119
120 if (packet != NULL) {
121 rtt = ktime_us_delta(now, packet->stamp);
122 /*
123 * Garbage-collect older (irrelevant) entries:
124 */
125 tfrc_tx_hist_purge(&packet->next);
126 }
127
128 return rtt;
129}
130EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt);
131
132
93/* 133/*
94 * Receiver History Routines 134 * Receiver History Routines
95 */ 135 */
@@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
151} 191}
152EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); 192EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate);
153 193
154
155static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
156{
157 struct tfrc_rx_hist_entry *tmp = h->ring[a];
158
159 h->ring[a] = h->ring[b];
160 h->ring[b] = tmp;
161}
162
163static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) 194static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
164{ 195{
165 __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), 196 const u8 idx_a = tfrc_rx_hist_index(h, a),
166 tfrc_rx_hist_index(h, b)); 197 idx_b = tfrc_rx_hist_index(h, b);
167} 198 struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
168 199
169/** 200 h->ring[idx_a] = h->ring[idx_b];
170 * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling 201 h->ring[idx_b] = tmp;
171 * This is called after loss detection has finished, when the history entry
172 * with the index of `loss_count' holds the highest-received sequence number.
173 * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt).
174 */
175static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h)
176{
177 __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count));
178 h->loss_count = h->loss_start = 0;
179} 202}
180 203
181/* 204/*
@@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
192 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, 215 u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
193 s1 = DCCP_SKB_CB(skb)->dccpd_seq; 216 s1 = DCCP_SKB_CB(skb)->dccpd_seq;
194 217
195 if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ 218 if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
196 h->loss_count = 1; 219 h->loss_count = 1;
220 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
221 }
197} 222}
198 223
199static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) 224static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
@@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2
215 240
216 if (dccp_loss_free(s2, s1, n1)) { 241 if (dccp_loss_free(s2, s1, n1)) {
217 /* hole is filled: S0, S2, and S1 are consecutive */ 242 /* hole is filled: S0, S2, and S1 are consecutive */
218 tfrc_rx_hist_resume_rtt_sampling(h); 243 h->loss_count = 0;
244 h->loss_start = tfrc_rx_hist_index(h, 1);
219 } else 245 } else
220 /* gap between S2 and S1: just update loss_prev */ 246 /* gap between S2 and S1: just update loss_prev */
221 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); 247 tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
@@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
268 294
269 if (dccp_loss_free(s1, s2, n2)) { 295 if (dccp_loss_free(s1, s2, n2)) {
270 /* entire hole filled by S0, S3, S1, S2 */ 296 /* entire hole filled by S0, S3, S1, S2 */
271 tfrc_rx_hist_resume_rtt_sampling(h); 297 h->loss_start = tfrc_rx_hist_index(h, 2);
298 h->loss_count = 0;
272 } else { 299 } else {
273 /* gap remains between S1 and S2 */ 300 /* gap remains between S1 and S2 */
274 h->loss_start = tfrc_rx_hist_index(h, 1); 301 h->loss_start = tfrc_rx_hist_index(h, 1);
@@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
312 339
313 if (dccp_loss_free(s2, s3, n3)) { 340 if (dccp_loss_free(s2, s3, n3)) {
314 /* no gap between S2 and S3: entire hole is filled */ 341 /* no gap between S2 and S3: entire hole is filled */
315 tfrc_rx_hist_resume_rtt_sampling(h); 342 h->loss_start = tfrc_rx_hist_index(h, 3);
343 h->loss_count = 0;
316 } else { 344 } else {
317 /* gap between S2 and S3 */ 345 /* gap between S2 and S3 */
318 h->loss_start = tfrc_rx_hist_index(h, 2); 346 h->loss_start = tfrc_rx_hist_index(h, 2);
@@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
326} 354}
327 355
328/** 356/**
329 * tfrc_rx_congestion_event - Loss detection and further processing 357 * tfrc_rx_handle_loss - Loss detection and further processing
330 * @h: The non-empty RX history object 358 * @h: The non-empty RX history object
331 * @lh: Loss Intervals database to update 359 * @lh: Loss Intervals database to update
332 * @skb: Currently received packet 360 * @skb: Currently received packet
333 * @ndp: The NDP count belonging to @skb 361 * @ndp: The NDP count belonging to @skb
334 * @first_li: Caller-dependent computation of first loss interval in @lh 362 * @calc_first_li: Caller-dependent computation of first loss interval in @lh
335 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) 363 * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
336 * Chooses action according to pending loss, updates LI database when a new 364 * Chooses action according to pending loss, updates LI database when a new
337 * loss was detected, and does required post-processing. Returns 1 when caller 365 * loss was detected, and does required post-processing. Returns 1 when caller
338 * should send feedback, 0 otherwise. 366 * should send feedback, 0 otherwise.
@@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
340 * records accordingly, the caller should not perform any more RX history 368 * records accordingly, the caller should not perform any more RX history
341 * operations when loss_count is greater than 0 after calling this function. 369 * operations when loss_count is greater than 0 after calling this function.
342 */ 370 */
343bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 371int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
344 struct tfrc_loss_hist *lh, 372 struct tfrc_loss_hist *lh,
345 struct sk_buff *skb, const u64 ndp, 373 struct sk_buff *skb, const u64 ndp,
346 u32 (*first_li)(struct sock *), struct sock *sk) 374 u32 (*calc_first_li)(struct sock *), struct sock *sk)
347{ 375{
348 bool new_event = false; 376 int is_new_loss = 0;
349
350 if (tfrc_rx_hist_duplicate(h, skb))
351 return 0;
352 377
353 if (h->loss_count == 0) { 378 if (h->loss_count == 0) {
354 __do_track_loss(h, skb, ndp); 379 __do_track_loss(h, skb, ndp);
355 tfrc_rx_hist_sample_rtt(h, skb);
356 tfrc_rx_hist_add_packet(h, skb, ndp);
357 } else if (h->loss_count == 1) { 380 } else if (h->loss_count == 1) {
358 __one_after_loss(h, skb, ndp); 381 __one_after_loss(h, skb, ndp);
359 } else if (h->loss_count != 2) { 382 } else if (h->loss_count != 2) {
@@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h,
362 /* 385 /*
363 * Update Loss Interval database and recycle RX records 386 * Update Loss Interval database and recycle RX records
364 */ 387 */
365 new_event = tfrc_lh_interval_add(lh, h, first_li, sk); 388 is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
366 __three_after_loss(h); 389 __three_after_loss(h);
367 } 390 }
368 391 return is_new_loss;
369 /*
370 * Update moving-average of `s' and the sum of received payload bytes.
371 */
372 if (dccp_data_packet(skb)) {
373 const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
374
375 h->packet_size = tfrc_ewma(h->packet_size, payload, 9);
376 h->bytes_recvd += payload;
377 }
378
379 /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */
380 if (!new_event)
381 tfrc_lh_update_i_mean(lh, skb);
382
383 return new_event;
384} 392}
385EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); 393EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss);
386 394
387/* Compute the sending rate X_recv measured between feedback intervals */ 395int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
388u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv)
389{ 396{
390 u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; 397 int i;
391 s64 delta = ktime_to_us(net_timedelta(h->bytes_start));
392
393 WARN_ON(delta <= 0);
394 /*
395 * Ensure that the sampling interval for X_recv is at least one RTT,
396 * by extending the sampling interval backwards in time, over the last
397 * R_(m-1) seconds, as per rfc3448bis-06, 6.2.
398 * To reduce noise (e.g. when the RTT changes often), this is only
399 * done when delta is smaller than RTT/2.
400 */
401 if (last_x_recv > 0 && delta < last_rtt/2) {
402 tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n",
403 (long)delta, (unsigned)last_rtt);
404 398
405 delta = (bytes ? delta : 0) + last_rtt; 399 for (i = 0; i <= TFRC_NDUPACK; i++) {
406 bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); 400 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
401 if (h->ring[i] == NULL)
402 goto out_free;
407 } 403 }
408 404
409 if (unlikely(bytes == 0)) { 405 h->loss_count = h->loss_start = 0;
410 DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); 406 return 0;
411 return last_x_recv; 407
408out_free:
409 while (i-- != 0) {
410 kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
411 h->ring[i] = NULL;
412 } 412 }
413 return scaled_div32(bytes, delta); 413 return -ENOBUFS;
414} 414}
415EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); 415EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc);
416 416
417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) 417void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
418{ 418{
@@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
426} 426}
427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); 427EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge);
428 428
429static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) 429/**
430 * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
431 */
432static inline struct tfrc_rx_hist_entry *
433 tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
430{ 434{
431 int i; 435 return h->ring[0];
432
433 memset(h, 0, sizeof(*h));
434
435 for (i = 0; i <= TFRC_NDUPACK; i++) {
436 h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
437 if (h->ring[i] == NULL) {
438 tfrc_rx_hist_purge(h);
439 return -ENOBUFS;
440 }
441 }
442 return 0;
443} 436}
444 437
445int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) 438/**
439 * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
440 */
441static inline struct tfrc_rx_hist_entry *
442 tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
446{ 443{
447 if (tfrc_rx_hist_alloc(h)) 444 return h->ring[h->rtt_sample_prev];
448 return -ENOBUFS;
449 /*
450 * Initialise first entry with GSR to start loss detection as early as
451 * possible. Code using this must not use any other fields. The entry
452 * will be overwritten once the CCID updates its received packets.
453 */
454 tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr;
455 return 0;
456} 445}
457EXPORT_SYMBOL_GPL(tfrc_rx_hist_init);
458 446
459/** 447/**
460 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal 448 * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
461 * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss 449 * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
462 * is pending and uses the following history entries (via rtt_sample_prev): 450 * to compute a sample with given data - calling function should check this.
463 * - h->ring[0] contains the most recent history entry prior to @skb;
464 * - h->ring[1] is an unused `dummy' entry when the current difference is 0;
465 */ 451 */
466void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) 452u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
467{ 453{
468 struct tfrc_rx_hist_entry *last = h->ring[0]; 454 u32 sample = 0,
469 u32 sample, delta_v; 455 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
470 456 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
471 /* 457
472 * When not to sample: 458 if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
473 * - on non-data packets 459 if (h->rtt_sample_prev == 2) { /* previous candidate stored */
474 * (RFC 4342, 8.1: CCVal only fully defined for data packets); 460 sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
475 * - when no data packets have been received yet 461 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
476 * (FIXME: using sampled packet size as indicator here); 462 if (sample)
477 * - as long as there are gaps in the sequence space (pending loss). 463 sample = 4 / sample *
478 */ 464 ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
479 if (!dccp_data_packet(skb) || h->packet_size == 0 || 465 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
480 tfrc_rx_hist_loss_pending(h)) 466 else /*
481 return; 467 * FIXME: This condition is in principle not
468 * possible but occurs when CCID is used for
469 * two-way data traffic. I have tried to trace
470 * it, but the cause does not seem to be here.
471 */
472 DCCP_BUG("please report to dccp@vger.kernel.org"
473 " => prev = %u, last = %u",
474 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
475 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
476 } else if (delta_v < 1) {
477 h->rtt_sample_prev = 1;
478 goto keep_ref_for_next_time;
479 }
482 480
483 h->rtt_sample_prev = 0; /* reset previous candidate */ 481 } else if (delta_v == 4) /* optimal match */
482 sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
483 else { /* suboptimal match */
484 h->rtt_sample_prev = 2;
485 goto keep_ref_for_next_time;
486 }
484 487
485 delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); 488 if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
486 if (delta_v == 0) { /* less than RTT/4 difference */ 489 DCCP_WARN("RTT sample %u too large, using max\n", sample);
487 h->rtt_sample_prev = 1; 490 sample = DCCP_SANE_RTT_MAX;
488 return;
489 } 491 }
490 sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp)));
491 492
492 if (delta_v <= 4) /* between RTT/4 and RTT */ 493 h->rtt_sample_prev = 0; /* use current entry as next reference */
493 sample *= 4 / delta_v; 494keep_ref_for_next_time:
494 else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2))
495 /*
496 * Optimisation: CCVal difference is greater than 1 RTT, yet the
497 * sample is less than the local RTT estimate; which means that
498 * the RTT estimate is too high.
499 * To avoid noise, it is not done if the sample is below RTT/2.
500 */
501 return;
502 495
503 /* Use a lower weight than usual to increase responsiveness */ 496 return sample;
504 h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5);
505} 497}
506EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); 498EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 555e65cd73a0..461cc91cce88 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,28 +40,12 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include "tfrc.h" 41#include "tfrc.h"
42 42
43/** 43struct tfrc_tx_hist_entry;
44 * tfrc_tx_hist_entry - Simple singly-linked TX history list
45 * @next: next oldest entry (LIFO order)
46 * @seqno: sequence number of this entry
47 * @stamp: send time of packet with sequence number @seqno
48 */
49struct tfrc_tx_hist_entry {
50 struct tfrc_tx_hist_entry *next;
51 u64 seqno;
52 ktime_t stamp;
53};
54
55static inline struct tfrc_tx_hist_entry *
56 tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
57{
58 while (head != NULL && head->seqno != seqno)
59 head = head->next;
60 return head;
61}
62 44
63extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); 45extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
64extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); 46extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
47extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
48 const u64 seqno, const ktime_t now);
65 49
66/* Subtraction a-b modulo-16, respects circular wrap-around */ 50/* Subtraction a-b modulo-16, respects circular wrap-around */
67#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) 51#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
@@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry {
91 * @loss_count: Number of entries in circular history 75 * @loss_count: Number of entries in circular history
92 * @loss_start: Movable index (for loss detection) 76 * @loss_start: Movable index (for loss detection)
93 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry 77 * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
94 * @rtt_estimate: Receiver RTT estimate
95 * @packet_size: Packet size in bytes (as per RFC 3448, 3.1)
96 * @bytes_recvd: Number of bytes received since @bytes_start
97 * @bytes_start: Start time for counting @bytes_recvd
98 */ 78 */
99struct tfrc_rx_hist { 79struct tfrc_rx_hist {
100 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; 80 struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
101 u8 loss_count:2, 81 u8 loss_count:2,
102 loss_start:2; 82 loss_start:2;
103 /* Receiver RTT sampling */
104#define rtt_sample_prev loss_start 83#define rtt_sample_prev loss_start
105 u32 rtt_estimate;
106 /* Receiver sampling of application payload lengths */
107 u32 packet_size,
108 bytes_recvd;
109 ktime_t bytes_start;
110}; 84};
111 85
112/** 86/**
@@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
150 return h->loss_count > 0; 124 return h->loss_count > 0;
151} 125}
152 126
153/*
154 * Accessor functions to retrieve parameters sampled by the RX history
155 */
156static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h)
157{
158 if (h->packet_size == 0) {
159 DCCP_WARN("No sample for s, using fallback\n");
160 return TCP_MIN_RCVMSS;
161 }
162 return h->packet_size;
163
164}
165static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h)
166{
167 if (h->rtt_estimate == 0) {
168 DCCP_WARN("No RTT estimate available, using fallback RTT\n");
169 return DCCP_FALLBACK_RTT;
170 }
171 return h->rtt_estimate;
172}
173
174static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h)
175{
176 h->bytes_recvd = 0;
177 h->bytes_start = ktime_get_real();
178}
179
180extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv);
181
182
183extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, 127extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
184 const struct sk_buff *skb, const u64 ndp); 128 const struct sk_buff *skb, const u64 ndp);
185 129
186extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); 130extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
187 131
188struct tfrc_loss_hist; 132struct tfrc_loss_hist;
189extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, 133extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
190 struct tfrc_loss_hist *lh, 134 struct tfrc_loss_hist *lh,
191 struct sk_buff *skb, const u64 ndp, 135 struct sk_buff *skb, const u64 ndp,
192 u32 (*first_li)(struct sock *sk), 136 u32 (*first_li)(struct sock *sk),
193 struct sock *sk); 137 struct sock *sk);
194extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, 138extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
195 const struct sk_buff *skb); 139 const struct sk_buff *skb);
196extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); 140extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
197extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); 141extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
198 142
199#endif /* _DCCP_PKT_HIST_ */ 143#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index ede12f53de5a..ed9857527acf 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -48,21 +48,6 @@ static inline u32 scaled_div32(u64 a, u64 b)
48} 48}
49 49
50/** 50/**
51 * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1
52 * Uses scaling to improve accuracy of the integer approximation of sqrt(). The
53 * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for
54 * clamped RTT samples (dccp_sample_rtt).
55 * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the
56 * scaling factor is neutralised. For this purpose, it avoids returning zero.
57 */
58static inline u16 tfrc_scaled_sqrt(const u32 sample)
59{
60 const unsigned long non_zero_sample = sample ? : 1;
61
62 return int_sqrt(non_zero_sample << 10);
63}
64
65/**
66 * tfrc_ewma - Exponentially weighted moving average 51 * tfrc_ewma - Exponentially weighted moving average
67 * @weight: Weight to be used as damping factor, in units of 1/10 52 * @weight: Weight to be used as damping factor, in units of 1/10
68 */ 53 */
@@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
73 58
74extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); 59extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
75extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); 60extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
76extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
77 61
78extern int tfrc_tx_packet_history_init(void); 62extern int tfrc_tx_packet_history_init(void);
79extern void tfrc_tx_packet_history_exit(void); 63extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 38239c4d5e14..2f20a29cffe4 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
632 632
633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ 633 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ 634 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
635 /* 635 DCCP_WARN("Value of p (%d) below resolution. "
636 * In the congestion-avoidance phase p decays towards 0 636 "Substituting %d\n", p, TFRC_SMALLEST_P);
637 * when there are no further losses, so this case is
638 * natural. Truncating to p_min = 0.01% means that the
639 * maximum achievable throughput is limited to about
640 * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g.
641 * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps.
642 */
643 tfrc_pr_debug("Value of p (%d) below resolution. "
644 "Substituting %d\n", p, TFRC_SMALLEST_P);
645 index = 0; 637 index = 0;
646 } else /* 0.0001 <= p <= 0.05 */ 638 } else /* 0.0001 <= p <= 0.05 */
647 index = p/TFRC_SMALLEST_P - 1; 639 index = p/TFRC_SMALLEST_P - 1;
@@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
666 result = scaled_div(s, R); 658 result = scaled_div(s, R);
667 return scaled_div32(result, f); 659 return scaled_div32(result, f);
668} 660}
661
669EXPORT_SYMBOL_GPL(tfrc_calc_x); 662EXPORT_SYMBOL_GPL(tfrc_calc_x);
670 663
671/** 664/**
@@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
700 index = tfrc_binsearch(fvalue, 0); 693 index = tfrc_binsearch(fvalue, 0);
701 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; 694 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
702} 695}
703EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
704 696
705/** 697EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
706 * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
707 * When @loss_event_rate is large, there is a chance that p is truncated to 0.
708 * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
709 */
710u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
711{
712 if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
713 return 0;
714 if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
715 return 1000000;
716 return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
717}
718EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 5281190aa19c..b4bc6e095a0e 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -42,11 +42,9 @@
42extern int dccp_debug; 42extern int dccp_debug;
43#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) 43#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
44#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) 44#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
45#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
46#else 45#else
47#define dccp_pr_debug(format, a...) 46#define dccp_pr_debug(format, a...)
48#define dccp_pr_debug_cat(format, a...) 47#define dccp_pr_debug_cat(format, a...)
49#define dccp_debug(format, a...)
50#endif 48#endif
51 49
52extern struct inet_hashinfo dccp_hashinfo; 50extern struct inet_hashinfo dccp_hashinfo;
@@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
63 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields 61 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
64 * Hence a safe upper bound for the maximum option length is 1020-28 = 992 62 * Hence a safe upper bound for the maximum option length is 1020-28 = 992
65 */ 63 */
66#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) 64#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
67#define DCCP_MAX_PACKET_HDR 28 65#define DCCP_MAX_PACKET_HDR 28
68#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) 66#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
69#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) 67#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
70 68
71/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
72#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
73
74#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT 69#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
75 * state, about 60 seconds */ 70 * state, about 60 seconds */
76 71
@@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
86 */ 81 */
87#define DCCP_RTO_MAX ((unsigned)(64 * HZ)) 82#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
88 83
89/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */
90#define DCCP_TIME_RESOLUTION 10
91
92/* 84/*
93 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 85 * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
94 */ 86 */
95#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) 87#define DCCP_SANE_RTT_MIN 100
96#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) 88#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
97#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) 89#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
98 90
@@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
103extern int sysctl_dccp_request_retries; 95extern int sysctl_dccp_request_retries;
104extern int sysctl_dccp_retries1; 96extern int sysctl_dccp_retries1;
105extern int sysctl_dccp_retries2; 97extern int sysctl_dccp_retries2;
98extern int sysctl_dccp_feat_sequence_window;
99extern int sysctl_dccp_feat_rx_ccid;
100extern int sysctl_dccp_feat_tx_ccid;
101extern int sysctl_dccp_feat_ack_ratio;
102extern int sysctl_dccp_feat_send_ack_vector;
103extern int sysctl_dccp_feat_send_ndp_count;
106extern int sysctl_dccp_tx_qlen; 104extern int sysctl_dccp_tx_qlen;
107extern int sysctl_dccp_sync_ratelimit; 105extern int sysctl_dccp_sync_ratelimit;
108 106
@@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
237extern void dccp_send_sync(struct sock *sk, const u64 seq, 235extern void dccp_send_sync(struct sock *sk, const u64 seq,
238 const enum dccp_pkt_type pkt_type); 236 const enum dccp_pkt_type pkt_type);
239 237
240/* 238extern void dccp_write_xmit(struct sock *sk, int block);
241 * TX Packet Dequeueing Interface
242 */
243extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
244extern bool dccp_qpolicy_full(struct sock *sk);
245extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
246extern struct sk_buff *dccp_qpolicy_top(struct sock *sk);
247extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
248extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
249
250/*
251 * TX Packet Output and TX Timers
252 */
253extern void dccp_write_xmit(struct sock *sk);
254extern void dccp_write_space(struct sock *sk); 239extern void dccp_write_space(struct sock *sk);
255extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
256 240
257extern void dccp_init_xmit_timers(struct sock *sk); 241extern void dccp_init_xmit_timers(struct sock *sk);
258static inline void dccp_clear_xmit_timers(struct sock *sk) 242static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state);
268extern void dccp_set_state(struct sock *sk, const int state); 252extern void dccp_set_state(struct sock *sk, const int state);
269extern void dccp_done(struct sock *sk); 253extern void dccp_done(struct sock *sk);
270 254
271extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, 255extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
272 struct sk_buff const *skb);
273 256
274extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); 257extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
275 258
@@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
334extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); 317extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
335extern void dccp_send_close(struct sock *sk, const int active); 318extern void dccp_send_close(struct sock *sk, const int active);
336extern int dccp_invalid_packet(struct sk_buff *skb); 319extern int dccp_invalid_packet(struct sk_buff *skb);
337 320extern u32 dccp_sample_rtt(struct sock *sk, long delta);
338static inline u32 dccp_sane_rtt(long usec_sample)
339{
340 if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX))
341 DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample);
342 return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX);
343}
344extern u32 dccp_sample_rtt(struct sock *sk, long delta);
345 321
346static inline int dccp_bad_service_code(const struct sock *sk, 322static inline int dccp_bad_service_code(const struct sock *sk,
347 const __be32 service) 323 const __be32 service)
@@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
435static inline void dccp_update_gsr(struct sock *sk, u64 seq) 411static inline void dccp_update_gsr(struct sock *sk, u64 seq)
436{ 412{
437 struct dccp_sock *dp = dccp_sk(sk); 413 struct dccp_sock *dp = dccp_sk(sk);
414 const struct dccp_minisock *dmsk = dccp_msk(sk);
438 415
439 dp->dccps_gsr = seq; 416 dp->dccps_gsr = seq;
440 /* Sequence validity window depends on remote Sequence Window (7.5.1) */ 417 dccp_set_seqno(&dp->dccps_swl,
441 dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); 418 dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
442 /* 419 dccp_set_seqno(&dp->dccps_swh,
443 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, 420 dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
444 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
445 * always > 32, so for the first W/W' packets in the lifetime of a
446 * connection we always have to adjust SWL.
447 * A second reason why we are doing this is that the window depends on
448 * the feature-remote value of Sequence Window: nothing stops the peer
449 * from updating this value while we are busy adjusting SWL for the
450 * first W packets (we would have to count from scratch again then).
451 * Therefore it is safer to always make sure that the Sequence Window
452 * is not artificially extended by a peer who grows SWL downwards by
453 * continually updating the feature-remote Sequence-Window.
454 * If sequence numbers wrap it is bad luck. But that will take a while
455 * (48 bit), and this measure prevents Sequence-number attacks.
456 */
457 if (before48(dp->dccps_swl, dp->dccps_isr))
458 dp->dccps_swl = dp->dccps_isr;
459 dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
460} 421}
461 422
462static inline void dccp_update_gss(struct sock *sk, u64 seq) 423static inline void dccp_update_gss(struct sock *sk, u64 seq)
463{ 424{
464 struct dccp_sock *dp = dccp_sk(sk); 425 struct dccp_sock *dp = dccp_sk(sk);
465 426
466 dp->dccps_gss = seq; 427 dp->dccps_awh = dp->dccps_gss = seq;
467 /* Ack validity window depends on local Sequence Window value (7.5.1) */ 428 dccp_set_seqno(&dp->dccps_awl,
468 dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); 429 (dp->dccps_gss -
469 /* Adjust AWL so that it is not below ISS - see comment above for SWL */ 430 dccp_msk(sk)->dccpms_sequence_window + 1));
470 if (before48(dp->dccps_awl, dp->dccps_iss))
471 dp->dccps_awl = dp->dccps_iss;
472 dp->dccps_awh = dp->dccps_gss;
473}
474
475static inline int dccp_ackvec_pending(const struct sock *sk)
476{
477 return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
478 !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
479} 431}
480 432
481static inline int dccp_ack_pending(const struct sock *sk) 433static inline int dccp_ack_pending(const struct sock *sk)
482{ 434{
483 return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); 435 const struct dccp_sock *dp = dccp_sk(sk);
436 return dp->dccps_timestamp_echo != 0 ||
437#ifdef CONFIG_IP_DCCP_ACKVEC
438 (dccp_msk(sk)->dccpms_send_ack_vector &&
439 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
440#endif
441 inet_csk_ack_scheduled(sk);
484} 442}
485 443
486extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
487extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
488extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
489extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
490 struct sk_buff *skb);
491extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
492extern void dccp_feat_list_purge(struct list_head *fn_list);
493
494extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); 444extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
495extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); 445extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
496extern int dccp_insert_option_elapsed_time(struct sock *sk, 446extern int dccp_insert_option_elapsed_time(struct sock *sk,
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 93aae7c95550..d8a3509b26f6 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
29 info->tcpi_backoff = icsk->icsk_backoff; 29 info->tcpi_backoff = icsk->icsk_backoff;
30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie; 30 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
31 31
32 if (dp->dccps_hc_rx_ackvec != NULL) 32 if (dccp_msk(sk)->dccpms_send_ack_vector)
33 info->tcpi_options |= TCPI_OPT_SACK; 33 info->tcpi_options |= TCPI_OPT_SACK;
34 34
35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); 35 ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index f94c7c9d1a7f..933a0ecf8d46 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1,19 +1,11 @@
1/* 1/*
2 * net/dccp/feat.c 2 * net/dccp/feat.c
3 * 3 *
4 * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 4 * An implementation of the DCCP protocol
5 * 5 * Andrea Bittau <a.bittau@cs.ucl.ac.uk>
6 * Copyright (c) 2008 The University of Aberdeen, Scotland, UK
7 * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
8 * Rewrote from scratch, some bits from earlier code by
9 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
10 *
11 * 6 *
12 * ASSUMPTIONS 7 * ASSUMPTIONS
13 * ----------- 8 * -----------
14 * o Feature negotiation is coordinated with connection setup (as in TCP), wild
15 * changes of parameters of an established connection are not supported.
16 * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN.
17 * o All currently known SP features have 1-byte quantities. If in the future 9 * o All currently known SP features have 1-byte quantities. If in the future
18 * extensions of RFCs 4340..42 define features with item lengths larger than 10 * extensions of RFCs 4340..42 define features with item lengths larger than
19 * one byte, a feature-specific extension of the code will be required. 11 * one byte, a feature-specific extension of the code will be required.
@@ -23,1510 +15,635 @@
23 * as published by the Free Software Foundation; either version 15 * as published by the Free Software Foundation; either version
24 * 2 of the License, or (at your option) any later version. 16 * 2 of the License, or (at your option) any later version.
25 */ 17 */
18
26#include <linux/module.h> 19#include <linux/module.h>
20
27#include "ccid.h" 21#include "ccid.h"
28#include "feat.h" 22#include "feat.h"
29 23
30/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ 24#define DCCP_FEAT_SP_NOAGREE (-123)
31unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
32int sysctl_dccp_rx_ccid __read_mostly = 2,
33 sysctl_dccp_tx_ccid __read_mostly = 2;
34 25
35/* 26int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
36 * Feature activation handlers. 27 u8 *val, u8 len, gfp_t gfp)
37 *
38 * These all use an u64 argument, to provide enough room for NN/SP features. At
39 * this stage the negotiated values have been checked to be within their range.
40 */
41static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
42{ 28{
43 struct dccp_sock *dp = dccp_sk(sk); 29 struct dccp_opt_pend *opt;
44 struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any());
45 30
46 if (new_ccid == NULL) 31 dccp_feat_debug(type, feature, *val);
47 return -ENOMEM;
48 32
49 if (rx) { 33 if (len > 3) {
50 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); 34 DCCP_WARN("invalid length %d\n", len);
51 dp->dccps_hc_rx_ccid = new_ccid; 35 return -EINVAL;
52 } else { 36 }
53 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); 37 /* XXX add further sanity checks */
54 dp->dccps_hc_tx_ccid = new_ccid; 38
39 /* check if that feature is already being negotiated */
40 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
41 /* ok we found a negotiation for this option already */
42 if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
43 dccp_pr_debug("Replacing old\n");
44 /* replace */
45 BUG_ON(opt->dccpop_val == NULL);
46 kfree(opt->dccpop_val);
47 opt->dccpop_val = val;
48 opt->dccpop_len = len;
49 opt->dccpop_conf = 0;
50 return 0;
51 }
55 } 52 }
56 return 0;
57}
58 53
59static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) 54 /* negotiation for a new feature */
60{ 55 opt = kmalloc(sizeof(*opt), gfp);
61 struct dccp_sock *dp = dccp_sk(sk); 56 if (opt == NULL)
57 return -ENOMEM;
62 58
63 if (rx) { 59 opt->dccpop_type = type;
64 dp->dccps_r_seq_win = seq_win; 60 opt->dccpop_feat = feature;
65 /* propagate changes to update SWL/SWH */ 61 opt->dccpop_len = len;
66 dccp_update_gsr(sk, dp->dccps_gsr); 62 opt->dccpop_val = val;
67 } else { 63 opt->dccpop_conf = 0;
68 dp->dccps_l_seq_win = seq_win; 64 opt->dccpop_sc = NULL;
69 /* propagate changes to update AWL */
70 dccp_update_gss(sk, dp->dccps_gss);
71 }
72 return 0;
73}
74 65
75static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) 66 BUG_ON(opt->dccpop_val == NULL);
76{ 67
77#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ 68 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
78 /*
79 * FIXME: This is required until several problems in the CCID-2 code are
80 * resolved. The CCID-2 code currently does not cope well; using dynamic
81 * Ack Ratios greater than 1 caused instabilities. These were manifest
82 * in hangups and long RTO timeouts (1...3 seconds). Until this has been
83 * stabilised, it is safer not to activate dynamic Ack Ratio changes.
84 */
85 dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n",
86 rx ? "RX" : "TX", (u16)ratio);
87 ratio = 1;
88#endif
89 if (rx)
90 dccp_sk(sk)->dccps_r_ack_ratio = ratio;
91 else
92 dccp_sk(sk)->dccps_l_ack_ratio = ratio;
93 return 0; 69 return 0;
94} 70}
95 71
96static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) 72EXPORT_SYMBOL_GPL(dccp_feat_change);
73
74static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
97{ 75{
98 struct dccp_sock *dp = dccp_sk(sk); 76 struct dccp_sock *dp = dccp_sk(sk);
77 struct dccp_minisock *dmsk = dccp_msk(sk);
78 /* figure out if we are changing our CCID or the peer's */
79 const int rx = type == DCCPO_CHANGE_R;
80 const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
81 struct ccid *new_ccid;
82
83 /* Check if nothing is being changed. */
84 if (ccid_nr == new_ccid_nr)
85 return 0;
86
87 new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC);
88 if (new_ccid == NULL)
89 return -ENOMEM;
99 90
100 if (rx) { 91 if (rx) {
101 if (enable && dp->dccps_hc_rx_ackvec == NULL) { 92 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
102 dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); 93 dp->dccps_hc_rx_ccid = new_ccid;
103 if (dp->dccps_hc_rx_ackvec == NULL) 94 dmsk->dccpms_rx_ccid = new_ccid_nr;
104 return -ENOMEM; 95 } else {
105 } else if (!enable) { 96 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
106 dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 97 dp->dccps_hc_tx_ccid = new_ccid;
107 dp->dccps_hc_rx_ackvec = NULL; 98 dmsk->dccpms_tx_ccid = new_ccid_nr;
108 }
109 } 99 }
110 return 0;
111}
112 100
113static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
114{
115 if (!rx)
116 dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
117 return 0; 101 return 0;
118} 102}
119 103
120/* 104static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
121 * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
122 * `rx' holds when the sending peer informs about his partial coverage via a
123 * ChangeR() option. In the other case, we are the sender and the receiver
124 * announces its coverage via ChangeL() options. The policy here is to honour
125 * such communication by enabling the corresponding partial coverage - but only
126 * if it has not been set manually before; the warning here means that all
127 * packets will be dropped.
128 */
129static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
130{ 105{
131 struct dccp_sock *dp = dccp_sk(sk); 106 dccp_feat_debug(type, feat, val);
132 107
133 if (rx) 108 switch (feat) {
134 dp->dccps_pcrlen = cscov; 109 case DCCPF_CCID:
135 else { 110 return dccp_feat_update_ccid(sk, type, val);
136 if (dp->dccps_pcslen == 0) 111 default:
137 dp->dccps_pcslen = cscov; 112 dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n",
138 else if (cscov > dp->dccps_pcslen) 113 dccp_feat_typename(type), feat);
139 DCCP_WARN("CsCov %u too small, peer requires >= %u\n", 114 break;
140 dp->dccps_pcslen, (u8)cscov);
141 } 115 }
142 return 0; 116 return 0;
143} 117}
144 118
145static const struct { 119static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
146 u8 feat_num; /* DCCPF_xxx */ 120 u8 *rpref, u8 rlen)
147 enum dccp_feat_type rxtx; /* RX or TX */
148 enum dccp_feat_type reconciliation; /* SP or NN */
149 u8 default_value; /* as in 6.4 */
150 int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
151/*
152 * Lookup table for location and type of features (from RFC 4340/4342)
153 * +--------------------------+----+-----+----+----+---------+-----------+
154 * | Feature | Location | Reconc. | Initial | Section |
155 * | | RX | TX | SP | NN | Value | Reference |
156 * +--------------------------+----+-----+----+----+---------+-----------+
157 * | DCCPF_CCID | | X | X | | 2 | 10 |
158 * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
159 * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
160 * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
161 * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
162 * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
163 * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
164 * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
165 * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
166 * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
167 * +--------------------------+----+-----+----+----+---------+-----------+
168 */
169} dccp_feat_table[] = {
170 { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
171 { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
172 { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
173 { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
174 { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
175 { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
176 { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
177 { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
178 { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
179 { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
180};
181#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
182
183/**
184 * dccp_feat_index - Hash function to map feature number into array position
185 * Returns consecutive array index or -1 if the feature is not understood.
186 */
187static int dccp_feat_index(u8 feat_num)
188{ 121{
189 /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ 122 struct dccp_sock *dp = dccp_sk(sk);
190 if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) 123 u8 *spref, slen, *res = NULL;
191 return feat_num - 1; 124 int i, j, rc, agree = 1;
192 125
126 BUG_ON(rpref == NULL);
127
128 /* check if we are the black sheep */
129 if (dp->dccps_role == DCCP_ROLE_CLIENT) {
130 spref = rpref;
131 slen = rlen;
132 rpref = opt->dccpop_val;
133 rlen = opt->dccpop_len;
134 } else {
135 spref = opt->dccpop_val;
136 slen = opt->dccpop_len;
137 }
193 /* 138 /*
194 * Other features: add cases for new feature types here after adding 139 * Now we have server preference list in spref and client preference in
195 * them to the above table. 140 * rpref
196 */ 141 */
197 switch (feat_num) { 142 BUG_ON(spref == NULL);
198 case DCCPF_SEND_LEV_RATE: 143 BUG_ON(rpref == NULL);
199 return DCCP_FEAT_SUPPORTED_MAX - 1;
200 }
201 return -1;
202}
203
204static u8 dccp_feat_type(u8 feat_num)
205{
206 int idx = dccp_feat_index(feat_num);
207
208 if (idx < 0)
209 return FEAT_UNKNOWN;
210 return dccp_feat_table[idx].reconciliation;
211}
212 144
213static int dccp_feat_default_value(u8 feat_num) 145 /* FIXME sanity check vals */
214{
215 int idx = dccp_feat_index(feat_num);
216 146
217 return idx < 0 ? : dccp_feat_table[idx].default_value; 147 /* Are values in any order? XXX Lame "algorithm" here */
218} 148 for (i = 0; i < slen; i++) {
219 149 for (j = 0; j < rlen; j++) {
220/* 150 if (spref[i] == rpref[j]) {
221 * Debugging and verbose-printing section 151 res = &spref[i];
222 */ 152 break;
223static const char *dccp_feat_fname(const u8 feat) 153 }
224{ 154 }
225 static const char *feature_names[] = { 155 if (res)
226 [DCCPF_RESERVED] = "Reserved", 156 break;
227 [DCCPF_CCID] = "CCID",
228 [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
229 [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
230 [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
231 [DCCPF_ACK_RATIO] = "Ack Ratio",
232 [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
233 [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
234 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
235 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
236 };
237 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
238 return feature_names[DCCPF_RESERVED];
239
240 if (feat == DCCPF_SEND_LEV_RATE)
241 return "Send Loss Event Rate";
242 if (feat >= DCCPF_MIN_CCID_SPECIFIC)
243 return "CCID-specific";
244
245 return feature_names[feat];
246}
247
248static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING",
249 "UNSTABLE", "STABLE" };
250
251#ifdef CONFIG_IP_DCCP_DEBUG
252static const char *dccp_feat_oname(const u8 opt)
253{
254 switch (opt) {
255 case DCCPO_CHANGE_L: return "Change_L";
256 case DCCPO_CONFIRM_L: return "Confirm_L";
257 case DCCPO_CHANGE_R: return "Change_R";
258 case DCCPO_CONFIRM_R: return "Confirm_R";
259 } 157 }
260 return NULL;
261}
262 158
263static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) 159 /* we didn't agree on anything */
264{ 160 if (res == NULL) {
265 u8 i, type = dccp_feat_type(feat_num); 161 /* confirm previous value */
266 162 switch (opt->dccpop_feat) {
267 if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) 163 case DCCPF_CCID:
268 dccp_pr_debug_cat("(NULL)"); 164 /* XXX did i get this right? =P */
269 else if (type == FEAT_SP) 165 if (opt->dccpop_type == DCCPO_CHANGE_L)
270 for (i = 0; i < val->sp.len; i++) 166 res = &dccp_msk(sk)->dccpms_tx_ccid;
271 dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); 167 else
272 else if (type == FEAT_NN) 168 res = &dccp_msk(sk)->dccpms_rx_ccid;
273 dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); 169 break;
274 else
275 dccp_pr_debug_cat("unknown type %u", type);
276}
277
278static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
279{
280 u8 type = dccp_feat_type(feat_num);
281 dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
282
283 if (type == FEAT_NN)
284 fval.nn = dccp_decode_value_var(list, len);
285 dccp_feat_printval(feat_num, &fval);
286}
287 170
288static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) 171 default:
289{ 172 DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat);
290 dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", 173 /* XXX implement res */
291 dccp_feat_fname(entry->feat_num)); 174 return -EFAULT;
292 dccp_feat_printval(entry->feat_num, &entry->val); 175 }
293 dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
294 entry->needs_confirm ? "(Confirm pending)" : "");
295}
296 176
297#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ 177 dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
298 dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ 178 agree = 0; /* this is used for mandatory options... */
299 dccp_feat_printvals(feat, val, len); \ 179 }
300 dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
301
302#define dccp_feat_print_fnlist(fn_list) { \
303 const struct dccp_feat_entry *___entry; \
304 \
305 dccp_pr_debug("List Dump:\n"); \
306 list_for_each_entry(___entry, fn_list, node) \
307 dccp_feat_print_entry(___entry); \
308}
309#else /* ! CONFIG_IP_DCCP_DEBUG */
310#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
311#define dccp_feat_print_fnlist(fn_list)
312#endif
313 180
314static int __dccp_feat_activate(struct sock *sk, const int idx, 181 /* need to put result and our preference list */
315 const bool is_local, dccp_feat_val const *fval) 182 rlen = 1 + opt->dccpop_len;
316{ 183 rpref = kmalloc(rlen, GFP_ATOMIC);
317 bool rx; 184 if (rpref == NULL)
318 u64 val; 185 return -ENOMEM;
319 186
320 if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) 187 *rpref = *res;
321 return -1; 188 memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
322 if (dccp_feat_table[idx].activation_hdlr == NULL)
323 return 0;
324 189
325 if (fval == NULL) { 190 /* put it in the "confirm queue" */
326 val = dccp_feat_table[idx].default_value; 191 if (opt->dccpop_sc == NULL) {
327 } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { 192 opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
328 if (fval->sp.vec == NULL) { 193 if (opt->dccpop_sc == NULL) {
329 /* 194 kfree(rpref);
330 * This can happen when an empty Confirm is sent 195 return -ENOMEM;
331 * for an SP (i.e. known) feature. In this case
332 * we would be using the default anyway.
333 */
334 DCCP_CRIT("Feature #%d undefined: using default", idx);
335 val = dccp_feat_table[idx].default_value;
336 } else {
337 val = fval->sp.vec[0];
338 } 196 }
339 } else { 197 } else {
340 val = fval->nn; 198 /* recycle the confirm slot */
199 BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
200 kfree(opt->dccpop_sc->dccpoc_val);
201 dccp_pr_debug("recycling confirm slot\n");
202 }
203 memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
204
205 opt->dccpop_sc->dccpoc_val = rpref;
206 opt->dccpop_sc->dccpoc_len = rlen;
207
208 /* update the option on our side [we are about to send the confirm] */
209 rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
210 if (rc) {
211 kfree(opt->dccpop_sc->dccpoc_val);
212 kfree(opt->dccpop_sc);
213 opt->dccpop_sc = NULL;
214 return rc;
341 } 215 }
342 216
343 /* Location is RX if this is a local-RX or remote-TX feature */ 217 dccp_pr_debug("Will confirm %d\n", *rpref);
344 rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
345
346 dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
347 dccp_feat_fname(dccp_feat_table[idx].feat_num),
348 fval ? "" : "default ", (unsigned long long)val);
349
350 return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
351}
352
353/**
354 * dccp_feat_activate - Activate feature value on socket
355 * @sk: fully connected DCCP socket (after handshake is complete)
356 * @feat_num: feature to activate, one of %dccp_feature_numbers
357 * @local: whether local (1) or remote (0) @feat_num is meant
358 * @fval: the value (SP or NN) to activate, or NULL to use the default value
359 * For general use this function is preferable over __dccp_feat_activate().
360 */
361static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
362 dccp_feat_val const *fval)
363{
364 return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
365}
366
367/* Test for "Req'd" feature (RFC 4340, 6.4) */
368static inline int dccp_feat_must_be_understood(u8 feat_num)
369{
370 return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
371 feat_num == DCCPF_SEQUENCE_WINDOW;
372}
373 218
374/* copy constructor, fval must not already contain allocated memory */ 219 /* say we want to change to X but we just got a confirm X, suppress our
375static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) 220 * change
376{ 221 */
377 fval->sp.len = len; 222 if (!opt->dccpop_conf) {
378 if (fval->sp.len > 0) { 223 if (*opt->dccpop_val == *res)
379 fval->sp.vec = kmemdup(val, len, gfp_any()); 224 opt->dccpop_conf = 1;
380 if (fval->sp.vec == NULL) { 225 dccp_pr_debug("won't ask for change of same feature\n");
381 fval->sp.len = 0;
382 return -ENOBUFS;
383 }
384 } 226 }
385 return 0;
386}
387 227
388static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) 228 return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
389{
390 if (unlikely(val == NULL))
391 return;
392 if (dccp_feat_type(feat_num) == FEAT_SP)
393 kfree(val->sp.vec);
394 memset(val, 0, sizeof(*val));
395} 229}
396 230
397static struct dccp_feat_entry * 231static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
398 dccp_feat_clone_entry(struct dccp_feat_entry const *original)
399{ 232{
400 struct dccp_feat_entry *new; 233 struct dccp_minisock *dmsk = dccp_msk(sk);
401 u8 type = dccp_feat_type(original->feat_num); 234 struct dccp_opt_pend *opt;
402 235 int rc = 1;
403 if (type == FEAT_UNKNOWN) 236 u8 t;
404 return NULL;
405 237
406 new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); 238 /*
407 if (new == NULL) 239 * We received a CHANGE. We gotta match it against our own preference
408 return NULL; 240 * list. If we got a CHANGE_R it means it's a change for us, so we need
241 * to compare our CHANGE_L list.
242 */
243 if (type == DCCPO_CHANGE_L)
244 t = DCCPO_CHANGE_R;
245 else
246 t = DCCPO_CHANGE_L;
409 247
410 if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, 248 /* find our preference list for this feature */
411 original->val.sp.vec, 249 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
412 original->val.sp.len)) { 250 if (opt->dccpop_type != t || opt->dccpop_feat != feature)
413 kfree(new); 251 continue;
414 return NULL;
415 }
416 return new;
417}
418 252
419static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) 253 /* find the winner from the two preference lists */
420{ 254 rc = dccp_feat_reconcile(sk, opt, val, len);
421 if (entry != NULL) { 255 break;
422 dccp_feat_val_destructor(entry->feat_num, &entry->val);
423 kfree(entry);
424 } 256 }
425}
426 257
427/* 258 /* We didn't deal with the change. This can happen if we have no
428 * List management functions 259 * preference list for the feature. In fact, it just shouldn't
429 * 260 * happen---if we understand a feature, we should have a preference list
430 * Feature negotiation lists rely on and maintain the following invariants: 261 * with at least the default value.
431 * - each feat_num in the list is known, i.e. we know its type and default value 262 */
432 * - each feat_num/is_local combination is unique (old entries are overwritten) 263 BUG_ON(rc == 1);
433 * - SP values are always freshly allocated
434 * - list is sorted in increasing order of feature number (faster lookup)
435 */
436static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
437 u8 feat_num, bool is_local)
438{
439 struct dccp_feat_entry *entry;
440 264
441 list_for_each_entry(entry, fn_list, node) 265 return rc;
442 if (entry->feat_num == feat_num && entry->is_local == is_local)
443 return entry;
444 else if (entry->feat_num > feat_num)
445 break;
446 return NULL;
447} 266}
448 267
449/** 268static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
450 * dccp_feat_entry_new - Central list update routine (called by all others)
451 * @head: list to add to
452 * @feat: feature number
453 * @local: whether the local (1) or remote feature with number @feat is meant
454 * This is the only constructor and serves to ensure the above invariants.
455 */
456static struct dccp_feat_entry *
457 dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
458{ 269{
459 struct dccp_feat_entry *entry; 270 struct dccp_opt_pend *opt;
460 271 struct dccp_minisock *dmsk = dccp_msk(sk);
461 list_for_each_entry(entry, head, node) 272 u8 *copy;
462 if (entry->feat_num == feat && entry->is_local == local) { 273 int rc;
463 dccp_feat_val_destructor(entry->feat_num, &entry->val);
464 return entry;
465 } else if (entry->feat_num > feat) {
466 head = &entry->node;
467 break;
468 }
469 274
470 entry = kmalloc(sizeof(*entry), gfp_any()); 275 /* NN features must be Change L (sec. 6.3.2) */
471 if (entry != NULL) { 276 if (type != DCCPO_CHANGE_L) {
472 entry->feat_num = feat; 277 dccp_pr_debug("received %s for NN feature %d\n",
473 entry->is_local = local; 278 dccp_feat_typename(type), feature);
474 list_add_tail(&entry->node, head); 279 return -EFAULT;
475 } 280 }
476 return entry;
477}
478 281
479/** 282 /* XXX sanity check opt val */
480 * dccp_feat_push_change - Add/overwrite a Change option in the list
481 * @fn_list: feature-negotiation list to update
482 * @feat: one of %dccp_feature_numbers
483 * @local: whether local (1) or remote (0) @feat_num is meant
484 * @needs_mandatory: whether to use Mandatory feature negotiation options
485 * @fval: pointer to NN/SP value to be inserted (will be copied)
486 */
487static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
488 u8 mandatory, dccp_feat_val *fval)
489{
490 struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
491 283
492 if (new == NULL) 284 /* copy option so we can confirm it */
285 opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
286 if (opt == NULL)
493 return -ENOMEM; 287 return -ENOMEM;
494 288
495 new->feat_num = feat; 289 copy = kmemdup(val, len, GFP_ATOMIC);
496 new->is_local = local; 290 if (copy == NULL) {
497 new->state = FEAT_INITIALISING; 291 kfree(opt);
498 new->needs_confirm = 0; 292 return -ENOMEM;
499 new->empty_confirm = 0; 293 }
500 new->val = *fval;
501 new->needs_mandatory = mandatory;
502 294
503 return 0; 295 opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
504} 296 opt->dccpop_feat = feature;
297 opt->dccpop_val = copy;
298 opt->dccpop_len = len;
505 299
506/** 300 /* change feature */
507 * dccp_feat_push_confirm - Add a Confirm entry to the FN list 301 rc = dccp_feat_update(sk, type, feature, *val);
508 * @fn_list: feature-negotiation list to add to 302 if (rc) {
509 * @feat: one of %dccp_feature_numbers 303 kfree(opt->dccpop_val);
510 * @local: whether local (1) or remote (0) @feat_num is being confirmed 304 kfree(opt);
511 * @fval: pointer to NN/SP value to be inserted or NULL 305 return rc;
512 * Returns 0 on success, a Reset code for further processing otherwise. 306 }
513 */
514static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
515 dccp_feat_val *fval)
516{
517 struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
518 307
519 if (new == NULL) 308 dccp_feat_debug(type, feature, *copy);
520 return DCCP_RESET_CODE_TOO_BUSY;
521 309
522 new->feat_num = feat; 310 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
523 new->is_local = local;
524 new->state = FEAT_STABLE; /* transition in 6.6.2 */
525 new->needs_confirm = 1;
526 new->empty_confirm = (fval == NULL);
527 new->val.nn = 0; /* zeroes the whole structure */
528 if (!new->empty_confirm)
529 new->val = *fval;
530 new->needs_mandatory = 0;
531 311
532 return 0; 312 return 0;
533} 313}
534 314
535static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) 315static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk,
316 u8 type, u8 feature)
536{ 317{
537 return dccp_feat_push_confirm(fn_list, feat, local, NULL); 318 /* XXX check if other confirms for that are queued and recycle slot */
538} 319 struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
539 320
540static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) 321 if (opt == NULL) {
541{ 322 /* XXX what do we do? Ignoring should be fine. It's a change
542 list_del(&entry->node); 323 * after all =P
543 dccp_feat_entry_destructor(entry); 324 */
544} 325 return;
545
546void dccp_feat_list_purge(struct list_head *fn_list)
547{
548 struct dccp_feat_entry *entry, *next;
549
550 list_for_each_entry_safe(entry, next, fn_list, node)
551 dccp_feat_entry_destructor(entry);
552 INIT_LIST_HEAD(fn_list);
553}
554EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
555
556/* generate @to as full clone of @from - @to must not contain any nodes */
557int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
558{
559 struct dccp_feat_entry *entry, *new;
560
561 INIT_LIST_HEAD(to);
562 list_for_each_entry(entry, from, node) {
563 new = dccp_feat_clone_entry(entry);
564 if (new == NULL)
565 goto cloning_failed;
566 list_add_tail(&new->node, to);
567 } 326 }
568 return 0;
569 327
570cloning_failed: 328 switch (type) {
571 dccp_feat_list_purge(to); 329 case DCCPO_CHANGE_L:
572 return -ENOMEM; 330 opt->dccpop_type = DCCPO_CONFIRM_R;
573} 331 break;
332 case DCCPO_CHANGE_R:
333 opt->dccpop_type = DCCPO_CONFIRM_L;
334 break;
335 default:
336 DCCP_WARN("invalid type %d\n", type);
337 kfree(opt);
338 return;
339 }
340 opt->dccpop_feat = feature;
341 opt->dccpop_val = NULL;
342 opt->dccpop_len = 0;
574 343
575/** 344 /* change feature */
576 * dccp_feat_valid_nn_length - Enforce length constraints on NN options 345 dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature);
577 * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
578 * incoming options are accepted as long as their values are valid.
579 */
580static u8 dccp_feat_valid_nn_length(u8 feat_num)
581{
582 if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
583 return 2;
584 if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
585 return 6;
586 return 0;
587}
588 346
589static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) 347 list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
590{
591 switch (feat_num) {
592 case DCCPF_ACK_RATIO:
593 return val <= DCCPF_ACK_RATIO_MAX;
594 case DCCPF_SEQUENCE_WINDOW:
595 return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
596 }
597 return 0; /* feature unknown - so we can't tell */
598} 348}
599 349
600/* check that SP values are within the ranges defined in RFC 4340 */ 350static void dccp_feat_flush_confirm(struct sock *sk)
601static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
602{ 351{
603 switch (feat_num) { 352 struct dccp_minisock *dmsk = dccp_msk(sk);
604 case DCCPF_CCID: 353 /* Check if there is anything to confirm in the first place */
605 return val == DCCPC_CCID2 || val == DCCPC_CCID3; 354 int yes = !list_empty(&dmsk->dccpms_conf);
606 /* Type-check Boolean feature values: */
607 case DCCPF_SHORT_SEQNOS:
608 case DCCPF_ECN_INCAPABLE:
609 case DCCPF_SEND_ACK_VECTOR:
610 case DCCPF_SEND_NDP_COUNT:
611 case DCCPF_DATA_CHECKSUM:
612 case DCCPF_SEND_LEV_RATE:
613 return val < 2;
614 case DCCPF_MIN_CSUM_COVER:
615 return val < 16;
616 }
617 return 0; /* feature unknown */
618}
619 355
620static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) 356 if (!yes) {
621{ 357 struct dccp_opt_pend *opt;
622 if (sp_list == NULL || sp_len < 1)
623 return 0;
624 while (sp_len--)
625 if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
626 return 0;
627 return 1;
628}
629 358
630/** 359 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
631 * dccp_feat_insert_opts - Generate FN options from current list state 360 if (opt->dccpop_conf) {
632 * @skb: next sk_buff to be sent to the peer 361 yes = 1;
633 * @dp: for client during handshake and general negotiation 362 break;
634 * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
635 */
636int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
637 struct sk_buff *skb)
638{
639 struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
640 struct dccp_feat_entry *pos, *next;
641 u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
642 bool rpt;
643
644 /* put entries into @skb in the order they appear in the list */
645 list_for_each_entry_safe_reverse(pos, next, fn, node) {
646 opt = dccp_feat_genopt(pos);
647 type = dccp_feat_type(pos->feat_num);
648 rpt = false;
649
650 if (pos->empty_confirm) {
651 len = 0;
652 ptr = NULL;
653 } else {
654 if (type == FEAT_SP) {
655 len = pos->val.sp.len;
656 ptr = pos->val.sp.vec;
657 rpt = pos->needs_confirm;
658 } else if (type == FEAT_NN) {
659 len = dccp_feat_valid_nn_length(pos->feat_num);
660 ptr = nn_in_nbo;
661 dccp_encode_value_var(pos->val.nn, ptr, len);
662 } else {
663 DCCP_BUG("unknown feature %u", pos->feat_num);
664 return -1;
665 } 363 }
666 } 364 }
667 dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
668
669 if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
670 return -1;
671 if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
672 return -1;
673 /*
674 * Enter CHANGING after transmitting the Change option (6.6.2).
675 */
676 if (pos->state == FEAT_INITIALISING)
677 pos->state = FEAT_CHANGING;
678 } 365 }
679 return 0;
680}
681
682/**
683 * __feat_register_nn - Register new NN value on socket
684 * @fn: feature-negotiation list to register with
685 * @feat: an NN feature from %dccp_feature_numbers
686 * @mandatory: use Mandatory option if 1
687 * @nn_val: value to register (restricted to 4 bytes)
688 * Note that NN features are local by definition (RFC 4340, 6.3.2).
689 */
690static int __feat_register_nn(struct list_head *fn, u8 feat,
691 u8 mandatory, u64 nn_val)
692{
693 dccp_feat_val fval = { .nn = nn_val };
694
695 if (dccp_feat_type(feat) != FEAT_NN ||
696 !dccp_feat_is_valid_nn_val(feat, nn_val))
697 return -EINVAL;
698
699 /* Don't bother with default values, they will be activated anyway. */
700 if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
701 return 0;
702
703 return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
704}
705
706/**
707 * __feat_register_sp - Register new SP value/list on socket
708 * @fn: feature-negotiation list to register with
709 * @feat: an SP feature from %dccp_feature_numbers
710 * @is_local: whether the local (1) or the remote (0) @feat is meant
711 * @mandatory: use Mandatory option if 1
712 * @sp_val: SP value followed by optional preference list
713 * @sp_len: length of @sp_val in bytes
714 */
715static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
716 u8 mandatory, u8 const *sp_val, u8 sp_len)
717{
718 dccp_feat_val fval;
719 366
720 if (dccp_feat_type(feat) != FEAT_SP || 367 if (!yes)
721 !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) 368 return;
722 return -EINVAL;
723
724 /* Avoid negotiating alien CCIDs by only advertising supported ones */
725 if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
726 return -EOPNOTSUPP;
727
728 if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
729 return -ENOMEM;
730 369
731 return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); 370 /* OK there is something to confirm... */
371 /* XXX check if packet is in flight? Send delayed ack?? */
372 if (sk->sk_state == DCCP_OPEN)
373 dccp_send_ack(sk);
732} 374}
733 375
734/** 376int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
735 * dccp_feat_register_sp - Register requests to change SP feature values
736 * @sk: client or listening socket
737 * @feat: one of %dccp_feature_numbers
738 * @is_local: whether the local (1) or remote (0) @feat is meant
739 * @list: array of preferred values, in descending order of preference
740 * @len: length of @list in bytes
741 */
742int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
743 u8 const *list, u8 len)
744{ /* any changes must be registered before establishing the connection */
745 if (sk->sk_state != DCCP_CLOSED)
746 return -EISCONN;
747 if (dccp_feat_type(feat) != FEAT_SP)
748 return -EINVAL;
749 return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
750 0, list, len);
751}
752
753/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
754int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
755{ 377{
756 /* any changes must be registered before establishing the connection */ 378 int rc;
757 if (sk->sk_state != DCCP_CLOSED)
758 return -EISCONN;
759 if (dccp_feat_type(feat) != FEAT_NN)
760 return -EINVAL;
761 return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
762}
763 379
764/** 380 dccp_feat_debug(type, feature, *val);
765 * dccp_feat_signal_nn_change - Update NN values for an established connection
766 * @sk: DCCP socket of an established connection
767 * @feat: NN feature number from %dccp_feature_numbers
768 * @nn_val: the new value to use
769 * This function is used to communicate NN updates out-of-band. The difference
770 * to feature negotiation during connection setup is that values are activated
771 * immediately after validation, i.e. we don't wait for the Confirm: either the
772 * value is accepted by the peer (and then the waiting is futile), or it is not
773 * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values
774 * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2).
775 */
776int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
777{
778 struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
779 dccp_feat_val fval = { .nn = nn_val };
780 struct dccp_feat_entry *entry;
781 381
782 if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) 382 /* figure out if it's SP or NN feature */
783 return 0; 383 switch (feature) {
384 /* deal with SP features */
385 case DCCPF_CCID:
386 rc = dccp_feat_sp(sk, type, feature, val, len);
387 break;
784 388
785 if (dccp_feat_type(feat) != FEAT_NN || 389 /* deal with NN features */
786 !dccp_feat_is_valid_nn_val(feat, nn_val)) 390 case DCCPF_ACK_RATIO:
787 return -EINVAL; 391 rc = dccp_feat_nn(sk, type, feature, val, len);
392 break;
788 393
789 entry = dccp_feat_list_lookup(fn, feat, 1); 394 /* XXX implement other features */
790 if (entry != NULL) { 395 default:
791 dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", 396 dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n",
792 (unsigned long long)nn_val, 397 dccp_feat_typename(type), feature);
793 (unsigned long long)entry->val.nn, 398 rc = -EFAULT;
794 dccp_feat_sname[entry->state]); 399 break;
795 return 0;
796 } 400 }
797 401
798 if (dccp_feat_activate(sk, feat, 1, &fval)) 402 /* check if there were problems changing features */
799 return -EADV; 403 if (rc) {
800 404 /* If we don't agree on SP, we sent a confirm for old value.
801 inet_csk_schedule_ack(sk); 405 * However we propagate rc to caller in case option was
802 return dccp_feat_push_change(fn, feat, 1, 0, &fval); 406 * mandatory
803}
804EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
805
806/*
807 * Tracking features whose value depend on the choice of CCID
808 *
809 * This is designed with an extension in mind so that a list walk could be done
810 * before activating any features. However, the existing framework was found to
811 * work satisfactorily up until now, the automatic verification is left open.
812 * When adding new CCIDs, add a corresponding dependency table here.
813 */
814static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
815{
816 static const struct ccid_dependency ccid2_dependencies[2][2] = {
817 /*
818 * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
819 * feature and Send Ack Vector is an RX feature, `is_local'
820 * needs to be reversed.
821 */ 407 */
822 { /* Dependencies of the receiver-side (remote) CCID2 */ 408 if (rc != DCCP_FEAT_SP_NOAGREE)
823 { 409 dccp_feat_empty_confirm(dccp_msk(sk), type, feature);
824 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
825 .is_local = true,
826 .is_mandatory = true,
827 .val = 1
828 },
829 { 0, 0, 0, 0 }
830 },
831 { /* Dependencies of the sender-side (local) CCID2 */
832 {
833 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
834 .is_local = false,
835 .is_mandatory = true,
836 .val = 1
837 },
838 { 0, 0, 0, 0 }
839 }
840 };
841 static const struct ccid_dependency ccid3_dependencies[2][5] = {
842 { /*
843 * Dependencies of the receiver-side CCID3
844 */
845 { /* locally disable Ack Vectors */
846 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
847 .is_local = true,
848 .is_mandatory = false,
849 .val = 0
850 },
851 { /* see below why Send Loss Event Rate is on */
852 .dependent_feat = DCCPF_SEND_LEV_RATE,
853 .is_local = true,
854 .is_mandatory = true,
855 .val = 1
856 },
857 { /* NDP Count is needed as per RFC 4342, 6.1.1 */
858 .dependent_feat = DCCPF_SEND_NDP_COUNT,
859 .is_local = false,
860 .is_mandatory = true,
861 .val = 1
862 },
863 { 0, 0, 0, 0 },
864 },
865 { /*
866 * CCID3 at the TX side: we request that the HC-receiver
867 * will not send Ack Vectors (they will be ignored, so
868 * Mandatory is not set); we enable Send Loss Event Rate
869 * (Mandatory since the implementation does not support
870 * the Loss Intervals option of RFC 4342, 8.6).
871 * The last two options are for peer's information only.
872 */
873 {
874 .dependent_feat = DCCPF_SEND_ACK_VECTOR,
875 .is_local = false,
876 .is_mandatory = false,
877 .val = 0
878 },
879 {
880 .dependent_feat = DCCPF_SEND_LEV_RATE,
881 .is_local = false,
882 .is_mandatory = true,
883 .val = 1
884 },
885 { /* this CCID does not support Ack Ratio */
886 .dependent_feat = DCCPF_ACK_RATIO,
887 .is_local = true,
888 .is_mandatory = false,
889 .val = 0
890 },
891 { /* tell receiver we are sending NDP counts */
892 .dependent_feat = DCCPF_SEND_NDP_COUNT,
893 .is_local = true,
894 .is_mandatory = false,
895 .val = 1
896 },
897 { 0, 0, 0, 0 }
898 }
899 };
900 switch (ccid) {
901 case DCCPC_CCID2:
902 return ccid2_dependencies[is_local];
903 case DCCPC_CCID3:
904 return ccid3_dependencies[is_local];
905 default:
906 return NULL;
907 } 410 }
908}
909 411
910/** 412 /* generate the confirm [if required] */
911 * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID 413 dccp_feat_flush_confirm(sk);
912 * @fn: feature-negotiation list to update
913 * @id: CCID number to track
914 * @is_local: whether TX CCID (1) or RX CCID (0) is meant
915 * This function needs to be called after registering all other features.
916 */
917static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
918{
919 const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
920 int i, rc = (table == NULL);
921
922 for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
923 if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
924 rc = __feat_register_sp(fn, table[i].dependent_feat,
925 table[i].is_local,
926 table[i].is_mandatory,
927 &table[i].val, 1);
928 else
929 rc = __feat_register_nn(fn, table[i].dependent_feat,
930 table[i].is_mandatory,
931 table[i].val);
932 return rc;
933}
934
935/**
936 * dccp_feat_finalise_settings - Finalise settings before starting negotiation
937 * @dp: client or listening socket (settings will be inherited)
938 * This is called after all registrations (socket initialisation, sysctls, and
939 * sockopt calls), and before sending the first packet containing Change options
940 * (ie. client-Request or server-Response), to ensure internal consistency.
941 */
942int dccp_feat_finalise_settings(struct dccp_sock *dp)
943{
944 struct list_head *fn = &dp->dccps_featneg;
945 struct dccp_feat_entry *entry;
946 int i = 2, ccids[2] = { -1, -1 };
947 414
948 /* 415 return rc;
949 * Propagating CCIDs:
950 * 1) not useful to propagate CCID settings if this host advertises more
951 * than one CCID: the choice of CCID may still change - if this is
952 * the client, or if this is the server and the client sends
953 * singleton CCID values.
954 * 2) since is that propagate_ccid changes the list, we defer changing
955 * the sorted list until after the traversal.
956 */
957 list_for_each_entry(entry, fn, node)
958 if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
959 ccids[entry->is_local] = entry->val.sp.vec[0];
960 while (i--)
961 if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
962 return -1;
963 dccp_feat_print_fnlist(fn);
964 return 0;
965} 416}
966 417
967/** 418EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
968 * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
969 * It is the server which resolves the dependencies once the CCID has been
970 * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
971 */
972int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
973{
974 struct list_head *fn = &dreq->dreq_featneg;
975 struct dccp_feat_entry *entry;
976 u8 is_local, ccid;
977
978 for (is_local = 0; is_local <= 1; is_local++) {
979 entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
980
981 if (entry != NULL && !entry->empty_confirm)
982 ccid = entry->val.sp.vec[0];
983 else
984 ccid = dccp_feat_default_value(DCCPF_CCID);
985
986 if (dccp_feat_propagate_ccid(fn, ccid, is_local))
987 return -1;
988 }
989 return 0;
990}
991 419
992/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ 420int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
993static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) 421 u8 *val, u8 len)
994{ 422{
995 u8 c, s; 423 u8 t;
424 struct dccp_opt_pend *opt;
425 struct dccp_minisock *dmsk = dccp_msk(sk);
426 int found = 0;
427 int all_confirmed = 1;
996 428
997 for (s = 0; s < slen; s++) 429 dccp_feat_debug(type, feature, *val);
998 for (c = 0; c < clen; c++)
999 if (servlist[s] == clilist[c])
1000 return servlist[s];
1001 return -1;
1002}
1003 430
1004/** 431 /* locate our change request */
1005 * dccp_feat_prefer - Move preferred entry to the start of array 432 switch (type) {
1006 * Reorder the @array_len elements in @array so that @preferred_value comes 433 case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break;
1007 * first. Returns >0 to indicate that @preferred_value does occur in @array. 434 case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break;
1008 */ 435 default: DCCP_WARN("invalid type %d\n", type);
1009static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) 436 return 1;
1010{
1011 u8 i, does_occur = 0;
1012 437
1013 if (array != NULL) {
1014 for (i = 0; i < array_len; i++)
1015 if (array[i] == preferred_value) {
1016 array[i] = array[0];
1017 does_occur++;
1018 }
1019 if (does_occur)
1020 array[0] = preferred_value;
1021 } 438 }
1022 return does_occur; 439 /* XXX sanity check feature value */
1023}
1024 440
1025/** 441 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
1026 * dccp_feat_reconcile - Reconcile SP preference lists 442 if (!opt->dccpop_conf && opt->dccpop_type == t &&
1027 * @fval: SP list to reconcile into 443 opt->dccpop_feat == feature) {
1028 * @arr: received SP preference list 444 found = 1;
1029 * @len: length of @arr in bytes 445 dccp_pr_debug("feature %d found\n", opt->dccpop_feat);
1030 * @is_server: whether this side is the server (and @fv is the server's list)
1031 * @reorder: whether to reorder the list in @fv after reconciling with @arr
1032 * When successful, > 0 is returned and the reconciled list is in @fval.
1033 * A value of 0 means that negotiation failed (no shared entry).
1034 */
1035static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
1036 bool is_server, bool reorder)
1037{
1038 int rc;
1039 446
1040 if (!fv->sp.vec || !arr) { 447 /* XXX do sanity check */
1041 DCCP_CRIT("NULL feature value or array");
1042 return 0;
1043 }
1044 448
1045 if (is_server) 449 opt->dccpop_conf = 1;
1046 rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
1047 else
1048 rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
1049
1050 if (!reorder)
1051 return rc;
1052 if (rc < 0)
1053 return 0;
1054 450
1055 /* 451 /* We got a confirmation---change the option */
1056 * Reorder list: used for activating features and in dccp_insert_fn_opt. 452 dccp_feat_update(sk, opt->dccpop_type,
1057 */ 453 opt->dccpop_feat, *val);
1058 return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
1059}
1060 454
1061/** 455 /* XXX check the return value of dccp_feat_update */
1062 * dccp_feat_change_recv - Process incoming ChangeL/R options 456 break;
1063 * @fn: feature-negotiation list to update 457 }
1064 * @is_mandatory: whether the Change was preceded by a Mandatory option
1065 * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
1066 * @feat: one of %dccp_feature_numbers
1067 * @val: NN value or SP value/preference list
1068 * @len: length of @val in bytes
1069 * @server: whether this node is the server (1) or the client (0)
1070 */
1071static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
1072 u8 feat, u8 *val, u8 len, const bool server)
1073{
1074 u8 defval, type = dccp_feat_type(feat);
1075 const bool local = (opt == DCCPO_CHANGE_R);
1076 struct dccp_feat_entry *entry;
1077 dccp_feat_val fval;
1078
1079 if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
1080 goto unknown_feature_or_value;
1081
1082 dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
1083
1084 /*
1085 * Negotiation of NN features: Change R is invalid, so there is no
1086 * simultaneous negotiation; hence we do not look up in the list.
1087 */
1088 if (type == FEAT_NN) {
1089 if (local || len > sizeof(fval.nn))
1090 goto unknown_feature_or_value;
1091
1092 /* 6.3.2: "The feature remote MUST accept any valid value..." */
1093 fval.nn = dccp_decode_value_var(val, len);
1094 if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
1095 goto unknown_feature_or_value;
1096 458
1097 return dccp_feat_push_confirm(fn, feat, local, &fval); 459 if (!opt->dccpop_conf)
460 all_confirmed = 0;
1098 } 461 }
1099 462
1100 /* 463 /* fix re-transmit timer */
1101 * Unidirectional/simultaneous negotiation of SP features (6.3.1) 464 /* XXX gotta make sure that no option negotiation occurs during
465 * connection shutdown. Consider that the CLOSEREQ is sent and timer is
466 * on. if all options are confirmed it might kill timer which should
467 * remain alive until close is received.
1102 */ 468 */
1103 entry = dccp_feat_list_lookup(fn, feat, local); 469 if (all_confirmed) {
1104 if (entry == NULL) { 470 dccp_pr_debug("clear feat negotiation timer %p\n", sk);
1105 /* 471 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
1106 * No particular preferences have been registered. We deal with
1107 * this situation by assuming that all valid values are equally
1108 * acceptable, and apply the following checks:
1109 * - if the peer's list is a singleton, we accept a valid value;
1110 * - if we are the server, we first try to see if the peer (the
1111 * client) advertises the default value. If yes, we use it,
1112 * otherwise we accept the preferred value;
1113 * - else if we are the client, we use the first list element.
1114 */
1115 if (dccp_feat_clone_sp_val(&fval, val, 1))
1116 return DCCP_RESET_CODE_TOO_BUSY;
1117
1118 if (len > 1 && server) {
1119 defval = dccp_feat_default_value(feat);
1120 if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
1121 fval.sp.vec[0] = defval;
1122 } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
1123 kfree(fval.sp.vec);
1124 goto unknown_feature_or_value;
1125 }
1126
1127 /* Treat unsupported CCIDs like invalid values */
1128 if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
1129 kfree(fval.sp.vec);
1130 goto not_valid_or_not_known;
1131 }
1132
1133 return dccp_feat_push_confirm(fn, feat, local, &fval);
1134
1135 } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
1136 return 0;
1137 } 472 }
1138 473
1139 if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { 474 if (!found)
1140 entry->empty_confirm = 0; 475 dccp_pr_debug("%s(%d, ...) never requested\n",
1141 } else if (is_mandatory) { 476 dccp_feat_typename(type), feature);
1142 return DCCP_RESET_CODE_MANDATORY_ERROR;
1143 } else if (entry->state == FEAT_INITIALISING) {
1144 /*
1145 * Failed simultaneous negotiation (server only): try to `save'
1146 * the connection by checking whether entry contains the default
1147 * value for @feat. If yes, send an empty Confirm to signal that
1148 * the received Change was not understood - which implies using
1149 * the default value.
1150 * If this also fails, we use Reset as the last resort.
1151 */
1152 WARN_ON(!server);
1153 defval = dccp_feat_default_value(feat);
1154 if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
1155 return DCCP_RESET_CODE_OPTION_ERROR;
1156 entry->empty_confirm = 1;
1157 }
1158 entry->needs_confirm = 1;
1159 entry->needs_mandatory = 0;
1160 entry->state = FEAT_STABLE;
1161 return 0; 477 return 0;
1162
1163unknown_feature_or_value:
1164 if (!is_mandatory)
1165 return dccp_push_empty_confirm(fn, feat, local);
1166
1167not_valid_or_not_known:
1168 return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
1169 : DCCP_RESET_CODE_OPTION_ERROR;
1170} 478}
1171 479
1172/** 480EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
1173 * dccp_feat_confirm_recv - Process received Confirm options
1174 * @fn: feature-negotiation list to update
1175 * @is_mandatory: whether @opt was preceded by a Mandatory option
1176 * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
1177 * @feat: one of %dccp_feature_numbers
1178 * @val: NN value or SP value/preference list
1179 * @len: length of @val in bytes
1180 * @server: whether this node is server (1) or client (0)
1181 */
1182static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
1183 u8 feat, u8 *val, u8 len, const bool server)
1184{
1185 u8 *plist, plen, type = dccp_feat_type(feat);
1186 const bool local = (opt == DCCPO_CONFIRM_R);
1187 struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
1188
1189 dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
1190
1191 if (entry == NULL) { /* nothing queued: ignore or handle error */
1192 if (is_mandatory && type == FEAT_UNKNOWN)
1193 return DCCP_RESET_CODE_MANDATORY_ERROR;
1194
1195 if (!local && type == FEAT_NN) /* 6.3.2 */
1196 goto confirmation_failed;
1197 return 0;
1198 }
1199
1200 if (entry->state != FEAT_CHANGING) /* 6.6.2 */
1201 return 0;
1202
1203 if (len == 0) {
1204 if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
1205 goto confirmation_failed;
1206 /*
1207 * Empty Confirm during connection setup: this means reverting
1208 * to the `old' value, which in this case is the default. Since
1209 * we handle default values automatically when no other values
1210 * have been set, we revert to the old value by removing this
1211 * entry from the list.
1212 */
1213 dccp_feat_list_pop(entry);
1214 return 0;
1215 }
1216 481
1217 if (type == FEAT_NN) { 482void dccp_feat_clean(struct dccp_minisock *dmsk)
1218 if (len > sizeof(entry->val.nn)) 483{
1219 goto confirmation_failed; 484 struct dccp_opt_pend *opt, *next;
1220 485
1221 if (entry->val.nn == dccp_decode_value_var(val, len)) 486 list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
1222 goto confirmation_succeeded; 487 dccpop_node) {
488 BUG_ON(opt->dccpop_val == NULL);
489 kfree(opt->dccpop_val);
1223 490
1224 DCCP_WARN("Bogus Confirm for non-existing value\n"); 491 if (opt->dccpop_sc != NULL) {
1225 goto confirmation_failed; 492 BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
1226 } 493 kfree(opt->dccpop_sc->dccpoc_val);
494 kfree(opt->dccpop_sc);
495 }
1227 496
1228 /* 497 kfree(opt);
1229 * Parsing SP Confirms: the first element of @val is the preferred
1230 * SP value which the peer confirms, the remainder depends on @len.
1231 * Note that only the confirmed value need to be a valid SP value.
1232 */
1233 if (!dccp_feat_is_valid_sp_val(feat, *val))
1234 goto confirmation_failed;
1235
1236 if (len == 1) { /* peer didn't supply a preference list */
1237 plist = val;
1238 plen = len;
1239 } else { /* preferred value + preference list */
1240 plist = val + 1;
1241 plen = len - 1;
1242 } 498 }
499 INIT_LIST_HEAD(&dmsk->dccpms_pending);
1243 500
1244 /* Check whether the peer got the reconciliation right (6.6.8) */ 501 list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
1245 if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { 502 BUG_ON(opt == NULL);
1246 DCCP_WARN("Confirm selected the wrong value %u\n", *val); 503 if (opt->dccpop_val != NULL)
1247 return DCCP_RESET_CODE_OPTION_ERROR; 504 kfree(opt->dccpop_val);
505 kfree(opt);
1248 } 506 }
1249 entry->val.sp.vec[0] = *val; 507 INIT_LIST_HEAD(&dmsk->dccpms_conf);
1250
1251confirmation_succeeded:
1252 entry->state = FEAT_STABLE;
1253 return 0;
1254
1255confirmation_failed:
1256 DCCP_WARN("Confirmation failed\n");
1257 return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
1258 : DCCP_RESET_CODE_OPTION_ERROR;
1259} 508}
1260 509
1261/** 510EXPORT_SYMBOL_GPL(dccp_feat_clean);
1262 * dccp_feat_handle_nn_established - Fast-path reception of NN options 511
1263 * @sk: socket of an established DCCP connection 512/* this is to be called only when a listening sock creates its child. It is
1264 * @mandatory: whether @opt was preceded by a Mandatory option 513 * assumed by the function---the confirm is not duplicated, but rather it is
1265 * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) 514 * "passed on".
1266 * @feat: NN number, one of %dccp_feature_numbers
1267 * @val: NN value
1268 * @len: length of @val in bytes
1269 * This function combines the functionality of change_recv/confirm_recv, with
1270 * the following differences (reset codes are the same):
1271 * - cleanup after receiving the Confirm;
1272 * - values are directly activated after successful parsing;
1273 * - deliberately restricted to NN features.
1274 * The restriction to NN features is essential since SP features can have non-
1275 * predictable outcomes (depending on the remote configuration), and are inter-
1276 * dependent (CCIDs for instance cause further dependencies).
1277 */ 515 */
1278static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, 516int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
1279 u8 feat, u8 *val, u8 len)
1280{ 517{
1281 struct list_head *fn = &dccp_sk(sk)->dccps_featneg; 518 struct dccp_minisock *olddmsk = dccp_msk(oldsk);
1282 const bool local = (opt == DCCPO_CONFIRM_R); 519 struct dccp_minisock *newdmsk = dccp_msk(newsk);
1283 struct dccp_feat_entry *entry; 520 struct dccp_opt_pend *opt;
1284 u8 type = dccp_feat_type(feat); 521 int rc = 0;
1285 dccp_feat_val fval;
1286 522
1287 dccp_feat_print_opt(opt, feat, val, len, mandatory); 523 INIT_LIST_HEAD(&newdmsk->dccpms_pending);
524 INIT_LIST_HEAD(&newdmsk->dccpms_conf);
1288 525
1289 /* Ignore non-mandatory unknown and non-NN features */ 526 list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
1290 if (type == FEAT_UNKNOWN) { 527 struct dccp_opt_pend *newopt;
1291 if (local && !mandatory) 528 /* copy the value of the option */
1292 return 0; 529 u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC);
1293 goto fast_path_unknown;
1294 } else if (type != FEAT_NN) {
1295 return 0;
1296 }
1297
1298 /*
1299 * We don't accept empty Confirms, since in fast-path feature
1300 * negotiation the values are enabled immediately after sending
1301 * the Change option.
1302 * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
1303 */
1304 if (len == 0 || len > sizeof(fval.nn))
1305 goto fast_path_unknown;
1306
1307 if (opt == DCCPO_CHANGE_L) {
1308 fval.nn = dccp_decode_value_var(val, len);
1309 if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
1310 goto fast_path_unknown;
1311 530
1312 if (dccp_feat_push_confirm(fn, feat, local, &fval) || 531 if (val == NULL)
1313 dccp_feat_activate(sk, feat, local, &fval)) 532 goto out_clean;
1314 return DCCP_RESET_CODE_TOO_BUSY;
1315 533
1316 /* set the `Ack Pending' flag to piggyback a Confirm */ 534 newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC);
1317 inet_csk_schedule_ack(sk); 535 if (newopt == NULL) {
1318 536 kfree(val);
1319 } else if (opt == DCCPO_CONFIRM_R) { 537 goto out_clean;
1320 entry = dccp_feat_list_lookup(fn, feat, local);
1321 if (entry == NULL || entry->state != FEAT_CHANGING)
1322 return 0;
1323
1324 fval.nn = dccp_decode_value_var(val, len);
1325 if (fval.nn != entry->val.nn) {
1326 DCCP_WARN("Bogus Confirm for non-existing value\n");
1327 goto fast_path_failed;
1328 } 538 }
1329 539
1330 /* It has been confirmed - so remove the entry */ 540 /* insert the option */
1331 dccp_feat_list_pop(entry); 541 newopt->dccpop_val = val;
542 list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
1332 543
1333 } else { 544 /* XXX what happens with backlogs and multiple connections at
1334 DCCP_WARN("Received illegal option %u\n", opt); 545 * once...
1335 goto fast_path_failed; 546 */
547 /* the master socket no longer needs to worry about confirms */
548 opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */
549
550 /* reset state for a new socket */
551 opt->dccpop_conf = 0;
1336 } 552 }
1337 return 0;
1338 553
1339fast_path_unknown: 554 /* XXX not doing anything about the conf queue */
1340 if (!mandatory) 555
1341 return dccp_push_empty_confirm(fn, feat, local); 556out:
557 return rc;
1342 558
1343fast_path_failed: 559out_clean:
1344 return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR 560 dccp_feat_clean(newdmsk);
1345 : DCCP_RESET_CODE_OPTION_ERROR; 561 rc = -ENOMEM;
562 goto out;
1346} 563}
1347 564
1348/** 565EXPORT_SYMBOL_GPL(dccp_feat_clone);
1349 * dccp_feat_parse_options - Process Feature-Negotiation Options 566
1350 * @sk: for general use and used by the client during connection setup 567static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat,
1351 * @dreq: used by the server during connection setup 568 u8 *val, u8 len)
1352 * @mandatory: whether @opt was preceded by a Mandatory option
1353 * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
1354 * @feat: one of %dccp_feature_numbers
1355 * @val: value contents of @opt
1356 * @len: length of @val in bytes
1357 * Returns 0 on success, a Reset code for ending the connection otherwise.
1358 */
1359int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
1360 u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
1361{ 569{
1362 struct dccp_sock *dp = dccp_sk(sk); 570 int rc = -ENOMEM;
1363 struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; 571 u8 *copy = kmemdup(val, len, GFP_KERNEL);
1364 bool server = false;
1365 572
1366 switch (sk->sk_state) { 573 if (copy != NULL) {
1367 /* 574 rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL);
1368 * Negotiation during connection setup 575 if (rc)
1369 */ 576 kfree(copy);
1370 case DCCP_LISTEN:
1371 server = true; /* fall through */
1372 case DCCP_REQUESTING:
1373 switch (opt) {
1374 case DCCPO_CHANGE_L:
1375 case DCCPO_CHANGE_R:
1376 return dccp_feat_change_recv(fn, mandatory, opt, feat,
1377 val, len, server);
1378 case DCCPO_CONFIRM_R:
1379 case DCCPO_CONFIRM_L:
1380 return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
1381 val, len, server);
1382 }
1383 break;
1384 /*
1385 * Support for exchanging NN options on an established connection
1386 * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2)
1387 */
1388 case DCCP_OPEN:
1389 case DCCP_PARTOPEN:
1390 return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
1391 val, len);
1392 } 577 }
1393 return 0; /* ignore FN options in all other states */ 578 return rc;
1394} 579}
1395 580
1396/** 581int dccp_feat_init(struct dccp_minisock *dmsk)
1397 * dccp_feat_init - Seed feature negotiation with host-specific defaults
1398 * This initialises global defaults, depending on the value of the sysctls.
1399 * These can later be overridden by registering changes via setsockopt calls.
1400 * The last link in the chain is finalise_settings, to make sure that between
1401 * here and the start of actual feature negotiation no inconsistencies enter.
1402 *
1403 * All features not appearing below use either defaults or are otherwise
1404 * later adjusted through dccp_feat_finalise_settings().
1405 */
1406int dccp_feat_init(struct sock *sk)
1407{ 582{
1408 struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
1409 u8 on = 1, off = 0;
1410 int rc; 583 int rc;
1411 struct {
1412 u8 *val;
1413 u8 len;
1414 } tx, rx;
1415
1416 /* Non-negotiable (NN) features */
1417 rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
1418 sysctl_dccp_sequence_window);
1419 if (rc)
1420 return rc;
1421 584
1422 /* Server-priority (SP) features */ 585 INIT_LIST_HEAD(&dmsk->dccpms_pending);
1423 586 INIT_LIST_HEAD(&dmsk->dccpms_conf);
1424 /* Advertise that short seqnos are not supported (7.6.1) */
1425 rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
1426 if (rc)
1427 return rc;
1428 587
1429 /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ 588 /* CCID L */
1430 rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); 589 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID,
590 &dmsk->dccpms_tx_ccid, 1);
1431 if (rc) 591 if (rc)
1432 return rc; 592 goto out;
1433
1434 /*
1435 * We advertise the available list of CCIDs and reorder according to
1436 * preferences, to avoid failure resulting from negotiating different
1437 * singleton values (which always leads to failure).
1438 * These settings can still (later) be overridden via sockopts.
1439 */
1440 if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
1441 ccid_get_builtin_ccids(&rx.val, &rx.len))
1442 return -ENOBUFS;
1443
1444 /* Pre-load all CCID modules that are going to be advertised */
1445 rc = -EUNATCH;
1446 if (ccid_request_modules(tx.val, tx.len))
1447 goto free_ccid_lists;
1448
1449 if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
1450 !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
1451 goto free_ccid_lists;
1452 593
1453 rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); 594 /* CCID R */
595 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID,
596 &dmsk->dccpms_rx_ccid, 1);
1454 if (rc) 597 if (rc)
1455 goto free_ccid_lists; 598 goto out;
1456 599
1457 rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); 600 /* Ack ratio */
1458 601 rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
1459free_ccid_lists: 602 &dmsk->dccpms_ack_ratio, 1);
1460 kfree(tx.val); 603out:
1461 kfree(rx.val);
1462 return rc; 604 return rc;
1463} 605}
1464 606
1465int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) 607EXPORT_SYMBOL_GPL(dccp_feat_init);
1466{
1467 struct dccp_sock *dp = dccp_sk(sk);
1468 struct dccp_feat_entry *cur, *next;
1469 int idx;
1470 dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
1471 [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
1472 };
1473
1474 list_for_each_entry(cur, fn_list, node) {
1475 /*
1476 * An empty Confirm means that either an unknown feature type
1477 * or an invalid value was present. In the first case there is
1478 * nothing to activate, in the other the default value is used.
1479 */
1480 if (cur->empty_confirm)
1481 continue;
1482 608
1483 idx = dccp_feat_index(cur->feat_num); 609#ifdef CONFIG_IP_DCCP_DEBUG
1484 if (idx < 0) { 610const char *dccp_feat_typename(const u8 type)
1485 DCCP_BUG("Unknown feature %u", cur->feat_num); 611{
1486 goto activation_failed; 612 switch(type) {
1487 } 613 case DCCPO_CHANGE_L: return("ChangeL");
1488 if (cur->state != FEAT_STABLE) { 614 case DCCPO_CONFIRM_L: return("ConfirmL");
1489 DCCP_CRIT("Negotiation of %s %s failed in state %s", 615 case DCCPO_CHANGE_R: return("ChangeR");
1490 cur->is_local ? "local" : "remote", 616 case DCCPO_CONFIRM_R: return("ConfirmR");
1491 dccp_feat_fname(cur->feat_num), 617 /* the following case must not appear in feature negotation */
1492 dccp_feat_sname[cur->state]); 618 default: dccp_pr_debug("unknown type %d [BUG!]\n", type);
1493 goto activation_failed;
1494 }
1495 fvals[idx][cur->is_local] = &cur->val;
1496 } 619 }
620 return NULL;
621}
1497 622
1498 /* 623EXPORT_SYMBOL_GPL(dccp_feat_typename);
1499 * Activate in decreasing order of index, so that the CCIDs are always
1500 * activated as the last feature. This avoids the case where a CCID
1501 * relies on the initialisation of one or more features that it depends
1502 * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
1503 */
1504 for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
1505 if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
1506 __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
1507 DCCP_CRIT("Could not activate %d", idx);
1508 goto activation_failed;
1509 }
1510 624
1511 /* Clean up Change options which have been confirmed already */ 625const char *dccp_feat_name(const u8 feat)
1512 list_for_each_entry_safe(cur, next, fn_list, node) 626{
1513 if (!cur->needs_confirm) 627 static const char *feature_names[] = {
1514 dccp_feat_list_pop(cur); 628 [DCCPF_RESERVED] = "Reserved",
629 [DCCPF_CCID] = "CCID",
630 [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
631 [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
632 [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
633 [DCCPF_ACK_RATIO] = "Ack Ratio",
634 [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
635 [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
636 [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
637 [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
638 };
639 if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
640 return feature_names[DCCPF_RESERVED];
1515 641
1516 dccp_pr_debug("Activation OK\n"); 642 if (feat >= DCCPF_MIN_CCID_SPECIFIC)
1517 return 0; 643 return "CCID-specific";
1518 644
1519activation_failed: 645 return feature_names[feat];
1520 /*
1521 * We clean up everything that may have been allocated, since
1522 * it is difficult to track at which stage negotiation failed.
1523 * This is ok, since all allocation functions below are robust
1524 * against NULL arguments.
1525 */
1526 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
1527 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
1528 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
1529 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
1530 dp->dccps_hc_rx_ackvec = NULL;
1531 return -1;
1532} 646}
647
648EXPORT_SYMBOL_GPL(dccp_feat_name);
649#endif /* CONFIG_IP_DCCP_DEBUG */
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 2217066e22d7..e272222c7ace 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -3,134 +3,38 @@
3/* 3/*
4 * net/dccp/feat.h 4 * net/dccp/feat.h
5 * 5 *
6 * Feature negotiation for the DCCP protocol (RFC 4340, section 6) 6 * An implementation of the DCCP protocol
7 * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
8 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> 7 * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
9 * 8 *
10 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License version 2 as 10 * under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation. 11 * published by the Free Software Foundation.
13 */ 12 */
13
14#include <linux/types.h> 14#include <linux/types.h>
15#include "dccp.h" 15#include "dccp.h"
16 16
17/* 17#ifdef CONFIG_IP_DCCP_DEBUG
18 * Known limit values 18extern const char *dccp_feat_typename(const u8 type);
19 */ 19extern const char *dccp_feat_name(const u8 feat);
20/* Ack Ratio takes 2-byte integer values (11.3) */
21#define DCCPF_ACK_RATIO_MAX 0xFFFF
22/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
23#define DCCPF_SEQ_WMIN 32
24#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
25/* Maximum number of SP values that fit in a single (Confirm) option */
26#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
27
28enum dccp_feat_type {
29 FEAT_AT_RX = 1, /* located at RX side of half-connection */
30 FEAT_AT_TX = 2, /* located at TX side of half-connection */
31 FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
32 FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
33 FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
34};
35
36enum dccp_feat_state {
37 FEAT_DEFAULT = 0, /* using default values from 6.4 */
38 FEAT_INITIALISING, /* feature is being initialised */
39 FEAT_CHANGING, /* Change sent but not confirmed yet */
40 FEAT_UNSTABLE, /* local modification in state CHANGING */
41 FEAT_STABLE /* both ends (think they) agree */
42};
43 20
44/** 21static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
45 * dccp_feat_val - Container for SP or NN feature values
46 * @nn: single NN value
47 * @sp.vec: single SP value plus optional preference list
48 * @sp.len: length of @sp.vec in bytes
49 */
50typedef union {
51 u64 nn;
52 struct {
53 u8 *vec;
54 u8 len;
55 } sp;
56} dccp_feat_val;
57
58/**
59 * struct feat_entry - Data structure to perform feature negotiation
60 * @feat_num: one of %dccp_feature_numbers
61 * @val: feature's current value (SP features may have preference list)
62 * @state: feature's current state
63 * @needs_mandatory: whether Mandatory options should be sent
64 * @needs_confirm: whether to send a Confirm instead of a Change
65 * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
66 * @is_local: feature location (1) or feature-remote (0)
67 * @node: list pointers, entries arranged in FIFO order
68 */
69struct dccp_feat_entry {
70 u8 feat_num;
71 dccp_feat_val val;
72 enum dccp_feat_state state:8;
73 bool needs_mandatory:1,
74 needs_confirm:1,
75 empty_confirm:1,
76 is_local:1;
77
78 struct list_head node;
79};
80
81static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
82{ 22{
83 if (entry->needs_confirm) 23 dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type),
84 return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; 24 dccp_feat_name(feat), feat, val);
85 return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
86} 25}
26#else
27#define dccp_feat_debug(type, feat, val)
28#endif /* CONFIG_IP_DCCP_DEBUG */
29
30extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
31 u8 *val, u8 len, gfp_t gfp);
32extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
33 u8 *val, u8 len);
34extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
35 u8 *val, u8 len);
36extern void dccp_feat_clean(struct dccp_minisock *dmsk);
37extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
38extern int dccp_feat_init(struct dccp_minisock *dmsk);
87 39
88/**
89 * struct ccid_dependency - Track changes resulting from choosing a CCID
90 * @dependent_feat: one of %dccp_feature_numbers
91 * @is_local: local (1) or remote (0) @dependent_feat
92 * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
93 * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
94 */
95struct ccid_dependency {
96 u8 dependent_feat;
97 bool is_local:1,
98 is_mandatory:1;
99 u8 val;
100};
101
102/*
103 * Sysctls to seed defaults for feature negotiation
104 */
105extern unsigned long sysctl_dccp_sequence_window;
106extern int sysctl_dccp_rx_ccid;
107extern int sysctl_dccp_tx_ccid;
108
109extern int dccp_feat_init(struct sock *sk);
110extern void dccp_feat_initialise_sysctls(void);
111extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
112 u8 const *list, u8 len);
113extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
114extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
115 u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
116extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
117
118/*
119 * Encoding variable-length options and their maximum length.
120 *
121 * This affects NN options (SP options are all u8) and other variable-length
122 * options (see table 3 in RFC 4340). The limit is currently given the Sequence
123 * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
124 * options consume less than 6 bytes (timestamps are 4 bytes).
125 * When updating this constant (e.g. due to new internet drafts / RFCs), make
126 * sure that you also update all code which refers to it.
127 */
128#define DCCP_OPTVAL_MAXLEN 6
129
130extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
131extern u64 dccp_decode_value_var(const u8 *bf, const u8 len);
132
133extern int dccp_insert_option_mandatory(struct sk_buff *skb);
134extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
135 u8 *val, u8 len, bool repeat_first);
136#endif /* _DCCP_FEAT_H */ 40#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index df0e6714aa11..779d0ed9ae94 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0); 159 dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
160} 160}
161 161
162static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) 162static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
163{ 163{
164 struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; 164 struct dccp_sock *dp = dccp_sk(sk);
165 165
166 if (av == NULL) 166 if (dccp_msk(sk)->dccpms_send_ack_vector)
167 return; 167 dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
168 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) 168 DCCP_SKB_CB(skb)->dccpd_ack_seq);
169 dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
170 dccp_ackvec_input(av, skb);
171} 169}
172 170
173static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) 171static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -366,13 +364,22 @@ discard:
366int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, 364int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
367 const struct dccp_hdr *dh, const unsigned len) 365 const struct dccp_hdr *dh, const unsigned len)
368{ 366{
367 struct dccp_sock *dp = dccp_sk(sk);
368
369 if (dccp_check_seqno(sk, skb)) 369 if (dccp_check_seqno(sk, skb))
370 goto discard; 370 goto discard;
371 371
372 if (dccp_parse_options(sk, NULL, skb)) 372 if (dccp_parse_options(sk, NULL, skb))
373 return 1; 373 return 1;
374 374
375 dccp_handle_ackvec_processing(sk, skb); 375 if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
376 dccp_event_ack_recv(sk, skb);
377
378 if (dccp_msk(sk)->dccpms_send_ack_vector &&
379 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
380 DCCP_SKB_CB(skb)->dccpd_seq,
381 DCCP_ACKVEC_STATE_RECEIVED))
382 goto discard;
376 dccp_deliver_input_to_ccids(sk, skb); 383 dccp_deliver_input_to_ccids(sk, skb);
377 384
378 return __dccp_rcv_established(sk, skb, dh, len); 385 return __dccp_rcv_established(sk, skb, dh, len);
@@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
414 goto out_invalid_packet; 421 goto out_invalid_packet;
415 } 422 }
416 423
417 /*
418 * If option processing (Step 8) failed, return 1 here so that
419 * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
420 * the option type and is set in dccp_parse_options().
421 */
422 if (dccp_parse_options(sk, NULL, skb)) 424 if (dccp_parse_options(sk, NULL, skb))
423 return 1; 425 goto out_invalid_packet;
424 426
425 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ 427 /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
426 if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) 428 if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
427 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - 429 dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
428 dp->dccps_options_received.dccpor_timestamp_echo)); 430 dp->dccps_options_received.dccpor_timestamp_echo));
429 431
432 if (dccp_msk(sk)->dccpms_send_ack_vector &&
433 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
434 DCCP_SKB_CB(skb)->dccpd_seq,
435 DCCP_ACKVEC_STATE_RECEIVED))
436 goto out_invalid_packet; /* FIXME: change error code */
437
430 /* Stop the REQUEST timer */ 438 /* Stop the REQUEST timer */
431 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 439 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
432 WARN_ON(sk->sk_send_head == NULL); 440 WARN_ON(sk->sk_send_head == NULL);
433 kfree_skb(sk->sk_send_head); 441 kfree_skb(sk->sk_send_head);
434 sk->sk_send_head = NULL; 442 sk->sk_send_head = NULL;
435 443
444 dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
445 dccp_update_gsr(sk, dp->dccps_isr);
436 /* 446 /*
437 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect 447 * SWL and AWL are initially adjusted so that they are not less than
438 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH 448 * the initial Sequence Numbers received and sent, respectively:
439 * is done as part of activating the feature values below, since 449 * SWL := max(GSR + 1 - floor(W/4), ISR),
440 * these settings depend on the local/remote Sequence Window 450 * AWL := max(GSS - W' + 1, ISS).
441 * features, which were undefined or not confirmed until now. 451 * These adjustments MUST be applied only at the beginning of the
452 * connection.
453 *
454 * AWL was adjusted in dccp_v4_connect -acme
442 */ 455 */
443 dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; 456 dccp_set_seqno(&dp->dccps_swl,
457 max48(dp->dccps_swl, dp->dccps_isr));
444 458
445 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); 459 dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
446 460
@@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
461 */ 475 */
462 dccp_set_state(sk, DCCP_PARTOPEN); 476 dccp_set_state(sk, DCCP_PARTOPEN);
463 477
464 /*
465 * If feature negotiation was successful, activate features now;
466 * an activation failure means that this host could not activate
467 * one ore more features (e.g. insufficient memory), which would
468 * leave at least one feature in an undefined state.
469 */
470 if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
471 goto unable_to_proceed;
472
473 /* Make sure socket is routed, for correct metrics. */ 478 /* Make sure socket is routed, for correct metrics. */
474 icsk->icsk_af_ops->rebuild_header(sk); 479 icsk->icsk_af_ops->rebuild_header(sk);
475 480
@@ -504,16 +509,6 @@ out_invalid_packet:
504 /* dccp_v4_do_rcv will send a reset */ 509 /* dccp_v4_do_rcv will send a reset */
505 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; 510 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
506 return 1; 511 return 1;
507
508unable_to_proceed:
509 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
510 /*
511 * We mark this socket as no longer usable, so that the loop in
512 * dccp_sendmsg() terminates and the application gets notified.
513 */
514 dccp_set_state(sk, DCCP_CLOSED);
515 sk->sk_err = ECOMM;
516 return 1;
517} 512}
518 513
519static int dccp_rcv_respond_partopen_state_process(struct sock *sk, 514static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
@@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
595 if (inet_csk(sk)->icsk_af_ops->conn_request(sk, 590 if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
596 skb) < 0) 591 skb) < 0)
597 return 1; 592 return 1;
593
594 /* FIXME: do congestion control initialization */
598 goto discard; 595 goto discard;
599 } 596 }
600 if (dh->dccph_type == DCCP_PKT_RESET) 597 if (dh->dccph_type == DCCP_PKT_RESET)
@@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
603 /* Caller (dccp_v4_do_rcv) will send Reset */ 600 /* Caller (dccp_v4_do_rcv) will send Reset */
604 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; 601 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
605 return 1; 602 return 1;
606 } else if (sk->sk_state == DCCP_CLOSED) {
607 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
608 return 1;
609 } 603 }
610 604
611 /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ 605 if (sk->sk_state != DCCP_REQUESTING) {
612 if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) 606 if (dccp_check_seqno(sk, skb))
613 goto discard; 607 goto discard;
614 608
615 /* 609 /*
616 * Step 7: Check for unexpected packet types 610 * Step 8: Process options and mark acknowledgeable
617 * If (S.is_server and P.type == Response) 611 */
618 * or (S.is_client and P.type == Request) 612 if (dccp_parse_options(sk, NULL, skb))
619 * or (S.state == RESPOND and P.type == Data), 613 return 1;
620 * Send Sync packet acknowledging P.seqno
621 * Drop packet and return
622 */
623 if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
624 dh->dccph_type == DCCP_PKT_RESPONSE) ||
625 (dp->dccps_role == DCCP_ROLE_CLIENT &&
626 dh->dccph_type == DCCP_PKT_REQUEST) ||
627 (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
628 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
629 goto discard;
630 }
631 614
632 /* Step 8: Process options */ 615 if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
633 if (dccp_parse_options(sk, NULL, skb)) 616 dccp_event_ack_recv(sk, skb);
634 return 1; 617
618 if (dccp_msk(sk)->dccpms_send_ack_vector &&
619 dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
620 DCCP_SKB_CB(skb)->dccpd_seq,
621 DCCP_ACKVEC_STATE_RECEIVED))
622 goto discard;
623
624 dccp_deliver_input_to_ccids(sk, skb);
625 }
635 626
636 /* 627 /*
637 * Step 9: Process Reset 628 * Step 9: Process Reset
@@ -640,22 +631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
640 * S.state := TIMEWAIT 631 * S.state := TIMEWAIT
641 * Set TIMEWAIT timer 632 * Set TIMEWAIT timer
642 * Drop packet and return 633 * Drop packet and return
643 */ 634 */
644 if (dh->dccph_type == DCCP_PKT_RESET) { 635 if (dh->dccph_type == DCCP_PKT_RESET) {
645 dccp_rcv_reset(sk, skb); 636 dccp_rcv_reset(sk, skb);
646 return 0; 637 return 0;
647 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ 638 /*
639 * Step 7: Check for unexpected packet types
640 * If (S.is_server and P.type == Response)
641 * or (S.is_client and P.type == Request)
642 * or (S.state == RESPOND and P.type == Data),
643 * Send Sync packet acknowledging P.seqno
644 * Drop packet and return
645 */
646 } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
647 dh->dccph_type == DCCP_PKT_RESPONSE) ||
648 (dp->dccps_role == DCCP_ROLE_CLIENT &&
649 dh->dccph_type == DCCP_PKT_REQUEST) ||
650 (sk->sk_state == DCCP_RESPOND &&
651 dh->dccph_type == DCCP_PKT_DATA)) {
652 dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
653 goto discard;
654 } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
648 if (dccp_rcv_closereq(sk, skb)) 655 if (dccp_rcv_closereq(sk, skb))
649 return 0; 656 return 0;
650 goto discard; 657 goto discard;
651 } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ 658 } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
652 if (dccp_rcv_close(sk, skb)) 659 if (dccp_rcv_close(sk, skb))
653 return 0; 660 return 0;
654 goto discard; 661 goto discard;
655 } 662 }
656 663
657 switch (sk->sk_state) { 664 switch (sk->sk_state) {
665 case DCCP_CLOSED:
666 dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
667 return 1;
668
658 case DCCP_REQUESTING: 669 case DCCP_REQUESTING:
670 /* FIXME: do congestion control initialization */
671
659 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); 672 queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
660 if (queued >= 0) 673 if (queued >= 0)
661 return queued; 674 return queued;
@@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
663 __kfree_skb(skb); 676 __kfree_skb(skb);
664 return 0; 677 return 0;
665 678
666 case DCCP_PARTOPEN:
667 /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
668 dccp_handle_ackvec_processing(sk, skb);
669 dccp_deliver_input_to_ccids(sk, skb);
670 /* fall through */
671 case DCCP_RESPOND: 679 case DCCP_RESPOND:
680 case DCCP_PARTOPEN:
672 queued = dccp_rcv_respond_partopen_state_process(sk, skb, 681 queued = dccp_rcv_respond_partopen_state_process(sk, skb,
673 dh, len); 682 dh, len);
674 break; 683 break;
@@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta)
707 /* dccpor_elapsed_time is either zeroed out or set and > 0 */ 716 /* dccpor_elapsed_time is either zeroed out or set and > 0 */
708 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; 717 delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
709 718
710 return dccp_sane_rtt(delta); 719 if (unlikely(delta <= 0)) {
720 DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
721 return DCCP_SANE_RTT_MIN;
722 }
723 if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
724 DCCP_WARN("RTT sample %ld too large, using max\n", delta);
725 return DCCP_SANE_RTT_MAX;
726 }
727
728 return delta;
711} 729}
712 730
713EXPORT_SYMBOL_GPL(dccp_sample_rtt); 731EXPORT_SYMBOL_GPL(dccp_sample_rtt);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b623f6b25482..882c5c4de69e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -545,7 +545,6 @@ out:
545 545
546static void dccp_v4_reqsk_destructor(struct request_sock *req) 546static void dccp_v4_reqsk_destructor(struct request_sock *req)
547{ 547{
548 dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
549 kfree(inet_rsk(req)->opt); 548 kfree(inet_rsk(req)->opt);
550} 549}
551 550
@@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
596 if (req == NULL) 595 if (req == NULL)
597 goto drop; 596 goto drop;
598 597
599 if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 598 dccp_reqsk_init(req, skb);
600 goto drop_and_free;
601 599
602 dreq = dccp_rsk(req); 600 dreq = dccp_rsk(req);
603 if (dccp_parse_options(sk, dreq, skb)) 601 if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ad6212e00435..5e1ee0da2c40 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -302,7 +302,6 @@ done:
302 302
303static void dccp_v6_reqsk_destructor(struct request_sock *req) 303static void dccp_v6_reqsk_destructor(struct request_sock *req)
304{ 304{
305 dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
306 if (inet6_rsk(req)->pktopts != NULL) 305 if (inet6_rsk(req)->pktopts != NULL)
307 kfree_skb(inet6_rsk(req)->pktopts); 306 kfree_skb(inet6_rsk(req)->pktopts);
308} 307}
@@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
425 if (req == NULL) 424 if (req == NULL)
426 goto drop; 425 goto drop;
427 426
428 if (dccp_reqsk_init(req, dccp_sk(sk), skb)) 427 dccp_reqsk_init(req, skb);
429 goto drop_and_free;
430 428
431 dreq = dccp_rsk(req); 429 dreq = dccp_rsk(req);
432 if (dccp_parse_options(sk, dreq, skb)) 430 if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index f4d9c8f60ede..b2804e2d1b8c 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = {
42 42
43EXPORT_SYMBOL_GPL(dccp_death_row); 43EXPORT_SYMBOL_GPL(dccp_death_row);
44 44
45void dccp_minisock_init(struct dccp_minisock *dmsk)
46{
47 dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
48 dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid;
49 dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid;
50 dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio;
51 dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
52 dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count;
53}
54
45void dccp_time_wait(struct sock *sk, int state, int timeo) 55void dccp_time_wait(struct sock *sk, int state, int timeo)
46{ 56{
47 struct inet_timewait_sock *tw = NULL; 57 struct inet_timewait_sock *tw = NULL;
@@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
102 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 112 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
103 113
104 if (newsk != NULL) { 114 if (newsk != NULL) {
105 struct dccp_request_sock *dreq = dccp_rsk(req); 115 const struct dccp_request_sock *dreq = dccp_rsk(req);
106 struct inet_connection_sock *newicsk = inet_csk(newsk); 116 struct inet_connection_sock *newicsk = inet_csk(newsk);
107 struct dccp_sock *newdp = dccp_sk(newsk); 117 struct dccp_sock *newdp = dccp_sk(newsk);
118 struct dccp_minisock *newdmsk = dccp_msk(newsk);
108 119
109 newdp->dccps_role = DCCP_ROLE_SERVER; 120 newdp->dccps_role = DCCP_ROLE_SERVER;
110 newdp->dccps_hc_rx_ackvec = NULL; 121 newdp->dccps_hc_rx_ackvec = NULL;
@@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
114 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; 125 newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
115 newicsk->icsk_rto = DCCP_TIMEOUT_INIT; 126 newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
116 127
117 INIT_LIST_HEAD(&newdp->dccps_featneg); 128 if (dccp_feat_clone(sk, newsk))
129 goto out_free;
130
131 if (newdmsk->dccpms_send_ack_vector) {
132 newdp->dccps_hc_rx_ackvec =
133 dccp_ackvec_alloc(GFP_ATOMIC);
134 if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
135 goto out_free;
136 }
137
138 newdp->dccps_hc_rx_ccid =
139 ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
140 newsk, GFP_ATOMIC);
141 newdp->dccps_hc_tx_ccid =
142 ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
143 newsk, GFP_ATOMIC);
144 if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
145 newdp->dccps_hc_tx_ccid == NULL)) {
146 dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
147 ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
148 ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
149out_free:
150 /* It is still raw copy of parent, so invalidate
151 * destructor and make plain sk_free() */
152 newsk->sk_destruct = NULL;
153 sk_free(newsk);
154 return NULL;
155 }
156
118 /* 157 /*
119 * Step 3: Process LISTEN state 158 * Step 3: Process LISTEN state
120 * 159 *
121 * Choose S.ISS (initial seqno) or set from Init Cookies 160 * Choose S.ISS (initial seqno) or set from Init Cookies
122 * Initialize S.GAR := S.ISS 161 * Initialize S.GAR := S.ISS
123 * Set S.ISR, S.GSR from packet (or Init Cookies) 162 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
124 *
125 * Setting AWL/AWH and SWL/SWH happens as part of the feature
126 * activation below, as these windows all depend on the local
127 * and remote Sequence Window feature values (7.5.2).
128 */ 163 */
129 newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; 164
130 newdp->dccps_gar = newdp->dccps_iss; 165 /* See dccp_v4_conn_request */
131 newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; 166 newdmsk->dccpms_sequence_window = req->rcv_wnd;
167
168 newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
169 dccp_update_gss(newsk, dreq->dreq_iss);
170
171 newdp->dccps_isr = dreq->dreq_isr;
172 dccp_update_gsr(newsk, dreq->dreq_isr);
132 173
133 /* 174 /*
134 * Activate features: initialise CCIDs, sequence windows etc. 175 * SWL and AWL are initially adjusted so that they are not less than
176 * the initial Sequence Numbers received and sent, respectively:
177 * SWL := max(GSR + 1 - floor(W/4), ISR),
178 * AWL := max(GSS - W' + 1, ISS).
179 * These adjustments MUST be applied only at the beginning of the
180 * connection.
135 */ 181 */
136 if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { 182 dccp_set_seqno(&newdp->dccps_swl,
137 /* It is still raw copy of parent, so invalidate 183 max48(newdp->dccps_swl, newdp->dccps_isr));
138 * destructor and make plain sk_free() */ 184 dccp_set_seqno(&newdp->dccps_awl,
139 newsk->sk_destruct = NULL; 185 max48(newdp->dccps_awl, newdp->dccps_iss));
140 sk_free(newsk); 186
141 return NULL;
142 }
143 dccp_init_xmit_timers(newsk); 187 dccp_init_xmit_timers(newsk);
144 188
145 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); 189 DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
@@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
260 304
261EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); 305EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
262 306
263int dccp_reqsk_init(struct request_sock *req, 307void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
264 struct dccp_sock const *dp, struct sk_buff const *skb)
265{ 308{
266 struct dccp_request_sock *dreq = dccp_rsk(req); 309 struct dccp_request_sock *dreq = dccp_rsk(req);
267 310
268 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; 311 inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
269 inet_rsk(req)->acked = 0; 312 inet_rsk(req)->acked = 0;
313 req->rcv_wnd = sysctl_dccp_feat_sequence_window;
270 dreq->dreq_timestamp_echo = 0; 314 dreq->dreq_timestamp_echo = 0;
271
272 /* inherit feature negotiation options from listening socket */
273 return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
274} 315}
275 316
276EXPORT_SYMBOL_GPL(dccp_reqsk_init); 317EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index e5a32979d7d7..0809b63cb055 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,20 +23,23 @@
23#include "dccp.h" 23#include "dccp.h"
24#include "feat.h" 24#include "feat.h"
25 25
26u64 dccp_decode_value_var(const u8 *bf, const u8 len) 26int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
27int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID;
28int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID;
29int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO;
30int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
31int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT;
32
33static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
27{ 34{
28 u64 value = 0; 35 u32 value = 0;
29 36
30 if (len >= DCCP_OPTVAL_MAXLEN)
31 value += ((u64)*bf++) << 40;
32 if (len > 4)
33 value += ((u64)*bf++) << 32;
34 if (len > 3) 37 if (len > 3)
35 value += ((u64)*bf++) << 24; 38 value += *bf++ << 24;
36 if (len > 2) 39 if (len > 2)
37 value += ((u64)*bf++) << 16; 40 value += *bf++ << 16;
38 if (len > 1) 41 if (len > 1)
39 value += ((u64)*bf++) << 8; 42 value += *bf++ << 8;
40 if (len > 0) 43 if (len > 0)
41 value += *bf; 44 value += *bf;
42 45
@@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
54 struct dccp_sock *dp = dccp_sk(sk); 57 struct dccp_sock *dp = dccp_sk(sk);
55 const struct dccp_hdr *dh = dccp_hdr(skb); 58 const struct dccp_hdr *dh = dccp_hdr(skb);
56 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; 59 const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
60 u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
57 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); 61 unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
58 unsigned char *opt_ptr = options; 62 unsigned char *opt_ptr = options;
59 const unsigned char *opt_end = (unsigned char *)dh + 63 const unsigned char *opt_end = (unsigned char *)dh +
@@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
95 } 99 }
96 100
97 /* 101 /*
102 * CCID-Specific Options (from RFC 4340, sec. 10.3):
103 *
104 * Option numbers 128 through 191 are for options sent from the
105 * HC-Sender to the HC-Receiver; option numbers 192 through 255
106 * are for options sent from the HC-Receiver to the HC-Sender.
107 *
98 * CCID-specific options are ignored during connection setup, as 108 * CCID-specific options are ignored during connection setup, as
99 * negotiation may still be in progress (see RFC 4340, 10.3). 109 * negotiation may still be in progress (see RFC 4340, 10.3).
100 * The same applies to Ack Vectors, as these depend on the CCID. 110 * The same applies to Ack Vectors, as these depend on the CCID.
111 *
101 */ 112 */
102 if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || 113 if (dreq != NULL && (opt >= 128 ||
103 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) 114 opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
104 goto ignore_option; 115 goto ignore_option;
105 116
@@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
120 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), 131 dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
121 (unsigned long long)opt_recv->dccpor_ndp); 132 (unsigned long long)opt_recv->dccpor_ndp);
122 break; 133 break;
123 case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: 134 case DCCPO_CHANGE_L:
124 if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ 135 /* fall through */
136 case DCCPO_CHANGE_R:
137 if (pkt_type == DCCP_PKT_DATA)
125 break; 138 break;
126 rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, 139 if (len < 2)
127 *value, value + 1, len - 1); 140 goto out_invalid_option;
128 if (rc) 141 rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
129 goto out_featneg_failed; 142 len - 1);
143 /*
144 * When there is a change error, change_recv is
145 * responsible for dealing with it. i.e. reply with an
146 * empty confirm.
147 * If the change was mandatory, then we need to die.
148 */
149 if (rc && mandatory)
150 goto out_invalid_option;
151 break;
152 case DCCPO_CONFIRM_L:
153 /* fall through */
154 case DCCPO_CONFIRM_R:
155 if (pkt_type == DCCP_PKT_DATA)
156 break;
157 if (len < 2) /* FIXME this disallows empty confirm */
158 goto out_invalid_option;
159 if (dccp_feat_confirm_recv(sk, opt, *value,
160 value + 1, len - 1))
161 goto out_invalid_option;
162 break;
163 case DCCPO_ACK_VECTOR_0:
164 case DCCPO_ACK_VECTOR_1:
165 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
166 break;
167
168 if (dccp_msk(sk)->dccpms_send_ack_vector &&
169 dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
170 goto out_invalid_option;
130 break; 171 break;
131 case DCCPO_TIMESTAMP: 172 case DCCPO_TIMESTAMP:
132 if (len != 4) 173 if (len != 4)
@@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
154 dccp_role(sk), ntohl(opt_val), 195 dccp_role(sk), ntohl(opt_val),
155 (unsigned long long) 196 (unsigned long long)
156 DCCP_SKB_CB(skb)->dccpd_ack_seq); 197 DCCP_SKB_CB(skb)->dccpd_ack_seq);
157 /* schedule an Ack in case this sender is quiescent */
158 inet_csk_schedule_ack(sk);
159 break; 198 break;
160 case DCCPO_TIMESTAMP_ECHO: 199 case DCCPO_TIMESTAMP_ECHO:
161 if (len != 4 && len != 6 && len != 8) 200 if (len != 4 && len != 6 && len != 8)
@@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
212 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", 251 dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
213 dccp_role(sk), elapsed_time); 252 dccp_role(sk), elapsed_time);
214 break; 253 break;
215 case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: 254 case 128 ... 191: {
255 const u16 idx = value - options;
256
216 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, 257 if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
217 pkt_type, opt, value, len)) 258 opt, len, idx,
259 value) != 0)
218 goto out_invalid_option; 260 goto out_invalid_option;
261 }
219 break; 262 break;
220 case DCCPO_ACK_VECTOR_0: 263 case 192 ... 255: {
221 case DCCPO_ACK_VECTOR_1: 264 const u16 idx = value - options;
222 if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ 265
223 break;
224 /*
225 * Ack vectors are processed by the TX CCID if it is
226 * interested. The RX CCID need not parse Ack Vectors,
227 * since it is only interested in clearing old state.
228 * Fall through.
229 */
230 case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
231 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, 266 if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
232 pkt_type, opt, value, len)) 267 opt, len, idx,
268 value) != 0)
233 goto out_invalid_option; 269 goto out_invalid_option;
270 }
234 break; 271 break;
235 default: 272 default:
236 DCCP_CRIT("DCCP(%p): option %d(len=%d) not " 273 DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -252,10 +289,8 @@ out_nonsensical_length:
252 289
253out_invalid_option: 290out_invalid_option:
254 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); 291 DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
255 rc = DCCP_RESET_CODE_OPTION_ERROR; 292 DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
256out_featneg_failed: 293 DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len);
257 DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
258 DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
259 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; 294 DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
260 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; 295 DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
261 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; 296 DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
@@ -264,12 +299,9 @@ out_featneg_failed:
264 299
265EXPORT_SYMBOL_GPL(dccp_parse_options); 300EXPORT_SYMBOL_GPL(dccp_parse_options);
266 301
267void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) 302static void dccp_encode_value_var(const u32 value, unsigned char *to,
303 const unsigned int len)
268{ 304{
269 if (len >= DCCP_OPTVAL_MAXLEN)
270 *to++ = (value & 0xFF0000000000ull) >> 40;
271 if (len > 4)
272 *to++ = (value & 0xFF00000000ull) >> 32;
273 if (len > 3) 305 if (len > 3)
274 *to++ = (value & 0xFF000000) >> 24; 306 *to++ = (value & 0xFF000000) >> 24;
275 if (len > 2) 307 if (len > 2)
@@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
429 return 0; 461 return 0;
430} 462}
431 463
432static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) 464static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
465 u8 *val, u8 len)
433{ 466{
434 struct dccp_sock *dp = dccp_sk(sk); 467 u8 *to;
435 struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
436 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
437 const u16 buflen = dccp_ackvec_buflen(av);
438 /* Figure out how many options do we need to represent the ackvec */
439 const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
440 u16 len = buflen + 2 * nr_opts;
441 u8 i, nonce = 0;
442 const unsigned char *tail, *from;
443 unsigned char *to;
444 468
445 if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { 469 if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
446 DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, 470 DCCP_WARN("packet too small for feature %d option!\n", feat);
447 dccp_packet_name(dcb->dccpd_type));
448 return -1; 471 return -1;
449 } 472 }
450 /*
451 * Since Ack Vectors are variable-length, we can not always predict
452 * their size. To catch exception cases where the space is running out
453 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
454 */
455 if (len > DCCPAV_MIN_OPTLEN &&
456 len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
457 DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
458 "MPS=%u ==> reduce payload size?\n", len, skb->len,
459 dcb->dccpd_opt_len, dp->dccps_mss_cache);
460 dp->dccps_sync_scheduled = 1;
461 return 0;
462 }
463 dcb->dccpd_opt_len += len;
464 473
465 to = skb_push(skb, len); 474 DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
466 len = buflen;
467 from = av->av_buf + av->av_buf_head;
468 tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
469 475
470 for (i = 0; i < nr_opts; ++i) { 476 to = skb_push(skb, len + 3);
471 int copylen = len; 477 *to++ = type;
472 478 *to++ = len + 3;
473 if (len > DCCP_SINGLE_OPT_MAXLEN) 479 *to++ = feat;
474 copylen = DCCP_SINGLE_OPT_MAXLEN;
475
476 /*
477 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
478 * its type; ack_nonce is the sum of all individual buf_nonce's.
479 */
480 nonce ^= av->av_buf_nonce[i];
481
482 *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
483 *to++ = copylen + 2;
484
485 /* Check if buf_head wraps */
486 if (from + copylen > tail) {
487 const u16 tailsize = tail - from;
488
489 memcpy(to, from, tailsize);
490 to += tailsize;
491 len -= tailsize;
492 copylen -= tailsize;
493 from = av->av_buf;
494 }
495
496 memcpy(to, from, copylen);
497 from += copylen;
498 to += copylen;
499 len -= copylen;
500 }
501 /*
502 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
503 */
504 if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
505 return -ENOBUFS;
506 return 0;
507}
508 480
509/** 481 if (len)
510 * dccp_insert_option_mandatory - Mandatory option (5.8.2) 482 memcpy(to, val, len);
511 * Note that since we are using skb_push, this function needs to be called
512 * _after_ inserting the option it is supposed to influence (stack order).
513 */
514int dccp_insert_option_mandatory(struct sk_buff *skb)
515{
516 if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
517 return -1;
518 483
519 DCCP_SKB_CB(skb)->dccpd_opt_len++; 484 dccp_pr_debug("%s(%s (%d), ...), length %d\n",
520 *skb_push(skb, 1) = DCCPO_MANDATORY; 485 dccp_feat_typename(type),
486 dccp_feat_name(feat), feat, len);
521 return 0; 487 return 0;
522} 488}
523 489
524/** 490static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
525 * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
526 * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
527 * @feat: one out of %dccp_feature_numbers
528 * @val: NN value or SP array (preferred element first) to copy
529 * @len: true length of @val in bytes (excluding first element repetition)
530 * @repeat_first: whether to copy the first element of @val twice
531 * The last argument is used to construct Confirm options, where the preferred
532 * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
533 * lists are kept such that the preferred entry is always first, so we only need
534 * to copy twice, and avoid the overhead of cloning into a bigger array.
535 */
536int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
537 u8 *val, u8 len, bool repeat_first)
538{ 491{
539 u8 tot_len, *to; 492 struct dccp_sock *dp = dccp_sk(sk);
493 struct dccp_minisock *dmsk = dccp_msk(sk);
494 struct dccp_opt_pend *opt, *next;
495 int change = 0;
496
497 /* confirm any options [NN opts] */
498 list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
499 dccp_insert_feat_opt(skb, opt->dccpop_type,
500 opt->dccpop_feat, opt->dccpop_val,
501 opt->dccpop_len);
502 /* fear empty confirms */
503 if (opt->dccpop_val)
504 kfree(opt->dccpop_val);
505 kfree(opt);
506 }
507 INIT_LIST_HEAD(&dmsk->dccpms_conf);
508
509 /* see which features we need to send */
510 list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
511 /* see if we need to send any confirm */
512 if (opt->dccpop_sc) {
513 dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
514 opt->dccpop_feat,
515 opt->dccpop_sc->dccpoc_val,
516 opt->dccpop_sc->dccpoc_len);
517
518 BUG_ON(!opt->dccpop_sc->dccpoc_val);
519 kfree(opt->dccpop_sc->dccpoc_val);
520 kfree(opt->dccpop_sc);
521 opt->dccpop_sc = NULL;
522 }
540 523
541 /* take the `Feature' field and possible repetition into account */ 524 /* any option not confirmed, re-send it */
542 if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { 525 if (!opt->dccpop_conf) {
543 DCCP_WARN("length %u for feature %u too large\n", len, feat); 526 dccp_insert_feat_opt(skb, opt->dccpop_type,
544 return -1; 527 opt->dccpop_feat, opt->dccpop_val,
528 opt->dccpop_len);
529 change++;
530 }
545 } 531 }
546 532
547 if (unlikely(val == NULL || len == 0)) 533 /* Retransmit timer.
548 len = repeat_first = 0; 534 * If this is the master listening sock, we don't set a timer on it. It
549 tot_len = 3 + repeat_first + len; 535 * should be fine because if the dude doesn't receive our RESPONSE
536 * [which will contain the CHANGE] he will send another REQUEST which
537 * will "retrnasmit" the change.
538 */
539 if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
540 dccp_pr_debug("reset feat negotiation timer %p\n", sk);
550 541
551 if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { 542 /* XXX don't reset the timer on re-transmissions. I.e. reset it
552 DCCP_WARN("packet too small for feature %d option!\n", feat); 543 * only when sending new stuff i guess. Currently the timer
553 return -1; 544 * never backs off because on re-transmission it just resets it!
545 */
546 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
547 inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
554 } 548 }
555 DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
556
557 to = skb_push(skb, tot_len);
558 *to++ = type;
559 *to++ = tot_len;
560 *to++ = feat;
561 549
562 if (repeat_first)
563 *to++ = *val;
564 if (len)
565 memcpy(to, val, len);
566 return 0; 550 return 0;
567} 551}
568 552
@@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
581int dccp_insert_options(struct sock *sk, struct sk_buff *skb) 565int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
582{ 566{
583 struct dccp_sock *dp = dccp_sk(sk); 567 struct dccp_sock *dp = dccp_sk(sk);
568 struct dccp_minisock *dmsk = dccp_msk(sk);
584 569
585 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 570 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
586 571
587 if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) 572 if (dmsk->dccpms_send_ndp_count &&
573 dccp_insert_option_ndp(sk, skb))
588 return -1; 574 return -1;
589 575
590 if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { 576 if (!dccp_packet_without_ack(skb)) {
591 577 if (dmsk->dccpms_send_ack_vector &&
592 /* Feature Negotiation */ 578 dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
593 if (dccp_feat_insert_opts(dp, NULL, skb)) 579 dccp_insert_option_ackvec(sk, skb))
594 return -1; 580 return -1;
595
596 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
597 /*
598 * Obtain RTT sample from Request/Response exchange.
599 * This is currently used in CCID 3 initialisation.
600 */
601 if (dccp_insert_option_timestamp(sk, skb))
602 return -1;
603
604 } else if (dccp_ackvec_pending(sk) &&
605 dccp_insert_option_ackvec(sk, skb)) {
606 return -1;
607 }
608 } 581 }
609 582
610 if (dp->dccps_hc_rx_insert_options) { 583 if (dp->dccps_hc_rx_insert_options) {
@@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
613 dp->dccps_hc_rx_insert_options = 0; 586 dp->dccps_hc_rx_insert_options = 0;
614 } 587 }
615 588
589 /* Feature negotiation */
590 /* Data packets can't do feat negotiation */
591 if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
592 DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK &&
593 dccp_insert_options_feat(sk, skb))
594 return -1;
595
596 /*
597 * Obtain RTT sample from Request/Response exchange.
598 * This is currently used in CCID 3 initialisation.
599 */
600 if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST &&
601 dccp_insert_option_timestamp(sk, skb))
602 return -1;
603
616 if (dp->dccps_timestamp_echo != 0 && 604 if (dp->dccps_timestamp_echo != 0 &&
617 dccp_insert_option_timestamp_echo(dp, NULL, skb)) 605 dccp_insert_option_timestamp_echo(dp, NULL, skb))
618 return -1; 606 return -1;
@@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
625{ 613{
626 DCCP_SKB_CB(skb)->dccpd_opt_len = 0; 614 DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
627 615
628 if (dccp_feat_insert_opts(NULL, dreq, skb))
629 return -1;
630
631 if (dreq->dreq_timestamp_echo != 0 && 616 if (dreq->dreq_timestamp_echo != 0 &&
632 dccp_insert_option_timestamp_echo(NULL, dreq, skb)) 617 dccp_insert_option_timestamp_echo(NULL, dreq, skb))
633 return -1; 618 return -1;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 2532797a8009..d06945c7d3df 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk)
26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 26 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
27} 27}
28 28
29/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ 29static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
30static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
31{ 30{
32 skb_set_owner_w(skb, sk); 31 skb_set_owner_w(skb, sk);
33 WARN_ON(sk->sk_send_head); 32 WARN_ON(sk->sk_send_head);
34 sk->sk_send_head = skb; 33 sk->sk_send_head = skb;
35 return skb_clone(sk->sk_send_head, gfp_any());
36} 34}
37 35
38/* 36/*
@@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
163 struct inet_connection_sock *icsk = inet_csk(sk); 161 struct inet_connection_sock *icsk = inet_csk(sk);
164 struct dccp_sock *dp = dccp_sk(sk); 162 struct dccp_sock *dp = dccp_sk(sk);
165 u32 ccmps = dccp_determine_ccmps(dp); 163 u32 ccmps = dccp_determine_ccmps(dp);
166 u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 164 int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
167 165
168 /* Account for header lengths and IPv4/v6 option overhead */ 166 /* Account for header lengths and IPv4/v6 option overhead */
169 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + 167 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
170 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); 168 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
171 169
172 /* 170 /*
173 * Leave enough headroom for common DCCP header options. 171 * FIXME: this should come from the CCID infrastructure, where, say,
174 * This only considers options which may appear on DCCP-Data packets, as 172 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
175 * per table 3 in RFC 4340, 5.8. When running out of space for other 173 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
176 * options (eg. Ack Vector which can take up to 255 bytes), it is better 174 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
177 * to schedule a separate Ack. Thus we leave headroom for the following: 175 * make it a multiple of 4
178 * - 1 byte for Slow Receiver (11.6)
179 * - 6 bytes for Timestamp (13.1)
180 * - 10 bytes for Timestamp Echo (13.3)
181 * - 8 bytes for NDP count (7.7, when activated)
182 * - 6 bytes for Data Checksum (9.3)
183 * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
184 */ 176 */
185 cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + 177
186 (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); 178 cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
187 179
188 /* And store cached results */ 180 /* And store cached results */
189 icsk->icsk_pmtu_cookie = pmtu; 181 icsk->icsk_pmtu_cookie = pmtu;
@@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk)
208} 200}
209 201
210/** 202/**
211 * dccp_wait_for_ccid - Await CCID send permission 203 * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
212 * @sk: socket to wait for 204 * @sk: socket to wait for
213 * @delay: timeout in jiffies 205 * @skb: current skb to pass on for waiting
214 * This is used by CCIDs which need to delay the send time in process context. 206 * @delay: sleep timeout in milliseconds (> 0)
207 * This function is called by default when the socket is closed, and
208 * when a non-zero linger time is set on the socket.
215 */ 209 */
216static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) 210static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
217{ 211{
212 struct dccp_sock *dp = dccp_sk(sk);
218 DEFINE_WAIT(wait); 213 DEFINE_WAIT(wait);
219 long remaining; 214 unsigned long jiffdelay;
220 215 int rc;
221 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
222 sk->sk_write_pending++;
223 release_sock(sk);
224 216
225 remaining = schedule_timeout(delay); 217 do {
226 218 dccp_pr_debug("delayed send by %d msec\n", delay);
227 lock_sock(sk); 219 jiffdelay = msecs_to_jiffies(delay);
228 sk->sk_write_pending--;
229 finish_wait(sk->sk_sleep, &wait);
230 220
231 if (signal_pending(current) || sk->sk_err) 221 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
232 return -1;
233 return remaining;
234}
235
236/**
237 * dccp_xmit_packet - Send data packet under control of CCID
238 * Transmits next-queued payload and informs CCID to account for the packet.
239 */
240static void dccp_xmit_packet(struct sock *sk)
241{
242 int err, len;
243 struct dccp_sock *dp = dccp_sk(sk);
244 struct sk_buff *skb = dccp_qpolicy_pop(sk);
245 222
246 if (unlikely(skb == NULL)) 223 sk->sk_write_pending++;
247 return; 224 release_sock(sk);
248 len = skb->len; 225 schedule_timeout(jiffdelay);
226 lock_sock(sk);
227 sk->sk_write_pending--;
249 228
250 if (sk->sk_state == DCCP_PARTOPEN) { 229 if (sk->sk_err)
251 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; 230 goto do_error;
252 /* 231 if (signal_pending(current))
253 * See 8.1.5 - Handshake Completion. 232 goto do_interrupted;
254 *
255 * For robustness we resend Confirm options until the client has
256 * entered OPEN. During the initial feature negotiation, the MPS
257 * is smaller than usual, reduced by the Change/Confirm options.
258 */
259 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
260 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
261 dccp_send_ack(sk);
262 dccp_feat_list_purge(&dp->dccps_featneg);
263 }
264 233
265 inet_csk_schedule_ack(sk); 234 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
266 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 235 } while ((delay = rc) > 0);
267 inet_csk(sk)->icsk_rto, 236out:
268 DCCP_RTO_MAX); 237 finish_wait(sk->sk_sleep, &wait);
269 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 238 return rc;
270 } else if (dccp_ack_pending(sk)) { 239
271 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; 240do_error:
272 } else { 241 rc = -EPIPE;
273 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; 242 goto out;
274 } 243do_interrupted:
275 244 rc = -EINTR;
276 err = dccp_transmit_skb(sk, skb); 245 goto out;
277 if (err)
278 dccp_pr_debug("transmit_skb() returned err=%d\n", err);
279 /*
280 * Register this one as sent even if an error occurred. To the remote
281 * end a local packet drop is indistinguishable from network loss, i.e.
282 * any local drop will eventually be reported via receiver feedback.
283 */
284 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
285
286 /*
287 * If the CCID needs to transfer additional header options out-of-band
288 * (e.g. Ack Vectors or feature-negotiation options), it activates this
289 * flag to schedule a Sync. The Sync will automatically incorporate all
290 * currently pending header options, thus clearing the backlog.
291 */
292 if (dp->dccps_sync_scheduled)
293 dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
294} 246}
295 247
296/** 248void dccp_write_xmit(struct sock *sk, int block)
297 * dccp_flush_write_queue - Drain queue at end of connection
298 * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
299 * happen that the TX queue is not empty at the end of a connection. We give the
300 * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
301 * returns with a non-empty write queue, it will be purged later.
302 */
303void dccp_flush_write_queue(struct sock *sk, long *time_budget)
304{ 249{
305 struct dccp_sock *dp = dccp_sk(sk); 250 struct dccp_sock *dp = dccp_sk(sk);
306 struct sk_buff *skb; 251 struct sk_buff *skb;
307 long delay, rc;
308
309 while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
310 rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
311 252
312 switch (ccid_packet_dequeue_eval(rc)) { 253 while ((skb = skb_peek(&sk->sk_write_queue))) {
313 case CCID_PACKET_WILL_DEQUEUE_LATER: 254 int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
314 /* 255
315 * If the CCID determines when to send, the next sending 256 if (err > 0) {
316 * time is unknown or the CCID may not even send again 257 if (!block) {
317 * (e.g. remote host crashes or lost Ack packets). 258 sk_reset_timer(sk, &dp->dccps_xmit_timer,
318 */ 259 msecs_to_jiffies(err)+jiffies);
319 DCCP_WARN("CCID did not manage to send all packets\n"); 260 break;
320 return; 261 } else
321 case CCID_PACKET_DELAY: 262 err = dccp_wait_for_ccid(sk, skb, err);
322 delay = msecs_to_jiffies(rc); 263 if (err && err != -EINTR)
323 if (delay > *time_budget) 264 DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
324 return;
325 rc = dccp_wait_for_ccid(sk, delay);
326 if (rc < 0)
327 return;
328 *time_budget -= (delay - rc);
329 /* check again if we can send now */
330 break;
331 case CCID_PACKET_SEND_AT_ONCE:
332 dccp_xmit_packet(sk);
333 break;
334 case CCID_PACKET_ERR:
335 skb_dequeue(&sk->sk_write_queue);
336 kfree_skb(skb);
337 dccp_pr_debug("packet discarded due to err=%ld\n", rc);
338 } 265 }
339 }
340}
341 266
342void dccp_write_xmit(struct sock *sk) 267 skb_dequeue(&sk->sk_write_queue);
343{ 268 if (err == 0) {
344 struct dccp_sock *dp = dccp_sk(sk); 269 struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
345 struct sk_buff *skb; 270 const int len = skb->len;
346 271
347 while ((skb = dccp_qpolicy_top(sk))) { 272 if (sk->sk_state == DCCP_PARTOPEN) {
348 int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); 273 /* See 8.1.5. Handshake Completion */
349 274 inet_csk_schedule_ack(sk);
350 switch (ccid_packet_dequeue_eval(rc)) { 275 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
351 case CCID_PACKET_WILL_DEQUEUE_LATER: 276 inet_csk(sk)->icsk_rto,
352 return; 277 DCCP_RTO_MAX);
353 case CCID_PACKET_DELAY: 278 dcb->dccpd_type = DCCP_PKT_DATAACK;
354 sk_reset_timer(sk, &dp->dccps_xmit_timer, 279 } else if (dccp_ack_pending(sk))
355 jiffies + msecs_to_jiffies(rc)); 280 dcb->dccpd_type = DCCP_PKT_DATAACK;
356 return; 281 else
357 case CCID_PACKET_SEND_AT_ONCE: 282 dcb->dccpd_type = DCCP_PKT_DATA;
358 dccp_xmit_packet(sk); 283
359 break; 284 err = dccp_transmit_skb(sk, skb);
360 case CCID_PACKET_ERR: 285 ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
361 dccp_qpolicy_drop(sk, skb); 286 if (err)
362 dccp_pr_debug("packet discarded due to err=%d\n", rc); 287 DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
288 err);
289 } else {
290 dccp_pr_debug("packet discarded due to err=%d\n", err);
291 kfree_skb(skb);
363 } 292 }
364 } 293 }
365} 294}
@@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
410 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; 339 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
411 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; 340 DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
412 341
413 /* Resolve feature dependencies resulting from choice of CCID */ 342 if (dccp_insert_options_rsk(dreq, skb)) {
414 if (dccp_feat_server_ccid_dependencies(dreq)) 343 kfree_skb(skb);
415 goto response_failed; 344 return NULL;
416 345 }
417 if (dccp_insert_options_rsk(dreq, skb))
418 goto response_failed;
419 346
420 /* Build and checksum header */ 347 /* Build and checksum header */
421 dh = dccp_zeroed_hdr(skb, dccp_header_size); 348 dh = dccp_zeroed_hdr(skb, dccp_header_size);
@@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
436 inet_rsk(req)->acked = 1; 363 inet_rsk(req)->acked = 1;
437 DCCP_INC_STATS(DCCP_MIB_OUTSEGS); 364 DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
438 return skb; 365 return skb;
439response_failed:
440 kfree_skb(skb);
441 return NULL;
442} 366}
443 367
444EXPORT_SYMBOL_GPL(dccp_make_response); 368EXPORT_SYMBOL_GPL(dccp_make_response);
@@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
523/* 447/*
524 * Do all connect socket setups that can be done AF independent. 448 * Do all connect socket setups that can be done AF independent.
525 */ 449 */
526int dccp_connect(struct sock *sk) 450static inline void dccp_connect_init(struct sock *sk)
527{ 451{
528 struct sk_buff *skb;
529 struct dccp_sock *dp = dccp_sk(sk); 452 struct dccp_sock *dp = dccp_sk(sk);
530 struct dst_entry *dst = __sk_dst_get(sk); 453 struct dst_entry *dst = __sk_dst_get(sk);
531 struct inet_connection_sock *icsk = inet_csk(sk); 454 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk)
535 458
536 dccp_sync_mss(sk, dst_mtu(dst)); 459 dccp_sync_mss(sk, dst_mtu(dst));
537 460
538 /* do not connect if feature negotiation setup fails */
539 if (dccp_feat_finalise_settings(dccp_sk(sk)))
540 return -EPROTO;
541
542 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ 461 /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
543 dp->dccps_gar = dp->dccps_iss; 462 dp->dccps_gar = dp->dccps_iss;
544 463
464 icsk->icsk_retransmits = 0;
465}
466
467int dccp_connect(struct sock *sk)
468{
469 struct sk_buff *skb;
470 struct inet_connection_sock *icsk = inet_csk(sk);
471
472 dccp_connect_init(sk);
473
545 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); 474 skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
546 if (unlikely(skb == NULL)) 475 if (unlikely(skb == NULL))
547 return -ENOBUFS; 476 return -ENOBUFS;
@@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk)
551 480
552 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; 481 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
553 482
554 dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); 483 dccp_skb_entail(sk, skb);
484 dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
555 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); 485 DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
556 486
557 /* Timer for repeating the REQUEST until an answer. */ 487 /* Timer for repeating the REQUEST until an answer. */
558 icsk->icsk_retransmits = 0;
559 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 488 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
560 icsk->icsk_rto, DCCP_RTO_MAX); 489 icsk->icsk_rto, DCCP_RTO_MAX);
561 return 0; 490 return 0;
@@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
642 DCCP_SKB_CB(skb)->dccpd_type = pkt_type; 571 DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
643 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; 572 DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
644 573
645 /*
646 * Clear the flag in case the Sync was scheduled for out-of-band data,
647 * such as carrying a long Ack Vector.
648 */
649 dccp_sk(sk)->dccps_sync_scheduled = 0;
650
651 dccp_transmit_skb(sk, skb); 574 dccp_transmit_skb(sk, skb);
652} 575}
653 576
@@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active)
676 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; 599 DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
677 600
678 if (active) { 601 if (active) {
679 skb = dccp_skb_entail(sk, skb); 602 dccp_write_xmit(sk, 1);
603 dccp_skb_entail(sk, skb);
604 dccp_transmit_skb(sk, skb_clone(skb, prio));
680 /* 605 /*
681 * Retransmission timer for active-close: RFC 4340, 8.3 requires 606 * Retransmission timer for active-close: RFC 4340, 8.3 requires
682 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ 607 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
@@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active)
689 */ 614 */
690 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 615 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
691 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); 616 DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
692 } 617 } else
693 dccp_transmit_skb(sk, skb); 618 dccp_transmit_skb(sk, skb);
694} 619}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index eaa59d82ab0f..81368a7f5379 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -46,54 +46,75 @@ static struct {
46 struct kfifo *fifo; 46 struct kfifo *fifo;
47 spinlock_t lock; 47 spinlock_t lock;
48 wait_queue_head_t wait; 48 wait_queue_head_t wait;
49 ktime_t start; 49 struct timespec tstart;
50} dccpw; 50} dccpw;
51 51
52static void jdccp_write_xmit(struct sock *sk) 52static void printl(const char *fmt, ...)
53{ 53{
54 const struct inet_sock *inet = inet_sk(sk); 54 va_list args;
55 struct ccid3_hc_tx_sock *hctx = NULL; 55 int len;
56 struct timespec tv; 56 struct timespec now;
57 char buf[256]; 57 char tbuf[256];
58 int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk));
59 58
60 if (ccid == DCCPC_CCID3) 59 va_start(args, fmt);
61 hctx = ccid3_hc_tx_sk(sk); 60 getnstimeofday(&now);
62 61
63 if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { 62 now = timespec_sub(now, dccpw.tstart);
64 63
65 tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); 64 len = sprintf(tbuf, "%lu.%06lu ",
66 len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", 65 (unsigned long) now.tv_sec,
67 (unsigned long)tv.tv_sec, 66 (unsigned long) now.tv_nsec / NSEC_PER_USEC);
68 (unsigned long)tv.tv_nsec, 67 len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
69 NIPQUAD(inet->saddr), ntohs(inet->sport), 68 va_end(args);
70 NIPQUAD(inet->daddr), ntohs(inet->dport), ccid);
71 69
70 kfifo_put(dccpw.fifo, tbuf, len);
71 wake_up(&dccpw.wait);
72}
73
74static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
75 struct msghdr *msg, size_t size)
76{
77 const struct dccp_minisock *dmsk = dccp_msk(sk);
78 const struct inet_sock *inet = inet_sk(sk);
79 const struct ccid3_hc_tx_sock *hctx;
80
81 if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
82 hctx = ccid3_hc_tx_sk(sk);
83 else
84 hctx = NULL;
85
86 if (port == 0 || ntohs(inet->dport) == port ||
87 ntohs(inet->sport) == port) {
72 if (hctx) 88 if (hctx)
73 len += sprintf(buf + len, " %d %d %d %u %u %u %d", 89 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u "
74 hctx->s, hctx->rtt, hctx->p, hctx->x_calc, 90 "%llu %llu %d\n",
75 (unsigned)(hctx->x_recv >> 6), 91 NIPQUAD(inet->saddr), ntohs(inet->sport),
76 (unsigned)(hctx->x >> 6), hctx->t_ipi); 92 NIPQUAD(inet->daddr), ntohs(inet->dport), size,
77 93 hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
78 len += sprintf(buf + len, "\n"); 94 hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc,
79 kfifo_put(dccpw.fifo, buf, len); 95 hctx->ccid3hctx_x_recv >> 6,
80 wake_up(&dccpw.wait); 96 hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi);
97 else
98 printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
99 NIPQUAD(inet->saddr), ntohs(inet->sport),
100 NIPQUAD(inet->daddr), ntohs(inet->dport), size);
81 } 101 }
82 102
83 jprobe_return(); 103 jprobe_return();
104 return 0;
84} 105}
85 106
86static struct jprobe dccp_send_probe = { 107static struct jprobe dccp_send_probe = {
87 .kp = { 108 .kp = {
88 .symbol_name = "dccp_write_xmit", 109 .symbol_name = "dccp_sendmsg",
89 }, 110 },
90 .entry = jdccp_write_xmit, 111 .entry = jdccp_sendmsg,
91}; 112};
92 113
93static int dccpprobe_open(struct inode *inode, struct file *file) 114static int dccpprobe_open(struct inode *inode, struct file *file)
94{ 115{
95 kfifo_reset(dccpw.fifo); 116 kfifo_reset(dccpw.fifo);
96 dccpw.start = ktime_get(); 117 getnstimeofday(&dccpw.tstart);
97 return 0; 118 return 0;
98} 119}
99 120
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ecf3be961e11..d0bd34819761 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state)
67 case DCCP_OPEN: 67 case DCCP_OPEN:
68 if (oldstate != DCCP_OPEN) 68 if (oldstate != DCCP_OPEN)
69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB); 69 DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
70 /* Client retransmits all Confirm options until entering OPEN */
71 if (oldstate == DCCP_PARTOPEN)
72 dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
73 break; 70 break;
74 71
75 case DCCP_CLOSED: 72 case DCCP_CLOSED:
@@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name);
178int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) 175int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
179{ 176{
180 struct dccp_sock *dp = dccp_sk(sk); 177 struct dccp_sock *dp = dccp_sk(sk);
178 struct dccp_minisock *dmsk = dccp_msk(sk);
181 struct inet_connection_sock *icsk = inet_csk(sk); 179 struct inet_connection_sock *icsk = inet_csk(sk);
182 180
181 dccp_minisock_init(&dp->dccps_minisock);
182
183 icsk->icsk_rto = DCCP_TIMEOUT_INIT; 183 icsk->icsk_rto = DCCP_TIMEOUT_INIT;
184 icsk->icsk_syn_retries = sysctl_dccp_request_retries; 184 icsk->icsk_syn_retries = sysctl_dccp_request_retries;
185 sk->sk_state = DCCP_CLOSED; 185 sk->sk_state = DCCP_CLOSED;
186 sk->sk_write_space = dccp_write_space; 186 sk->sk_write_space = dccp_write_space;
187 icsk->icsk_sync_mss = dccp_sync_mss; 187 icsk->icsk_sync_mss = dccp_sync_mss;
188 dp->dccps_mss_cache = TCP_MIN_RCVMSS; 188 dp->dccps_mss_cache = 536;
189 dp->dccps_rate_last = jiffies; 189 dp->dccps_rate_last = jiffies;
190 dp->dccps_role = DCCP_ROLE_UNDEFINED; 190 dp->dccps_role = DCCP_ROLE_UNDEFINED;
191 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; 191 dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
192 dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; 192 dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
193 193
194 dccp_init_xmit_timers(sk); 194 dccp_init_xmit_timers(sk);
195 195
196 INIT_LIST_HEAD(&dp->dccps_featneg); 196 /*
197 /* control socket doesn't need feat nego */ 197 * FIXME: We're hardcoding the CCID, and doing this at this point makes
198 if (likely(ctl_sock_initialized)) 198 * the listening (master) sock get CCID control blocks, which is not
199 return dccp_feat_init(sk); 199 * necessary, but for now, to not mess with the test userspace apps,
200 * lets leave it here, later the real solution is to do this in a
201 * setsockopt(CCIDs-I-want/accept). -acme
202 */
203 if (likely(ctl_sock_initialized)) {
204 int rc = dccp_feat_init(dmsk);
205
206 if (rc)
207 return rc;
208
209 if (dmsk->dccpms_send_ack_vector) {
210 dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
211 if (dp->dccps_hc_rx_ackvec == NULL)
212 return -ENOMEM;
213 }
214 dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
215 sk, GFP_KERNEL);
216 dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
217 sk, GFP_KERNEL);
218 if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
219 dp->dccps_hc_tx_ccid == NULL)) {
220 ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
221 ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
222 if (dmsk->dccpms_send_ack_vector) {
223 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
224 dp->dccps_hc_rx_ackvec = NULL;
225 }
226 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
227 return -ENOMEM;
228 }
229 } else {
230 /* control socket doesn't need feat nego */
231 INIT_LIST_HEAD(&dmsk->dccpms_pending);
232 INIT_LIST_HEAD(&dmsk->dccpms_conf);
233 }
234
200 return 0; 235 return 0;
201} 236}
202 237
@@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock);
205void dccp_destroy_sock(struct sock *sk) 240void dccp_destroy_sock(struct sock *sk)
206{ 241{
207 struct dccp_sock *dp = dccp_sk(sk); 242 struct dccp_sock *dp = dccp_sk(sk);
243 struct dccp_minisock *dmsk = dccp_msk(sk);
208 244
209 /* 245 /*
210 * DCCP doesn't use sk_write_queue, just sk_send_head 246 * DCCP doesn't use sk_write_queue, just sk_send_head
@@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk)
222 kfree(dp->dccps_service_list); 258 kfree(dp->dccps_service_list);
223 dp->dccps_service_list = NULL; 259 dp->dccps_service_list = NULL;
224 260
225 if (dp->dccps_hc_rx_ackvec != NULL) { 261 if (dmsk->dccpms_send_ack_vector) {
226 dccp_ackvec_free(dp->dccps_hc_rx_ackvec); 262 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
227 dp->dccps_hc_rx_ackvec = NULL; 263 dp->dccps_hc_rx_ackvec = NULL;
228 } 264 }
@@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk)
231 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; 267 dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
232 268
233 /* clean up feature negotiation state */ 269 /* clean up feature negotiation state */
234 dccp_feat_list_purge(&dp->dccps_featneg); 270 dccp_feat_clean(dmsk);
235} 271}
236 272
237EXPORT_SYMBOL_GPL(dccp_destroy_sock); 273EXPORT_SYMBOL_GPL(dccp_destroy_sock);
@@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog)
241 struct dccp_sock *dp = dccp_sk(sk); 277 struct dccp_sock *dp = dccp_sk(sk);
242 278
243 dp->dccps_role = DCCP_ROLE_LISTEN; 279 dp->dccps_role = DCCP_ROLE_LISTEN;
244 /* do not start to listen if feature negotiation setup fails */
245 if (dccp_feat_finalise_settings(dp))
246 return -EPROTO;
247 return inet_csk_listen_start(sk, backlog); 280 return inet_csk_listen_start(sk, backlog);
248} 281}
249 282
@@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
433 return 0; 466 return 0;
434} 467}
435 468
436static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) 469/* byte 1 is feature. the rest is the preference list */
470static int dccp_setsockopt_change(struct sock *sk, int type,
471 struct dccp_so_feat __user *optval)
437{ 472{
438 u8 *list, len; 473 struct dccp_so_feat opt;
439 int i, rc; 474 u8 *val;
475 int rc;
440 476
441 if (cscov < 0 || cscov > 15) 477 if (copy_from_user(&opt, optval, sizeof(opt)))
442 return -EINVAL; 478 return -EFAULT;
443 /* 479 /*
444 * Populate a list of permissible values, in the range cscov...15. This 480 * rfc4340: 6.1. Change Options
445 * is necessary since feature negotiation of single values only works if
446 * both sides incidentally choose the same value. Since the list starts
447 * lowest-value first, negotiation will pick the smallest shared value.
448 */ 481 */
449 if (cscov == 0) 482 if (opt.dccpsf_len < 1)
450 return 0;
451 len = 16 - cscov;
452
453 list = kmalloc(len, GFP_KERNEL);
454 if (list == NULL)
455 return -ENOBUFS;
456
457 for (i = 0; i < len; i++)
458 list[i] = cscov++;
459
460 rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
461
462 if (rc == 0) {
463 if (rx)
464 dccp_sk(sk)->dccps_pcrlen = cscov;
465 else
466 dccp_sk(sk)->dccps_pcslen = cscov;
467 }
468 kfree(list);
469 return rc;
470}
471
472static int dccp_setsockopt_ccid(struct sock *sk, int type,
473 char __user *optval, int optlen)
474{
475 u8 *val;
476 int rc = 0;
477
478 if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
479 return -EINVAL; 483 return -EINVAL;
480 484
481 val = kmalloc(optlen, GFP_KERNEL); 485 val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
482 if (val == NULL) 486 if (!val)
483 return -ENOMEM; 487 return -ENOMEM;
484 488
485 if (copy_from_user(val, optval, optlen)) { 489 if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
486 kfree(val); 490 rc = -EFAULT;
487 return -EFAULT; 491 goto out_free_val;
488 } 492 }
489 493
490 lock_sock(sk); 494 rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
491 if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) 495 val, opt.dccpsf_len, GFP_KERNEL);
492 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); 496 if (rc)
497 goto out_free_val;
493 498
494 if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) 499out:
495 rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); 500 return rc;
496 release_sock(sk);
497 501
502out_free_val:
498 kfree(val); 503 kfree(val);
499 return rc; 504 goto out;
500} 505}
501 506
502static int do_dccp_setsockopt(struct sock *sk, int level, int optname, 507static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
505 struct dccp_sock *dp = dccp_sk(sk); 510 struct dccp_sock *dp = dccp_sk(sk);
506 int val, err = 0; 511 int val, err = 0;
507 512
508 switch (optname) { 513 if (optlen < sizeof(int))
509 case DCCP_SOCKOPT_PACKET_SIZE:
510 DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
511 return 0;
512 case DCCP_SOCKOPT_CHANGE_L:
513 case DCCP_SOCKOPT_CHANGE_R:
514 DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
515 return 0;
516 case DCCP_SOCKOPT_CCID:
517 case DCCP_SOCKOPT_RX_CCID:
518 case DCCP_SOCKOPT_TX_CCID:
519 return dccp_setsockopt_ccid(sk, optname, optval, optlen);
520 }
521
522 if (optlen < (int)sizeof(int))
523 return -EINVAL; 514 return -EINVAL;
524 515
525 if (get_user(val, (int __user *)optval)) 516 if (get_user(val, (int __user *)optval))
@@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
530 521
531 lock_sock(sk); 522 lock_sock(sk);
532 switch (optname) { 523 switch (optname) {
524 case DCCP_SOCKOPT_PACKET_SIZE:
525 DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
526 err = 0;
527 break;
528 case DCCP_SOCKOPT_CHANGE_L:
529 if (optlen != sizeof(struct dccp_so_feat))
530 err = -EINVAL;
531 else
532 err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
533 (struct dccp_so_feat __user *)
534 optval);
535 break;
536 case DCCP_SOCKOPT_CHANGE_R:
537 if (optlen != sizeof(struct dccp_so_feat))
538 err = -EINVAL;
539 else
540 err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
541 (struct dccp_so_feat __user *)
542 optval);
543 break;
533 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 544 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
534 if (dp->dccps_role != DCCP_ROLE_SERVER) 545 if (dp->dccps_role != DCCP_ROLE_SERVER)
535 err = -EOPNOTSUPP; 546 err = -EOPNOTSUPP;
536 else 547 else
537 dp->dccps_server_timewait = (val != 0); 548 dp->dccps_server_timewait = (val != 0);
538 break; 549 break;
539 case DCCP_SOCKOPT_SEND_CSCOV: 550 case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */
540 err = dccp_setsockopt_cscov(sk, val, false); 551 if (val < 0 || val > 15)
541 break;
542 case DCCP_SOCKOPT_RECV_CSCOV:
543 err = dccp_setsockopt_cscov(sk, val, true);
544 break;
545 case DCCP_SOCKOPT_QPOLICY_ID:
546 if (sk->sk_state != DCCP_CLOSED)
547 err = -EISCONN;
548 else if (val < 0 || val >= DCCPQ_POLICY_MAX)
549 err = -EINVAL; 552 err = -EINVAL;
550 else 553 else
551 dp->dccps_qpolicy = val; 554 dp->dccps_pcslen = val;
552 break; 555 break;
553 case DCCP_SOCKOPT_QPOLICY_TXQLEN: 556 case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */
554 if (val < 0) 557 if (val < 0 || val > 15)
555 err = -EINVAL; 558 err = -EINVAL;
556 else 559 else {
557 dp->dccps_tx_qlen = val; 560 dp->dccps_pcrlen = val;
561 /* FIXME: add feature negotiation,
562 * ChangeL(MinimumChecksumCoverage, val) */
563 }
558 break; 564 break;
559 default: 565 default:
560 err = -ENOPROTOOPT; 566 err = -ENOPROTOOPT;
561 break; 567 break;
562 } 568 }
563 release_sock(sk);
564 569
570 release_sock(sk);
565 return err; 571 return err;
566} 572}
567 573
@@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
642 case DCCP_SOCKOPT_GET_CUR_MPS: 648 case DCCP_SOCKOPT_GET_CUR_MPS:
643 val = dp->dccps_mss_cache; 649 val = dp->dccps_mss_cache;
644 break; 650 break;
645 case DCCP_SOCKOPT_AVAILABLE_CCIDS:
646 return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
647 case DCCP_SOCKOPT_TX_CCID:
648 val = ccid_get_current_tx_ccid(dp);
649 if (val < 0)
650 return -ENOPROTOOPT;
651 break;
652 case DCCP_SOCKOPT_RX_CCID:
653 val = ccid_get_current_rx_ccid(dp);
654 if (val < 0)
655 return -ENOPROTOOPT;
656 break;
657 case DCCP_SOCKOPT_SERVER_TIMEWAIT: 651 case DCCP_SOCKOPT_SERVER_TIMEWAIT:
658 val = dp->dccps_server_timewait; 652 val = dp->dccps_server_timewait;
659 break; 653 break;
@@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
663 case DCCP_SOCKOPT_RECV_CSCOV: 657 case DCCP_SOCKOPT_RECV_CSCOV:
664 val = dp->dccps_pcrlen; 658 val = dp->dccps_pcrlen;
665 break; 659 break;
666 case DCCP_SOCKOPT_QPOLICY_ID:
667 val = dp->dccps_qpolicy;
668 break;
669 case DCCP_SOCKOPT_QPOLICY_TXQLEN:
670 val = dp->dccps_tx_qlen;
671 break;
672 case 128 ... 191: 660 case 128 ... 191:
673 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, 661 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
674 len, (u32 __user *)optval, optlen); 662 len, (u32 __user *)optval, optlen);
@@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
711EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); 699EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
712#endif 700#endif
713 701
714static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
715{
716 struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
717
718 /*
719 * Assign an (opaque) qpolicy priority value to skb->priority.
720 *
721 * We are overloading this skb field for use with the qpolicy subsystem.
722 * The skb->priority is normally used for the SO_PRIORITY option, which
723 * is initialised from sk_priority. Since the assignment of sk_priority
724 * to skb->priority happens later (on layer 3), we overload this field
725 * for use with queueing priorities as long as the skb is on layer 4.
726 * The default priority value (if nothing is set) is 0.
727 */
728 skb->priority = 0;
729
730 for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
731
732 if (!CMSG_OK(msg, cmsg))
733 return -EINVAL;
734
735 if (cmsg->cmsg_level != SOL_DCCP)
736 continue;
737
738 if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
739 !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
740 return -EINVAL;
741
742 switch (cmsg->cmsg_type) {
743 case DCCP_SCM_PRIORITY:
744 if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
745 return -EINVAL;
746 skb->priority = *(__u32 *)CMSG_DATA(cmsg);
747 break;
748 default:
749 return -EINVAL;
750 }
751 }
752 return 0;
753}
754
755int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 702int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
756 size_t len) 703 size_t len)
757{ 704{
@@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
767 714
768 lock_sock(sk); 715 lock_sock(sk);
769 716
770 if (dccp_qpolicy_full(sk)) { 717 if (sysctl_dccp_tx_qlen &&
718 (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
771 rc = -EAGAIN; 719 rc = -EAGAIN;
772 goto out_release; 720 goto out_release;
773 } 721 }
@@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
795 if (rc != 0) 743 if (rc != 0)
796 goto out_discard; 744 goto out_discard;
797 745
798 rc = dccp_msghdr_parse(msg, skb); 746 skb_queue_tail(&sk->sk_write_queue, skb);
799 if (rc != 0) 747 dccp_write_xmit(sk,0);
800 goto out_discard;
801
802 dccp_qpolicy_push(sk, skb);
803 dccp_write_xmit(sk);
804out_release: 748out_release:
805 release_sock(sk); 749 release_sock(sk);
806 return rc ? : len; 750 return rc ? : len;
@@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout)
1023 /* Check zero linger _after_ checking for unread data. */ 967 /* Check zero linger _after_ checking for unread data. */
1024 sk->sk_prot->disconnect(sk, 0); 968 sk->sk_prot->disconnect(sk, 0);
1025 } else if (sk->sk_state != DCCP_CLOSED) { 969 } else if (sk->sk_state != DCCP_CLOSED) {
1026 /*
1027 * Normal connection termination. May need to wait if there are
1028 * still packets in the TX queue that are delayed by the CCID.
1029 */
1030 dccp_flush_write_queue(sk, &timeout);
1031 dccp_terminate_connection(sk); 970 dccp_terminate_connection(sk);
1032 } 971 }
1033 972
1034 /*
1035 * Flush write queue. This may be necessary in several cases:
1036 * - we have been closed by the peer but still have application data;
1037 * - abortive termination (unread data or zero linger time),
1038 * - normal termination but queue could not be flushed within time limit
1039 */
1040 __skb_queue_purge(&sk->sk_write_queue);
1041
1042 sk_stream_wait_close(sk, timeout); 973 sk_stream_wait_close(sk, timeout);
1043 974
1044adjudge_to_death: 975adjudge_to_death:
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
deleted file mode 100644
index 27383f88c75f..000000000000
--- a/net/dccp/qpolicy.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * net/dccp/qpolicy.c
3 *
4 * Policy-based packet dequeueing interface for DCCP.
5 *
6 * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License v2
10 * as published by the Free Software Foundation.
11 */
12#include "dccp.h"
13
14/*
15 * Simple Dequeueing Policy:
16 * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
17 */
18static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
19{
20 skb_queue_tail(&sk->sk_write_queue, skb);
21}
22
23static bool qpolicy_simple_full(struct sock *sk)
24{
25 return dccp_sk(sk)->dccps_tx_qlen &&
26 sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
27}
28
29static struct sk_buff *qpolicy_simple_top(struct sock *sk)
30{
31 return skb_peek(&sk->sk_write_queue);
32}
33
34/*
35 * Priority-based Dequeueing Policy:
36 * If tx_qlen is different from 0 and the queue has reached its upper bound
37 * of tx_qlen elements, replace older packets lowest-priority-first.
38 */
39static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
40{
41 struct sk_buff *skb, *best = NULL;
42
43 skb_queue_walk(&sk->sk_write_queue, skb)
44 if (best == NULL || skb->priority > best->priority)
45 best = skb;
46 return best;
47}
48
49static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
50{
51 struct sk_buff *skb, *worst = NULL;
52
53 skb_queue_walk(&sk->sk_write_queue, skb)
54 if (worst == NULL || skb->priority < worst->priority)
55 worst = skb;
56 return worst;
57}
58
59static bool qpolicy_prio_full(struct sock *sk)
60{
61 if (qpolicy_simple_full(sk))
62 dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
63 return false;
64}
65
66/**
67 * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
68 * @push: add a new @skb to the write queue
69 * @full: indicates that no more packets will be admitted
70 * @top: peeks at whatever the queueing policy defines as its `top'
71 */
72static struct dccp_qpolicy_operations {
73 void (*push) (struct sock *sk, struct sk_buff *skb);
74 bool (*full) (struct sock *sk);
75 struct sk_buff* (*top) (struct sock *sk);
76 __be32 params;
77
78} qpol_table[DCCPQ_POLICY_MAX] = {
79 [DCCPQ_POLICY_SIMPLE] = {
80 .push = qpolicy_simple_push,
81 .full = qpolicy_simple_full,
82 .top = qpolicy_simple_top,
83 .params = 0,
84 },
85 [DCCPQ_POLICY_PRIO] = {
86 .push = qpolicy_simple_push,
87 .full = qpolicy_prio_full,
88 .top = qpolicy_prio_best_skb,
89 .params = DCCP_SCM_PRIORITY,
90 },
91};
92
93/*
94 * Externally visible interface
95 */
96void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
97{
98 qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
99}
100
101bool dccp_qpolicy_full(struct sock *sk)
102{
103 return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
104}
105
106void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
107{
108 if (skb != NULL) {
109 skb_unlink(skb, &sk->sk_write_queue);
110 kfree_skb(skb);
111 }
112}
113
114struct sk_buff *dccp_qpolicy_top(struct sock *sk)
115{
116 return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
117}
118
119struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
120{
121 struct sk_buff *skb = dccp_qpolicy_top(sk);
122
123 /* Clear any skb fields that we used internally */
124 skb->priority = 0;
125
126 if (skb)
127 skb_unlink(skb, &sk->sk_write_queue);
128 return skb;
129}
130
131bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
132{
133 /* check if exactly one bit is set */
134 if (!param || (param & (param - 1)))
135 return false;
136 return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
137}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index a5a1856234e7..21295993fdb8 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -18,72 +18,76 @@
18#error This file should not be compiled without CONFIG_SYSCTL defined 18#error This file should not be compiled without CONFIG_SYSCTL defined
19#endif 19#endif
20 20
21/* Boundary values */
22static int zero = 0,
23 u8_max = 0xFF;
24static unsigned long seqw_min = 32;
25
26static struct ctl_table dccp_default_table[] = { 21static struct ctl_table dccp_default_table[] = {
27 { 22 {
28 .procname = "seq_window", 23 .procname = "seq_window",
29 .data = &sysctl_dccp_sequence_window, 24 .data = &sysctl_dccp_feat_sequence_window,
30 .maxlen = sizeof(sysctl_dccp_sequence_window), 25 .maxlen = sizeof(sysctl_dccp_feat_sequence_window),
31 .mode = 0644, 26 .mode = 0644,
32 .proc_handler = proc_doulongvec_minmax, 27 .proc_handler = proc_dointvec,
33 .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
34 }, 28 },
35 { 29 {
36 .procname = "rx_ccid", 30 .procname = "rx_ccid",
37 .data = &sysctl_dccp_rx_ccid, 31 .data = &sysctl_dccp_feat_rx_ccid,
38 .maxlen = sizeof(sysctl_dccp_rx_ccid), 32 .maxlen = sizeof(sysctl_dccp_feat_rx_ccid),
39 .mode = 0644, 33 .mode = 0644,
40 .proc_handler = proc_dointvec_minmax, 34 .proc_handler = proc_dointvec,
41 .extra1 = &zero,
42 .extra2 = &u8_max, /* RFC 4340, 10. */
43 }, 35 },
44 { 36 {
45 .procname = "tx_ccid", 37 .procname = "tx_ccid",
46 .data = &sysctl_dccp_tx_ccid, 38 .data = &sysctl_dccp_feat_tx_ccid,
47 .maxlen = sizeof(sysctl_dccp_tx_ccid), 39 .maxlen = sizeof(sysctl_dccp_feat_tx_ccid),
40 .mode = 0644,
41 .proc_handler = proc_dointvec,
42 },
43 {
44 .procname = "ack_ratio",
45 .data = &sysctl_dccp_feat_ack_ratio,
46 .maxlen = sizeof(sysctl_dccp_feat_ack_ratio),
47 .mode = 0644,
48 .proc_handler = proc_dointvec,
49 },
50 {
51 .procname = "send_ackvec",
52 .data = &sysctl_dccp_feat_send_ack_vector,
53 .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector),
54 .mode = 0644,
55 .proc_handler = proc_dointvec,
56 },
57 {
58 .procname = "send_ndp",
59 .data = &sysctl_dccp_feat_send_ndp_count,
60 .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count),
48 .mode = 0644, 61 .mode = 0644,
49 .proc_handler = proc_dointvec_minmax, 62 .proc_handler = proc_dointvec,
50 .extra1 = &zero,
51 .extra2 = &u8_max, /* RFC 4340, 10. */
52 }, 63 },
53 { 64 {
54 .procname = "request_retries", 65 .procname = "request_retries",
55 .data = &sysctl_dccp_request_retries, 66 .data = &sysctl_dccp_request_retries,
56 .maxlen = sizeof(sysctl_dccp_request_retries), 67 .maxlen = sizeof(sysctl_dccp_request_retries),
57 .mode = 0644, 68 .mode = 0644,
58 .proc_handler = proc_dointvec_minmax, 69 .proc_handler = proc_dointvec,
59 .extra1 = &zero,
60 .extra2 = &u8_max,
61 }, 70 },
62 { 71 {
63 .procname = "retries1", 72 .procname = "retries1",
64 .data = &sysctl_dccp_retries1, 73 .data = &sysctl_dccp_retries1,
65 .maxlen = sizeof(sysctl_dccp_retries1), 74 .maxlen = sizeof(sysctl_dccp_retries1),
66 .mode = 0644, 75 .mode = 0644,
67 .proc_handler = proc_dointvec_minmax, 76 .proc_handler = proc_dointvec,
68 .extra1 = &zero,
69 .extra2 = &u8_max,
70 }, 77 },
71 { 78 {
72 .procname = "retries2", 79 .procname = "retries2",
73 .data = &sysctl_dccp_retries2, 80 .data = &sysctl_dccp_retries2,
74 .maxlen = sizeof(sysctl_dccp_retries2), 81 .maxlen = sizeof(sysctl_dccp_retries2),
75 .mode = 0644, 82 .mode = 0644,
76 .proc_handler = proc_dointvec_minmax, 83 .proc_handler = proc_dointvec,
77 .extra1 = &zero,
78 .extra2 = &u8_max,
79 }, 84 },
80 { 85 {
81 .procname = "tx_qlen", 86 .procname = "tx_qlen",
82 .data = &sysctl_dccp_tx_qlen, 87 .data = &sysctl_dccp_tx_qlen,
83 .maxlen = sizeof(sysctl_dccp_tx_qlen), 88 .maxlen = sizeof(sysctl_dccp_tx_qlen),
84 .mode = 0644, 89 .mode = 0644,
85 .proc_handler = proc_dointvec_minmax, 90 .proc_handler = proc_dointvec,
86 .extra1 = &zero,
87 }, 91 },
88 { 92 {
89 .procname = "sync_ratelimit", 93 .procname = "sync_ratelimit",
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 16359e29e7f5..54b3c7e9e016 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk)
87{ 87{
88 struct inet_connection_sock *icsk = inet_csk(sk); 88 struct inet_connection_sock *icsk = inet_csk(sk);
89 89
90 /* retransmit timer is used for feature negotiation throughout
91 * connection. In this case, no packet is re-transmitted, but rather an
92 * ack is generated and pending changes are placed into its options.
93 */
94 if (sk->sk_send_head == NULL) {
95 dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
96 if (sk->sk_state == DCCP_OPEN)
97 dccp_send_ack(sk);
98 goto backoff;
99 }
100
90 /* 101 /*
91 * More than 4MSL (8 minutes) has passed, a RESET(aborted) was 102 * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
92 * sent, no need to retransmit, this sock is dead. 103 * sent, no need to retransmit, this sock is dead.
@@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk)
115 return; 126 return;
116 } 127 }
117 128
129backoff:
118 icsk->icsk_backoff++; 130 icsk->icsk_backoff++;
119 131
120 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); 132 icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
@@ -237,35 +249,32 @@ out:
237 sock_put(sk); 249 sock_put(sk);
238} 250}
239 251
240/** 252/* Transmit-delay timer: used by the CCIDs to delay actual send time */
241 * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface 253static void dccp_write_xmit_timer(unsigned long data)
242 * See the comments above %ccid_dequeueing_decision for supported modes.
243 */
244static void dccp_write_xmitlet(unsigned long data)
245{ 254{
246 struct sock *sk = (struct sock *)data; 255 struct sock *sk = (struct sock *)data;
256 struct dccp_sock *dp = dccp_sk(sk);
247 257
248 bh_lock_sock(sk); 258 bh_lock_sock(sk);
249 if (sock_owned_by_user(sk)) 259 if (sock_owned_by_user(sk))
250 sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); 260 sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
251 else 261 else
252 dccp_write_xmit(sk); 262 dccp_write_xmit(sk, 0);
253 bh_unlock_sock(sk); 263 bh_unlock_sock(sk);
264 sock_put(sk);
254} 265}
255 266
256static void dccp_write_xmit_timer(unsigned long data) 267static void dccp_init_write_xmit_timer(struct sock *sk)
257{ 268{
258 dccp_write_xmitlet(data); 269 struct dccp_sock *dp = dccp_sk(sk);
259 sock_put((struct sock *)data); 270
271 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
272 (unsigned long)sk);
260} 273}
261 274
262void dccp_init_xmit_timers(struct sock *sk) 275void dccp_init_xmit_timers(struct sock *sk)
263{ 276{
264 struct dccp_sock *dp = dccp_sk(sk); 277 dccp_init_write_xmit_timer(sk);
265
266 tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
267 setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
268 (unsigned long)sk);
269 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, 278 inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
270 &dccp_keepalive_timer); 279 &dccp_keepalive_timer);
271} 280}
@@ -281,7 +290,8 @@ u32 dccp_timestamp(void)
281{ 290{
282 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); 291 s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
283 292
284 return div_u64(delta, DCCP_TIME_RESOLUTION); 293 do_div(delta, 10);
294 return delta;
285} 295}
286EXPORT_SYMBOL_GPL(dccp_timestamp); 296EXPORT_SYMBOL_GPL(dccp_timestamp);
287 297
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9da9f19ece8a..f79a51607292 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk)
811 } 811 }
812} 812}
813 813
814/* Numbers are taken from RFC3390.
815 *
816 * John Heffner states:
817 *
818 * The RFC specifies a window of no more than 4380 bytes
819 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
820 * is a bit misleading because they use a clamp at 4380 bytes
821 * rather than use a multiplier in the relevant range.
822 */
814__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 823__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
815{ 824{
816 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 825 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
817 826
818 if (!cwnd) 827 if (!cwnd) {
819 cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 828 if (tp->mss_cache > 1460)
829 cwnd = 2;
830 else
831 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
832 }
820 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 833 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
821} 834}
822 835