diff options
author | David S. Miller <davem@davemloft.net> | 2008-09-08 20:28:59 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-09-08 20:28:59 -0400 |
commit | 0a68a20cc3eafa73bb54097c28b921147d7d3685 (patch) | |
tree | 8e5f315226b618cb8e050a0c7653c8ec134501e3 | |
parent | 17dce5dfe38ae2fb359b61e855f5d8a3a8b7892b (diff) | |
parent | a3cbdde8e9c38b66b4f13ac5d6ff1939ded0ff20 (diff) |
Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp
Conflicts:
net/dccp/input.c
net/dccp/options.c
36 files changed, 3971 insertions, 2884 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f8..fcfc12534428 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt | |||
@@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree | |||
45 | 45 | ||
46 | Socket options | 46 | Socket options |
47 | ============== | 47 | ============== |
48 | DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes | ||
49 | a policy ID as argument and can only be set before the connection is established (i.e. changes | ||
50 | during an established connection are not supported). Currently, two policies are | ||
51 | defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, | ||
52 | and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows passing a | ||
53 | u32 priority value as ancillary data to sendmsg(), where higher numbers indicate | ||
54 | a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to | ||
55 | be formatted using a cmsg(3) message header filled in as follows: | ||
56 | cmsg->cmsg_level = SOL_DCCP; | ||
57 | cmsg->cmsg_type = DCCP_SCM_PRIORITY; | ||
58 | cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ | ||
59 | |||
60 | DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero | ||
61 | value is always interpreted as unbounded queue length. If different from zero, | ||
62 | the interpretation of this parameter depends on the current dequeuing policy | ||
63 | (see above): the "simple" policy will enforce a fixed queue size by returning | ||
64 | EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the | ||
65 | lowest-priority packet first. The default value for this parameter is | ||
66 | initialised from /proc/sys/net/dccp/default/tx_qlen. | ||
48 | 67 | ||
49 | DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of | 68 | DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of |
50 | service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, | 69 | service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, |
@@ -57,6 +76,24 @@ can be set before calling bind(). | |||
57 | DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet | 76 | DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet |
58 | size (application payload size) in bytes, see RFC 4340, section 14. | 77 | size (application payload size) in bytes, see RFC 4340, section 14. |
59 | 78 | ||
79 | DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs | ||
80 | supported by the endpoint (see include/linux/dccp.h for symbolic constants). | ||
81 | The caller needs to provide a sufficiently large (> 2) array of type uint8_t. | ||
82 | |||
83 | DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same | ||
84 | time, combining the operation of the next two socket options. This option is | ||
85 | preferable over the latter two, since often applications will use the same | ||
86 | type of CCID for both directions; and mixed use of CCIDs is not currently well | ||
87 | understood. This socket option takes as argument at least one uint8_t value, or | ||
88 | an array of uint8_t values, which must match available CCIDS (see above). CCIDs | ||
89 | must be registered on the socket before calling connect() or listen(). | ||
90 | |||
91 | DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets | ||
92 | the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. | ||
93 | Please note that the getsockopt argument type here is `int', not uint8_t. | ||
94 | |||
95 | DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. | ||
96 | |||
60 | DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold | 97 | DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold |
61 | timewait state when closing the connection (RFC 4340, 8.3). The usual case is | 98 | timewait state when closing the connection (RFC 4340, 8.3). The usual case is |
62 | that the closing server sends a CloseReq, whereupon the client holds timewait | 99 | that the closing server sends a CloseReq, whereupon the client holds timewait |
@@ -115,23 +152,16 @@ retries2 | |||
115 | importance for retransmitted acknowledgments and feature negotiation, | 152 | importance for retransmitted acknowledgments and feature negotiation, |
116 | data packets are never retransmitted. Analogue of tcp_retries2. | 153 | data packets are never retransmitted. Analogue of tcp_retries2. |
117 | 154 | ||
118 | send_ndp = 1 | ||
119 | Whether or not to send NDP count options (sec. 7.7.2). | ||
120 | |||
121 | send_ackvec = 1 | ||
122 | Whether or not to send Ack Vector options (sec. 11.5). | ||
123 | |||
124 | ack_ratio = 2 | ||
125 | The default Ack Ratio (sec. 11.3) to use. | ||
126 | |||
127 | tx_ccid = 2 | 155 | tx_ccid = 2 |
128 | Default CCID for the sender-receiver half-connection. | 156 | Default CCID for the sender-receiver half-connection. Depending on the |
157 | choice of CCID, the Send Ack Vector feature is enabled automatically. | ||
129 | 158 | ||
130 | rx_ccid = 2 | 159 | rx_ccid = 2 |
131 | Default CCID for the receiver-sender half-connection. | 160 | Default CCID for the receiver-sender half-connection; see tx_ccid. |
132 | 161 | ||
133 | seq_window = 100 | 162 | seq_window = 100 |
134 | The initial sequence window (sec. 7.5.2). | 163 | The initial sequence window (sec. 7.5.2) of the sender. This influences |
164 | the local ackno validity and the remote seqno validity windows (7.5.1). | ||
135 | 165 | ||
136 | tx_qlen = 5 | 166 | tx_qlen = 5 |
137 | The size of the transmit buffer in packets. A value of 0 corresponds | 167 | The size of the transmit buffer in packets. A value of 0 corresponds |
diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6080449fbec9..010e2d87ed75 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h | |||
@@ -165,9 +165,13 @@ enum { | |||
165 | DCCPO_TIMESTAMP_ECHO = 42, | 165 | DCCPO_TIMESTAMP_ECHO = 42, |
166 | DCCPO_ELAPSED_TIME = 43, | 166 | DCCPO_ELAPSED_TIME = 43, |
167 | DCCPO_MAX = 45, | 167 | DCCPO_MAX = 45, |
168 | DCCPO_MIN_CCID_SPECIFIC = 128, | 168 | DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ |
169 | DCCPO_MAX_CCID_SPECIFIC = 255, | 169 | DCCPO_MAX_RX_CCID_SPECIFIC = 191, |
170 | DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ | ||
171 | DCCPO_MAX_TX_CCID_SPECIFIC = 255, | ||
170 | }; | 172 | }; |
173 | /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ | ||
174 | #define DCCP_SINGLE_OPT_MAXLEN 253 | ||
171 | 175 | ||
172 | /* DCCP CCIDS */ | 176 | /* DCCP CCIDS */ |
173 | enum { | 177 | enum { |
@@ -176,27 +180,36 @@ enum { | |||
176 | }; | 180 | }; |
177 | 181 | ||
178 | /* DCCP features (RFC 4340 section 6.4) */ | 182 | /* DCCP features (RFC 4340 section 6.4) */ |
179 | enum { | 183 | enum dccp_feature_numbers { |
180 | DCCPF_RESERVED = 0, | 184 | DCCPF_RESERVED = 0, |
181 | DCCPF_CCID = 1, | 185 | DCCPF_CCID = 1, |
182 | DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ | 186 | DCCPF_SHORT_SEQNOS = 2, |
183 | DCCPF_SEQUENCE_WINDOW = 3, | 187 | DCCPF_SEQUENCE_WINDOW = 3, |
184 | DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ | 188 | DCCPF_ECN_INCAPABLE = 4, |
185 | DCCPF_ACK_RATIO = 5, | 189 | DCCPF_ACK_RATIO = 5, |
186 | DCCPF_SEND_ACK_VECTOR = 6, | 190 | DCCPF_SEND_ACK_VECTOR = 6, |
187 | DCCPF_SEND_NDP_COUNT = 7, | 191 | DCCPF_SEND_NDP_COUNT = 7, |
188 | DCCPF_MIN_CSUM_COVER = 8, | 192 | DCCPF_MIN_CSUM_COVER = 8, |
189 | DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ | 193 | DCCPF_DATA_CHECKSUM = 9, |
190 | /* 10-127 reserved */ | 194 | /* 10-127 reserved */ |
191 | DCCPF_MIN_CCID_SPECIFIC = 128, | 195 | DCCPF_MIN_CCID_SPECIFIC = 128, |
196 | DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ | ||
192 | DCCPF_MAX_CCID_SPECIFIC = 255, | 197 | DCCPF_MAX_CCID_SPECIFIC = 255, |
193 | }; | 198 | }; |
194 | 199 | ||
195 | /* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ | 200 | /* DCCP socket control message types for cmsg */ |
196 | struct dccp_so_feat { | 201 | enum dccp_cmsg_type { |
197 | __u8 dccpsf_feat; | 202 | DCCP_SCM_PRIORITY = 1, |
198 | __u8 __user *dccpsf_val; | 203 | DCCP_SCM_QPOLICY_MAX = 0xFFFF, |
199 | __u8 dccpsf_len; | 204 | /* ^-- Up to here reserved exclusively for qpolicy parameters */ |
205 | DCCP_SCM_MAX | ||
206 | }; | ||
207 | |||
208 | /* DCCP priorities for outgoing/queued packets */ | ||
209 | enum dccp_packet_dequeueing_policy { | ||
210 | DCCPQ_POLICY_SIMPLE, | ||
211 | DCCPQ_POLICY_PRIO, | ||
212 | DCCPQ_POLICY_MAX | ||
200 | }; | 213 | }; |
201 | 214 | ||
202 | /* DCCP socket options */ | 215 | /* DCCP socket options */ |
@@ -208,6 +221,12 @@ struct dccp_so_feat { | |||
208 | #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 | 221 | #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 |
209 | #define DCCP_SOCKOPT_SEND_CSCOV 10 | 222 | #define DCCP_SOCKOPT_SEND_CSCOV 10 |
210 | #define DCCP_SOCKOPT_RECV_CSCOV 11 | 223 | #define DCCP_SOCKOPT_RECV_CSCOV 11 |
224 | #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 | ||
225 | #define DCCP_SOCKOPT_CCID 13 | ||
226 | #define DCCP_SOCKOPT_TX_CCID 14 | ||
227 | #define DCCP_SOCKOPT_RX_CCID 15 | ||
228 | #define DCCP_SOCKOPT_QPOLICY_ID 16 | ||
229 | #define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 | ||
211 | #define DCCP_SOCKOPT_CCID_RX_INFO 128 | 230 | #define DCCP_SOCKOPT_CCID_RX_INFO 128 |
212 | #define DCCP_SOCKOPT_CCID_TX_INFO 192 | 231 | #define DCCP_SOCKOPT_CCID_TX_INFO 192 |
213 | 232 | ||
@@ -355,62 +374,13 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) | |||
355 | return __dccp_hdr_len(dccp_hdr(skb)); | 374 | return __dccp_hdr_len(dccp_hdr(skb)); |
356 | } | 375 | } |
357 | 376 | ||
358 | |||
359 | /* initial values for each feature */ | ||
360 | #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 | ||
361 | #define DCCPF_INITIAL_ACK_RATIO 2 | ||
362 | #define DCCPF_INITIAL_CCID DCCPC_CCID2 | ||
363 | #define DCCPF_INITIAL_SEND_ACK_VECTOR 1 | ||
364 | /* FIXME: for now we're default to 1 but it should really be 0 */ | ||
365 | #define DCCPF_INITIAL_SEND_NDP_COUNT 1 | ||
366 | |||
367 | /** | ||
368 | * struct dccp_minisock - Minimal DCCP connection representation | ||
369 | * | ||
370 | * Will be used to pass the state from dccp_request_sock to dccp_sock. | ||
371 | * | ||
372 | * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) | ||
373 | * @dccpms_ccid - Congestion Control Id (CCID) (section 10) | ||
374 | * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) | ||
375 | * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) | ||
376 | * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) | ||
377 | * @dccpms_pending - List of features being negotiated | ||
378 | * @dccpms_conf - | ||
379 | */ | ||
380 | struct dccp_minisock { | ||
381 | __u64 dccpms_sequence_window; | ||
382 | __u8 dccpms_rx_ccid; | ||
383 | __u8 dccpms_tx_ccid; | ||
384 | __u8 dccpms_send_ack_vector; | ||
385 | __u8 dccpms_send_ndp_count; | ||
386 | __u8 dccpms_ack_ratio; | ||
387 | struct list_head dccpms_pending; | ||
388 | struct list_head dccpms_conf; | ||
389 | }; | ||
390 | |||
391 | struct dccp_opt_conf { | ||
392 | __u8 *dccpoc_val; | ||
393 | __u8 dccpoc_len; | ||
394 | }; | ||
395 | |||
396 | struct dccp_opt_pend { | ||
397 | struct list_head dccpop_node; | ||
398 | __u8 dccpop_type; | ||
399 | __u8 dccpop_feat; | ||
400 | __u8 *dccpop_val; | ||
401 | __u8 dccpop_len; | ||
402 | int dccpop_conf; | ||
403 | struct dccp_opt_conf *dccpop_sc; | ||
404 | }; | ||
405 | |||
406 | extern void dccp_minisock_init(struct dccp_minisock *dmsk); | ||
407 | |||
408 | /** | 377 | /** |
409 | * struct dccp_request_sock - represent DCCP-specific connection request | 378 | * struct dccp_request_sock - represent DCCP-specific connection request |
410 | * @dreq_inet_rsk: structure inherited from | 379 | * @dreq_inet_rsk: structure inherited from |
411 | * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) | 380 | * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) |
412 | * @dreq_isr: initial sequence number received on the Request | 381 | * @dreq_isr: initial sequence number received on the Request |
413 | * @dreq_service: service code present on the Request (there is just one) | 382 | * @dreq_service: service code present on the Request (there is just one) |
383 | * @dreq_featneg: feature negotiation options for this connection | ||
414 | * The following two fields are analogous to the ones in dccp_sock: | 384 | * The following two fields are analogous to the ones in dccp_sock: |
415 | * @dreq_timestamp_echo: last received timestamp to echo (13.1) | 385 | * @dreq_timestamp_echo: last received timestamp to echo (13.1) |
416 | * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo | 386 | * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo |
@@ -420,6 +390,7 @@ struct dccp_request_sock { | |||
420 | __u64 dreq_iss; | 390 | __u64 dreq_iss; |
421 | __u64 dreq_isr; | 391 | __u64 dreq_isr; |
422 | __be32 dreq_service; | 392 | __be32 dreq_service; |
393 | struct list_head dreq_featneg; | ||
423 | __u32 dreq_timestamp_echo; | 394 | __u32 dreq_timestamp_echo; |
424 | __u32 dreq_timestamp_time; | 395 | __u32 dreq_timestamp_time; |
425 | }; | 396 | }; |
@@ -491,21 +462,28 @@ struct dccp_ackvec; | |||
491 | * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo | 462 | * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo |
492 | * @dccps_l_ack_ratio - feature-local Ack Ratio | 463 | * @dccps_l_ack_ratio - feature-local Ack Ratio |
493 | * @dccps_r_ack_ratio - feature-remote Ack Ratio | 464 | * @dccps_r_ack_ratio - feature-remote Ack Ratio |
465 | * @dccps_l_seq_win - local Sequence Window (influences ack number validity) | ||
466 | * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) | ||
494 | * @dccps_pcslen - sender partial checksum coverage (via sockopt) | 467 | * @dccps_pcslen - sender partial checksum coverage (via sockopt) |
495 | * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) | 468 | * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) |
469 | * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) | ||
496 | * @dccps_ndp_count - number of Non Data Packets since last data packet | 470 | * @dccps_ndp_count - number of Non Data Packets since last data packet |
497 | * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) | 471 | * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) |
498 | * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) | 472 | * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) |
499 | * @dccps_minisock - associated minisock (accessed via dccp_msk) | 473 | * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) |
500 | * @dccps_hc_rx_ackvec - rx half connection ack vector | 474 | * @dccps_hc_rx_ackvec - rx half connection ack vector |
501 | * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) | 475 | * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) |
502 | * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) | 476 | * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) |
503 | * @dccps_options_received - parsed set of retrieved options | 477 | * @dccps_options_received - parsed set of retrieved options |
478 | * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy | ||
479 | * @dccps_tx_qlen - maximum length of the TX queue | ||
504 | * @dccps_role - role of this sock, one of %dccp_role | 480 | * @dccps_role - role of this sock, one of %dccp_role |
505 | * @dccps_hc_rx_insert_options - receiver wants to add options when acking | 481 | * @dccps_hc_rx_insert_options - receiver wants to add options when acking |
506 | * @dccps_hc_tx_insert_options - sender wants to add options when sending | 482 | * @dccps_hc_tx_insert_options - sender wants to add options when sending |
507 | * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) | 483 | * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) |
508 | * @dccps_xmit_timer - timer for when CCID is not ready to send | 484 | * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" |
485 | * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets | ||
486 | * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) | ||
509 | * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) | 487 | * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) |
510 | */ | 488 | */ |
511 | struct dccp_sock { | 489 | struct dccp_sock { |
@@ -529,19 +507,26 @@ struct dccp_sock { | |||
529 | __u32 dccps_timestamp_time; | 507 | __u32 dccps_timestamp_time; |
530 | __u16 dccps_l_ack_ratio; | 508 | __u16 dccps_l_ack_ratio; |
531 | __u16 dccps_r_ack_ratio; | 509 | __u16 dccps_r_ack_ratio; |
532 | __u16 dccps_pcslen; | 510 | __u64 dccps_l_seq_win:48; |
533 | __u16 dccps_pcrlen; | 511 | __u64 dccps_r_seq_win:48; |
512 | __u8 dccps_pcslen:4; | ||
513 | __u8 dccps_pcrlen:4; | ||
514 | __u8 dccps_send_ndp_count:1; | ||
534 | __u64 dccps_ndp_count:48; | 515 | __u64 dccps_ndp_count:48; |
535 | unsigned long dccps_rate_last; | 516 | unsigned long dccps_rate_last; |
536 | struct dccp_minisock dccps_minisock; | 517 | struct list_head dccps_featneg; |
537 | struct dccp_ackvec *dccps_hc_rx_ackvec; | 518 | struct dccp_ackvec *dccps_hc_rx_ackvec; |
538 | struct ccid *dccps_hc_rx_ccid; | 519 | struct ccid *dccps_hc_rx_ccid; |
539 | struct ccid *dccps_hc_tx_ccid; | 520 | struct ccid *dccps_hc_tx_ccid; |
540 | struct dccp_options_received dccps_options_received; | 521 | struct dccp_options_received dccps_options_received; |
522 | __u8 dccps_qpolicy; | ||
523 | __u32 dccps_tx_qlen; | ||
541 | enum dccp_role dccps_role:2; | 524 | enum dccp_role dccps_role:2; |
542 | __u8 dccps_hc_rx_insert_options:1; | 525 | __u8 dccps_hc_rx_insert_options:1; |
543 | __u8 dccps_hc_tx_insert_options:1; | 526 | __u8 dccps_hc_tx_insert_options:1; |
544 | __u8 dccps_server_timewait:1; | 527 | __u8 dccps_server_timewait:1; |
528 | __u8 dccps_sync_scheduled:1; | ||
529 | struct tasklet_struct dccps_xmitlet; | ||
545 | struct timer_list dccps_xmit_timer; | 530 | struct timer_list dccps_xmit_timer; |
546 | }; | 531 | }; |
547 | 532 | ||
@@ -550,11 +535,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) | |||
550 | return (struct dccp_sock *)sk; | 535 | return (struct dccp_sock *)sk; |
551 | } | 536 | } |
552 | 537 | ||
553 | static inline struct dccp_minisock *dccp_msk(const struct sock *sk) | ||
554 | { | ||
555 | return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; | ||
556 | } | ||
557 | |||
558 | static inline const char *dccp_role(const struct sock *sk) | 538 | static inline const char *dccp_role(const struct sock *sk) |
559 | { | 539 | { |
560 | switch (dccp_sk(sk)->dccps_role) { | 540 | switch (dccp_sk(sk)->dccps_role) { |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386356a5..6bc4b8148ca0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) | |||
782 | /* Use define here intentionally to get WARN_ON location shown at the caller */ | 782 | /* Use define here intentionally to get WARN_ON location shown at the caller */ |
783 | #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) | 783 | #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) |
784 | 784 | ||
785 | /* | ||
786 | * Convert RFC3390 larger initial windows into an equivalent number of packets. | ||
787 | * | ||
788 | * John Heffner states: | ||
789 | * | ||
790 | * The RFC specifies a window of no more than 4380 bytes | ||
791 | * unless 2*MSS > 4380. Reading the pseudocode in the RFC | ||
792 | * is a bit misleading because they use a clamp at 4380 bytes | ||
793 | * rather than a multiplier in the relevant range. | ||
794 | */ | ||
795 | static inline u32 rfc3390_bytes_to_packets(const u32 bytes) | ||
796 | { | ||
797 | return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); | ||
798 | } | ||
799 | |||
785 | extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); | 800 | extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); |
786 | extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); | 801 | extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); |
787 | 802 | ||
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 7aa2a7acc7ec..206c16ad9c3c 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig | |||
@@ -25,9 +25,6 @@ config INET_DCCP_DIAG | |||
25 | def_tristate y if (IP_DCCP = y && INET_DIAG = y) | 25 | def_tristate y if (IP_DCCP = y && INET_DIAG = y) |
26 | def_tristate m | 26 | def_tristate m |
27 | 27 | ||
28 | config IP_DCCP_ACKVEC | ||
29 | bool | ||
30 | |||
31 | source "net/dccp/ccids/Kconfig" | 28 | source "net/dccp/ccids/Kconfig" |
32 | 29 | ||
33 | menu "DCCP Kernel Hacking" | 30 | menu "DCCP Kernel Hacking" |
diff --git a/net/dccp/Makefile b/net/dccp/Makefile index f4f8793aafff..0c1c9af2bf7e 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile | |||
@@ -1,6 +1,7 @@ | |||
1 | obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o | 1 | obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o |
2 | 2 | ||
3 | dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o | 3 | dccp-y := ccid.o feat.o input.o minisocks.o options.o \ |
4 | qpolicy.o output.o proto.o timer.o ackvec.o | ||
4 | 5 | ||
5 | dccp_ipv4-y := ipv4.o | 6 | dccp_ipv4-y := ipv4.o |
6 | 7 | ||
@@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o | |||
8 | obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o | 9 | obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o |
9 | dccp_ipv6-y := ipv6.o | 10 | dccp_ipv6-y := ipv6.o |
10 | 11 | ||
11 | dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o | ||
12 | |||
13 | obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o | 12 | obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o |
14 | obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o | 13 | obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o |
15 | 14 | ||
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad15..41819848bdda 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c | |||
@@ -1,445 +1,375 @@ | |||
1 | /* | 1 | /* |
2 | * net/dccp/ackvec.c | 2 | * net/dccp/ackvec.c |
3 | * | 3 | * |
4 | * An implementation of the DCCP protocol | 4 | * An implementation of Ack Vectors for the DCCP protocol |
5 | * Copyright (c) 2007 University of Aberdeen, Scotland, UK | ||
5 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> | 6 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> |
6 | * | 7 | * |
7 | * This program is free software; you can redistribute it and/or modify it | 8 | * This program is free software; you can redistribute it and/or modify it |
8 | * under the terms of the GNU General Public License as published by the | 9 | * under the terms of the GNU General Public License as published by the |
9 | * Free Software Foundation; version 2 of the License; | 10 | * Free Software Foundation; version 2 of the License; |
10 | */ | 11 | */ |
11 | |||
12 | #include "ackvec.h" | ||
13 | #include "dccp.h" | 12 | #include "dccp.h" |
14 | |||
15 | #include <linux/dccp.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/errno.h> | ||
18 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
19 | #include <linux/skbuff.h> | ||
20 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
21 | 15 | ||
22 | #include <net/sock.h> | ||
23 | |||
24 | static struct kmem_cache *dccp_ackvec_slab; | 16 | static struct kmem_cache *dccp_ackvec_slab; |
25 | static struct kmem_cache *dccp_ackvec_record_slab; | 17 | static struct kmem_cache *dccp_ackvec_record_slab; |
26 | 18 | ||
27 | static struct dccp_ackvec_record *dccp_ackvec_record_new(void) | 19 | struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) |
28 | { | 20 | { |
29 | struct dccp_ackvec_record *avr = | 21 | struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); |
30 | kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); | ||
31 | 22 | ||
32 | if (avr != NULL) | 23 | if (av != NULL) { |
33 | INIT_LIST_HEAD(&avr->avr_node); | 24 | av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; |
34 | 25 | INIT_LIST_HEAD(&av->av_records); | |
35 | return avr; | 26 | } |
27 | return av; | ||
36 | } | 28 | } |
37 | 29 | ||
38 | static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) | 30 | static void dccp_ackvec_purge_records(struct dccp_ackvec *av) |
39 | { | 31 | { |
40 | if (unlikely(avr == NULL)) | 32 | struct dccp_ackvec_record *cur, *next; |
41 | return; | 33 | |
42 | /* Check if deleting a linked record */ | 34 | list_for_each_entry_safe(cur, next, &av->av_records, avr_node) |
43 | WARN_ON(!list_empty(&avr->avr_node)); | 35 | kmem_cache_free(dccp_ackvec_record_slab, cur); |
44 | kmem_cache_free(dccp_ackvec_record_slab, avr); | 36 | INIT_LIST_HEAD(&av->av_records); |
45 | } | 37 | } |
46 | 38 | ||
47 | static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, | 39 | void dccp_ackvec_free(struct dccp_ackvec *av) |
48 | struct dccp_ackvec_record *avr) | ||
49 | { | 40 | { |
50 | /* | 41 | if (likely(av != NULL)) { |
51 | * AVRs are sorted by seqno. Since we are sending them in order, we | 42 | dccp_ackvec_purge_records(av); |
52 | * just add the AVR at the head of the list. | 43 | kmem_cache_free(dccp_ackvec_slab, av); |
53 | * -sorbo. | ||
54 | */ | ||
55 | if (!list_empty(&av->av_records)) { | ||
56 | const struct dccp_ackvec_record *head = | ||
57 | list_entry(av->av_records.next, | ||
58 | struct dccp_ackvec_record, | ||
59 | avr_node); | ||
60 | BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); | ||
61 | } | 44 | } |
62 | |||
63 | list_add(&avr->avr_node, &av->av_records); | ||
64 | } | 45 | } |
65 | 46 | ||
66 | int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) | 47 | /** |
48 | * dccp_ackvec_update_records - Record information about sent Ack Vectors | ||
49 | * @av: Ack Vector records to update | ||
50 | * @seqno: Sequence number of the packet carrying the Ack Vector just sent | ||
51 | * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector | ||
52 | */ | ||
53 | int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) | ||
67 | { | 54 | { |
68 | struct dccp_sock *dp = dccp_sk(sk); | ||
69 | struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; | ||
70 | /* Figure out how many options do we need to represent the ackvec */ | ||
71 | const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); | ||
72 | u16 len = av->av_vec_len + 2 * nr_opts, i; | ||
73 | u32 elapsed_time; | ||
74 | const unsigned char *tail, *from; | ||
75 | unsigned char *to; | ||
76 | struct dccp_ackvec_record *avr; | 55 | struct dccp_ackvec_record *avr; |
77 | suseconds_t delta; | ||
78 | |||
79 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) | ||
80 | return -1; | ||
81 | |||
82 | delta = ktime_us_delta(ktime_get_real(), av->av_time); | ||
83 | elapsed_time = delta / 10; | ||
84 | 56 | ||
85 | if (elapsed_time != 0 && | 57 | avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); |
86 | dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) | ||
87 | return -1; | ||
88 | |||
89 | avr = dccp_ackvec_record_new(); | ||
90 | if (avr == NULL) | 58 | if (avr == NULL) |
91 | return -1; | 59 | return -ENOBUFS; |
92 | |||
93 | DCCP_SKB_CB(skb)->dccpd_opt_len += len; | ||
94 | |||
95 | to = skb_push(skb, len); | ||
96 | len = av->av_vec_len; | ||
97 | from = av->av_buf + av->av_buf_head; | ||
98 | tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; | ||
99 | |||
100 | for (i = 0; i < nr_opts; ++i) { | ||
101 | int copylen = len; | ||
102 | |||
103 | if (len > DCCP_MAX_ACKVEC_OPT_LEN) | ||
104 | copylen = DCCP_MAX_ACKVEC_OPT_LEN; | ||
105 | |||
106 | *to++ = DCCPO_ACK_VECTOR_0; | ||
107 | *to++ = copylen + 2; | ||
108 | |||
109 | /* Check if buf_head wraps */ | ||
110 | if (from + copylen > tail) { | ||
111 | const u16 tailsize = tail - from; | ||
112 | |||
113 | memcpy(to, from, tailsize); | ||
114 | to += tailsize; | ||
115 | len -= tailsize; | ||
116 | copylen -= tailsize; | ||
117 | from = av->av_buf; | ||
118 | } | ||
119 | |||
120 | memcpy(to, from, copylen); | ||
121 | from += copylen; | ||
122 | to += copylen; | ||
123 | len -= copylen; | ||
124 | } | ||
125 | 60 | ||
61 | avr->avr_ack_seqno = seqno; | ||
62 | avr->avr_ack_ptr = av->av_buf_head; | ||
63 | avr->avr_ack_ackno = av->av_buf_ackno; | ||
64 | avr->avr_ack_nonce = nonce_sum; | ||
65 | avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); | ||
126 | /* | 66 | /* |
127 | * From RFC 4340, A.2: | 67 | * When the buffer overflows, we keep no more than one record. This is |
128 | * | 68 | * the simplest way of disambiguating sender-Acks dating from before the |
129 | * For each acknowledgement it sends, the HC-Receiver will add an | 69 | * overflow from sender-Acks which refer to after the overflow; a simple |
130 | * acknowledgement record. ack_seqno will equal the HC-Receiver | 70 | * solution is preferable here since we are handling an exception. |
131 | * sequence number it used for the ack packet; ack_ptr will equal | ||
132 | * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will | ||
133 | * equal buf_nonce. | ||
134 | */ | 71 | */ |
135 | avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; | 72 | if (av->av_overflow) |
136 | avr->avr_ack_ptr = av->av_buf_head; | 73 | dccp_ackvec_purge_records(av); |
137 | avr->avr_ack_ackno = av->av_buf_ackno; | 74 | /* |
138 | avr->avr_ack_nonce = av->av_buf_nonce; | 75 | * Since GSS is incremented for each packet, the list is automatically |
139 | avr->avr_sent_len = av->av_vec_len; | 76 | * arranged in descending order of @ack_seqno. |
140 | 77 | */ | |
141 | dccp_ackvec_insert_avr(av, avr); | 78 | list_add(&avr->avr_node, &av->av_records); |
142 | 79 | ||
143 | dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " | 80 | dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", |
144 | "ack_ackno=%llu\n", | ||
145 | dccp_role(sk), avr->avr_sent_len, | ||
146 | (unsigned long long)avr->avr_ack_seqno, | 81 | (unsigned long long)avr->avr_ack_seqno, |
147 | (unsigned long long)avr->avr_ack_ackno); | 82 | (unsigned long long)avr->avr_ack_ackno, |
83 | avr->avr_ack_runlen); | ||
148 | return 0; | 84 | return 0; |
149 | } | 85 | } |
150 | 86 | ||
151 | struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) | 87 | static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, |
88 | const u64 ackno) | ||
152 | { | 89 | { |
153 | struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); | 90 | struct dccp_ackvec_record *avr; |
154 | 91 | /* | |
155 | if (av != NULL) { | 92 | * Exploit that records are inserted in descending order of sequence |
156 | av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; | 93 | * number, start with the oldest record first. If @ackno is `before' |
157 | av->av_buf_ackno = UINT48_MAX + 1; | 94 | * the earliest ack_ackno, the packet is too old to be considered. |
158 | av->av_buf_nonce = 0; | 95 | */ |
159 | av->av_time = ktime_set(0, 0); | 96 | list_for_each_entry_reverse(avr, av_list, avr_node) { |
160 | av->av_vec_len = 0; | 97 | if (avr->avr_ack_seqno == ackno) |
161 | INIT_LIST_HEAD(&av->av_records); | 98 | return avr; |
99 | if (before48(ackno, avr->avr_ack_seqno)) | ||
100 | break; | ||
162 | } | 101 | } |
163 | 102 | return NULL; | |
164 | return av; | ||
165 | } | 103 | } |
166 | 104 | ||
167 | void dccp_ackvec_free(struct dccp_ackvec *av) | 105 | /* |
106 | * Buffer index and length computation using modulo-buffersize arithmetic. | ||
107 | * Note that, as pointers move from right to left, head is `before' tail. | ||
108 | */ | ||
109 | static inline u16 __ackvec_idx_add(const u16 a, const u16 b) | ||
168 | { | 110 | { |
169 | if (unlikely(av == NULL)) | 111 | return (a + b) % DCCPAV_MAX_ACKVEC_LEN; |
170 | return; | ||
171 | |||
172 | if (!list_empty(&av->av_records)) { | ||
173 | struct dccp_ackvec_record *avr, *next; | ||
174 | |||
175 | list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { | ||
176 | list_del_init(&avr->avr_node); | ||
177 | dccp_ackvec_record_delete(avr); | ||
178 | } | ||
179 | } | ||
180 | |||
181 | kmem_cache_free(dccp_ackvec_slab, av); | ||
182 | } | 112 | } |
183 | 113 | ||
184 | static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, | 114 | static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) |
185 | const u32 index) | ||
186 | { | 115 | { |
187 | return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; | 116 | return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); |
188 | } | 117 | } |
189 | 118 | ||
190 | static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, | 119 | u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) |
191 | const u32 index) | ||
192 | { | 120 | { |
193 | return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; | 121 | if (unlikely(av->av_overflow)) |
122 | return DCCPAV_MAX_ACKVEC_LEN; | ||
123 | return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); | ||
194 | } | 124 | } |
195 | 125 | ||
196 | /* | 126 | /** |
197 | * If several packets are missing, the HC-Receiver may prefer to enter multiple | 127 | * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 |
198 | * bytes with run length 0, rather than a single byte with a larger run length; | 128 | * @av: non-empty buffer to update |
199 | * this simplifies table updates if one of the missing packets arrives. | 129 | * @distance: negative or zero distance of @seqno from buf_ackno downward |
130 | * @seqno: the (old) sequence number whose record is to be updated | ||
131 | * @state: state in which packet carrying @seqno was received | ||
200 | */ | 132 | */ |
201 | static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, | 133 | static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, |
202 | const unsigned int packets, | 134 | u64 seqno, enum dccp_ackvec_states state) |
203 | const unsigned char state) | ||
204 | { | 135 | { |
205 | unsigned int gap; | 136 | u16 ptr = av->av_buf_head; |
206 | long new_head; | ||
207 | 137 | ||
208 | if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) | 138 | BUG_ON(distance > 0); |
209 | return -ENOBUFS; | 139 | if (unlikely(dccp_ackvec_is_empty(av))) |
140 | return; | ||
210 | 141 | ||
211 | gap = packets - 1; | 142 | do { |
212 | new_head = av->av_buf_head - packets; | 143 | u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); |
213 | 144 | ||
214 | if (new_head < 0) { | 145 | if (distance + runlen >= 0) { |
215 | if (gap > 0) { | 146 | /* |
216 | memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, | 147 | * Only update the state if packet has not been received |
217 | gap + new_head + 1); | 148 | * yet. This is OK as per the second table in RFC 4340, |
218 | gap = -new_head; | 149 | * 11.4.1; i.e. here we are using the following table: |
150 | * RECEIVED | ||
151 | * 0 1 3 | ||
152 | * S +---+---+---+ | ||
153 | * T 0 | 0 | 0 | 0 | | ||
154 | * O +---+---+---+ | ||
155 | * R 1 | 1 | 1 | 1 | | ||
156 | * E +---+---+---+ | ||
157 | * D 3 | 0 | 1 | 3 | | ||
158 | * +---+---+---+ | ||
159 | * The "Not Received" state was set by reserve_seats(). | ||
160 | */ | ||
161 | if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) | ||
162 | av->av_buf[ptr] = state; | ||
163 | else | ||
164 | dccp_pr_debug("Not changing %llu state to %u\n", | ||
165 | (unsigned long long)seqno, state); | ||
166 | break; | ||
219 | } | 167 | } |
220 | new_head += DCCP_MAX_ACKVEC_LEN; | ||
221 | } | ||
222 | 168 | ||
223 | av->av_buf_head = new_head; | 169 | distance += runlen + 1; |
170 | ptr = __ackvec_idx_add(ptr, 1); | ||
224 | 171 | ||
225 | if (gap > 0) | 172 | } while (ptr != av->av_buf_tail); |
226 | memset(av->av_buf + av->av_buf_head + 1, | 173 | } |
227 | DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); | ||
228 | 174 | ||
229 | av->av_buf[av->av_buf_head] = state; | 175 | /* Mark @num entries after buf_head as "Not yet received". */ |
230 | av->av_vec_len += packets; | 176 | static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) |
231 | return 0; | 177 | { |
178 | u16 start = __ackvec_idx_add(av->av_buf_head, 1), | ||
179 | len = DCCPAV_MAX_ACKVEC_LEN - start; | ||
180 | |||
181 | /* check for buffer wrap-around */ | ||
182 | if (num > len) { | ||
183 | memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); | ||
184 | start = 0; | ||
185 | num -= len; | ||
186 | } | ||
187 | if (num) | ||
188 | memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); | ||
232 | } | 189 | } |
233 | 190 | ||
234 | /* | 191 | /** |
235 | * Implements the RFC 4340, Appendix A | 192 | * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer |
193 | * @av: container of buffer to update (can be empty or non-empty) | ||
194 | * @num_packets: number of packets to register (must be >= 1) | ||
195 | * @seqno: sequence number of the first packet in @num_packets | ||
196 | * @state: state in which packet carrying @seqno was received | ||
236 | */ | 197 | */ |
237 | int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, | 198 | static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, |
238 | const u64 ackno, const u8 state) | 199 | u64 seqno, enum dccp_ackvec_states state) |
239 | { | 200 | { |
240 | /* | 201 | u32 num_cells = num_packets; |
241 | * Check at the right places if the buffer is full, if it is, tell the | ||
242 | * caller to start dropping packets till the HC-Sender acks our ACK | ||
243 | * vectors, when we will free up space in av_buf. | ||
244 | * | ||
245 | * We may well decide to do buffer compression, etc, but for now lets | ||
246 | * just drop. | ||
247 | * | ||
248 | * From Appendix A.1.1 (`New Packets'): | ||
249 | * | ||
250 | * Of course, the circular buffer may overflow, either when the | ||
251 | * HC-Sender is sending data at a very high rate, when the | ||
252 | * HC-Receiver's acknowledgements are not reaching the HC-Sender, | ||
253 | * or when the HC-Sender is forgetting to acknowledge those acks | ||
254 | * (so the HC-Receiver is unable to clean up old state). In this | ||
255 | * case, the HC-Receiver should either compress the buffer (by | ||
256 | * increasing run lengths when possible), transfer its state to | ||
257 | * a larger buffer, or, as a last resort, drop all received | ||
258 | * packets, without processing them whatsoever, until its buffer | ||
259 | * shrinks again. | ||
260 | */ | ||
261 | 202 | ||
262 | /* See if this is the first ackno being inserted */ | 203 | if (num_packets > DCCPAV_BURST_THRESH) { |
263 | if (av->av_vec_len == 0) { | 204 | u32 lost_packets = num_packets - 1; |
264 | av->av_buf[av->av_buf_head] = state; | ||
265 | av->av_vec_len = 1; | ||
266 | } else if (after48(ackno, av->av_buf_ackno)) { | ||
267 | const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); | ||
268 | 205 | ||
206 | DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); | ||
269 | /* | 207 | /* |
270 | * Look if the state of this packet is the same as the | 208 | * We received 1 packet and have a loss of size "num_packets-1" |
271 | * previous ackno and if so if we can bump the head len. | 209 | * which we squeeze into num_cells-1 rather than reserving an |
210 | * entire byte for each lost packet. | ||
211 | * The reason is that the vector grows in O(burst_length); when | ||
212 | * it grows too large there will no room left for the payload. | ||
213 | * This is a trade-off: if a few packets out of the burst show | ||
214 | * up later, their state will not be changed; it is simply too | ||
215 | * costly to reshuffle/reallocate/copy the buffer each time. | ||
216 | * Should such problems persist, we will need to switch to a | ||
217 | * different underlying data structure. | ||
272 | */ | 218 | */ |
273 | if (delta == 1 && | 219 | for (num_packets = num_cells = 1; lost_packets; ++num_cells) { |
274 | dccp_ackvec_state(av, av->av_buf_head) == state && | 220 | u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); |
275 | dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) | ||
276 | av->av_buf[av->av_buf_head]++; | ||
277 | else if (dccp_ackvec_set_buf_head_state(av, delta, state)) | ||
278 | return -ENOBUFS; | ||
279 | } else { | ||
280 | /* | ||
281 | * A.1.2. Old Packets | ||
282 | * | ||
283 | * When a packet with Sequence Number S <= buf_ackno | ||
284 | * arrives, the HC-Receiver will scan the table for | ||
285 | * the byte corresponding to S. (Indexing structures | ||
286 | * could reduce the complexity of this scan.) | ||
287 | */ | ||
288 | u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); | ||
289 | u32 index = av->av_buf_head; | ||
290 | 221 | ||
291 | while (1) { | 222 | av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); |
292 | const u8 len = dccp_ackvec_len(av, index); | 223 | av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; |
293 | const u8 av_state = dccp_ackvec_state(av, index); | 224 | |
294 | /* | 225 | lost_packets -= len; |
295 | * valid packets not yet in av_buf have a reserved | ||
296 | * entry, with a len equal to 0. | ||
297 | */ | ||
298 | if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && | ||
299 | len == 0 && delta == 0) { /* Found our | ||
300 | reserved seat! */ | ||
301 | dccp_pr_debug("Found %llu reserved seat!\n", | ||
302 | (unsigned long long)ackno); | ||
303 | av->av_buf[index] = state; | ||
304 | goto out; | ||
305 | } | ||
306 | /* len == 0 means one packet */ | ||
307 | if (delta < len + 1) | ||
308 | goto out_duplicate; | ||
309 | |||
310 | delta -= len + 1; | ||
311 | if (++index == DCCP_MAX_ACKVEC_LEN) | ||
312 | index = 0; | ||
313 | } | 226 | } |
314 | } | 227 | } |
315 | 228 | ||
316 | av->av_buf_ackno = ackno; | 229 | if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { |
317 | av->av_time = ktime_get_real(); | 230 | DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); |
318 | out: | 231 | av->av_overflow = true; |
319 | return 0; | 232 | } |
233 | |||
234 | av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); | ||
235 | if (av->av_overflow) | ||
236 | av->av_buf_tail = av->av_buf_head; | ||
320 | 237 | ||
321 | out_duplicate: | 238 | av->av_buf[av->av_buf_head] = state; |
322 | /* Duplicate packet */ | 239 | av->av_buf_ackno = seqno; |
323 | dccp_pr_debug("Received a dup or already considered lost " | 240 | |
324 | "packet: %llu\n", (unsigned long long)ackno); | 241 | if (num_packets > 1) |
325 | return -EILSEQ; | 242 | dccp_ackvec_reserve_seats(av, num_packets - 1); |
326 | } | 243 | } |
327 | 244 | ||
328 | static void dccp_ackvec_throw_record(struct dccp_ackvec *av, | 245 | /** |
329 | struct dccp_ackvec_record *avr) | 246 | * dccp_ackvec_input - Register incoming packet in the buffer |
247 | */ | ||
248 | void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) | ||
330 | { | 249 | { |
331 | struct dccp_ackvec_record *next; | 250 | u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; |
251 | enum dccp_ackvec_states state = DCCPAV_RECEIVED; | ||
332 | 252 | ||
333 | /* sort out vector length */ | 253 | if (dccp_ackvec_is_empty(av)) { |
334 | if (av->av_buf_head <= avr->avr_ack_ptr) | 254 | dccp_ackvec_add_new(av, 1, seqno, state); |
335 | av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; | 255 | av->av_tail_ackno = seqno; |
336 | else | ||
337 | av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - | ||
338 | av->av_buf_head + avr->avr_ack_ptr; | ||
339 | 256 | ||
340 | /* free records */ | 257 | } else { |
341 | list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { | 258 | s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); |
342 | list_del_init(&avr->avr_node); | 259 | u8 *current_head = av->av_buf + av->av_buf_head; |
343 | dccp_ackvec_record_delete(avr); | ||
344 | } | ||
345 | } | ||
346 | 260 | ||
347 | void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, | 261 | if (num_packets == 1 && |
348 | const u64 ackno) | 262 | dccp_ackvec_state(current_head) == state && |
349 | { | 263 | dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { |
350 | struct dccp_ackvec_record *avr; | ||
351 | 264 | ||
352 | /* | 265 | *current_head += 1; |
353 | * If we traverse backwards, it should be faster when we have large | 266 | av->av_buf_ackno = seqno; |
354 | * windows. We will be receiving ACKs for stuff we sent a while back | 267 | |
355 | * -sorbo. | 268 | } else if (num_packets > 0) { |
356 | */ | 269 | dccp_ackvec_add_new(av, num_packets, seqno, state); |
357 | list_for_each_entry_reverse(avr, &av->av_records, avr_node) { | 270 | } else { |
358 | if (ackno == avr->avr_ack_seqno) { | 271 | dccp_ackvec_update_old(av, num_packets, seqno, state); |
359 | dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " | 272 | } |
360 | "ack_ackno=%llu, ACKED!\n", | ||
361 | dccp_role(sk), 1, | ||
362 | (unsigned long long)avr->avr_ack_seqno, | ||
363 | (unsigned long long)avr->avr_ack_ackno); | ||
364 | dccp_ackvec_throw_record(av, avr); | ||
365 | break; | ||
366 | } else if (avr->avr_ack_seqno > ackno) | ||
367 | break; /* old news */ | ||
368 | } | 273 | } |
369 | } | 274 | } |
370 | 275 | ||
371 | static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, | 276 | /** |
372 | struct sock *sk, u64 *ackno, | 277 | * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection |
373 | const unsigned char len, | 278 | * This routine is called when the peer acknowledges the receipt of Ack Vectors |
374 | const unsigned char *vector) | 279 | * up to and including @ackno. While based on on section A.3 of RFC 4340, here |
375 | { | 280 | * are additional precautions to prevent corrupted buffer state. In particular, |
376 | unsigned char i; | 281 | * we use tail_ackno to identify outdated records; it always marks the earliest |
377 | struct dccp_ackvec_record *avr; | 282 | * packet of group (2) in 11.4.2. |
283 | */ | ||
284 | void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) | ||
285 | { | ||
286 | struct dccp_ackvec_record *avr, *next; | ||
287 | u8 runlen_now, eff_runlen; | ||
288 | s64 delta; | ||
378 | 289 | ||
379 | /* Check if we actually sent an ACK vector */ | 290 | avr = dccp_ackvec_lookup(&av->av_records, ackno); |
380 | if (list_empty(&av->av_records)) | 291 | if (avr == NULL) |
381 | return; | 292 | return; |
293 | /* | ||
294 | * Deal with outdated acknowledgments: this arises when e.g. there are | ||
295 | * several old records and the acks from the peer come in slowly. In | ||
296 | * that case we may still have records that pre-date tail_ackno. | ||
297 | */ | ||
298 | delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); | ||
299 | if (delta < 0) | ||
300 | goto free_records; | ||
301 | /* | ||
302 | * Deal with overlapping Ack Vectors: don't subtract more than the | ||
303 | * number of packets between tail_ackno and ack_ackno. | ||
304 | */ | ||
305 | eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; | ||
382 | 306 | ||
383 | i = len; | 307 | runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); |
384 | /* | 308 | /* |
385 | * XXX | 309 | * The run length of Ack Vector cells does not decrease over time. If |
386 | * I think it might be more efficient to work backwards. See comment on | 310 | * the run length is the same as at the time the Ack Vector was sent, we |
387 | * rcv_ackno. -sorbo. | 311 | * free the ack_ptr cell. That cell can however not be freed if the run |
312 | * length has increased: in this case we need to move the tail pointer | ||
313 | * backwards (towards higher indices), to its next-oldest neighbour. | ||
388 | */ | 314 | */ |
389 | avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); | 315 | if (runlen_now > eff_runlen) { |
390 | while (i--) { | ||
391 | const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; | ||
392 | u64 ackno_end_rl; | ||
393 | 316 | ||
394 | dccp_set_seqno(&ackno_end_rl, *ackno - rl); | 317 | av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; |
318 | av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); | ||
395 | 319 | ||
320 | /* This move may not have cleared the overflow flag. */ | ||
321 | if (av->av_overflow) | ||
322 | av->av_overflow = (av->av_buf_head == av->av_buf_tail); | ||
323 | } else { | ||
324 | av->av_buf_tail = avr->avr_ack_ptr; | ||
396 | /* | 325 | /* |
397 | * If our AVR sequence number is greater than the ack, go | 326 | * We have made sure that avr points to a valid cell within the |
398 | * forward in the AVR list until it is not so. | 327 | * buffer. This cell is either older than head, or equals head |
328 | * (empty buffer): in both cases we no longer have any overflow. | ||
399 | */ | 329 | */ |
400 | list_for_each_entry_from(avr, &av->av_records, avr_node) { | 330 | av->av_overflow = 0; |
401 | if (!after48(avr->avr_ack_seqno, *ackno)) | 331 | } |
402 | goto found; | ||
403 | } | ||
404 | /* End of the av_records list, not found, exit */ | ||
405 | break; | ||
406 | found: | ||
407 | if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { | ||
408 | const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; | ||
409 | if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { | ||
410 | dccp_pr_debug("%s ACK vector 0, len=%d, " | ||
411 | "ack_seqno=%llu, ack_ackno=%llu, " | ||
412 | "ACKED!\n", | ||
413 | dccp_role(sk), len, | ||
414 | (unsigned long long) | ||
415 | avr->avr_ack_seqno, | ||
416 | (unsigned long long) | ||
417 | avr->avr_ack_ackno); | ||
418 | dccp_ackvec_throw_record(av, avr); | ||
419 | break; | ||
420 | } | ||
421 | /* | ||
422 | * If it wasn't received, continue scanning... we might | ||
423 | * find another one. | ||
424 | */ | ||
425 | } | ||
426 | 332 | ||
427 | dccp_set_seqno(ackno, ackno_end_rl - 1); | 333 | /* |
428 | ++vector; | 334 | * The peer has acknowledged up to and including ack_ackno. Hence the |
335 | * first packet in group (2) of 11.4.2 is the successor of ack_ackno. | ||
336 | */ | ||
337 | av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); | ||
338 | |||
339 | free_records: | ||
340 | list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { | ||
341 | list_del(&avr->avr_node); | ||
342 | kmem_cache_free(dccp_ackvec_record_slab, avr); | ||
429 | } | 343 | } |
430 | } | 344 | } |
431 | 345 | ||
432 | int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, | 346 | /* |
433 | u64 *ackno, const u8 opt, const u8 *value, const u8 len) | 347 | * Routines to keep track of Ack Vectors received in an skb |
348 | */ | ||
349 | int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) | ||
434 | { | 350 | { |
435 | if (len > DCCP_MAX_ACKVEC_OPT_LEN) | 351 | struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); |
436 | return -1; | 352 | |
353 | if (new == NULL) | ||
354 | return -ENOBUFS; | ||
355 | new->vec = vec; | ||
356 | new->len = len; | ||
357 | new->nonce = nonce; | ||
437 | 358 | ||
438 | /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ | 359 | list_add_tail(&new->node, head); |
439 | dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, | ||
440 | ackno, len, value); | ||
441 | return 0; | 360 | return 0; |
442 | } | 361 | } |
362 | EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); | ||
363 | |||
364 | void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) | ||
365 | { | ||
366 | struct dccp_ackvec_parsed *cur, *next; | ||
367 | |||
368 | list_for_each_entry_safe(cur, next, parsed_chunks, node) | ||
369 | kfree(cur); | ||
370 | INIT_LIST_HEAD(parsed_chunks); | ||
371 | } | ||
372 | EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); | ||
443 | 373 | ||
444 | int __init dccp_ackvec_init(void) | 374 | int __init dccp_ackvec_init(void) |
445 | { | 375 | { |
@@ -449,10 +379,9 @@ int __init dccp_ackvec_init(void) | |||
449 | if (dccp_ackvec_slab == NULL) | 379 | if (dccp_ackvec_slab == NULL) |
450 | goto out_err; | 380 | goto out_err; |
451 | 381 | ||
452 | dccp_ackvec_record_slab = | 382 | dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", |
453 | kmem_cache_create("dccp_ackvec_record", | 383 | sizeof(struct dccp_ackvec_record), |
454 | sizeof(struct dccp_ackvec_record), | 384 | 0, SLAB_HWCACHE_ALIGN, NULL); |
455 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
456 | if (dccp_ackvec_record_slab == NULL) | 385 | if (dccp_ackvec_record_slab == NULL) |
457 | goto out_destroy_slab; | 386 | goto out_destroy_slab; |
458 | 387 | ||
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4acef..6cdca79a99f7 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h | |||
@@ -3,156 +3,134 @@ | |||
3 | /* | 3 | /* |
4 | * net/dccp/ackvec.h | 4 | * net/dccp/ackvec.h |
5 | * | 5 | * |
6 | * An implementation of the DCCP protocol | 6 | * An implementation of Ack Vectors for the DCCP protocol |
7 | * Copyright (c) 2007 University of Aberdeen, Scotland, UK | ||
7 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> | 8 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> |
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | 9 | * This program is free software; you can redistribute it and/or modify it |
10 | * under the terms of the GNU General Public License version 2 as | 10 | * under the terms of the GNU General Public License version 2 as |
11 | * published by the Free Software Foundation. | 11 | * published by the Free Software Foundation. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/dccp.h> | ||
14 | #include <linux/compiler.h> | 15 | #include <linux/compiler.h> |
15 | #include <linux/ktime.h> | ||
16 | #include <linux/list.h> | 16 | #include <linux/list.h> |
17 | #include <linux/types.h> | 17 | #include <linux/types.h> |
18 | 18 | ||
19 | /* Read about the ECN nonce to see why it is 253 */ | 19 | /* |
20 | #define DCCP_MAX_ACKVEC_OPT_LEN 253 | 20 | * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, |
21 | /* We can spread an ack vector across multiple options */ | 21 | * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 |
22 | #define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) | 22 | * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives |
23 | * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. | ||
24 | * The maximum value is bounded by the u16 types for indices and functions. | ||
25 | */ | ||
26 | #define DCCPAV_NUM_ACKVECS 2 | ||
27 | #define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) | ||
23 | 28 | ||
24 | #define DCCP_ACKVEC_STATE_RECEIVED 0 | 29 | /* Estimated minimum average Ack Vector length - used for updating MPS */ |
25 | #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) | 30 | #define DCCPAV_MIN_OPTLEN 16 |
26 | #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) | ||
27 | 31 | ||
28 | #define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ | 32 | /* Threshold for coping with large bursts of losses */ |
29 | #define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ | 33 | #define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) |
30 | 34 | ||
31 | /** struct dccp_ackvec - ack vector | 35 | enum dccp_ackvec_states { |
32 | * | 36 | DCCPAV_RECEIVED = 0x00, |
33 | * This data structure is the one defined in RFC 4340, Appendix A. | 37 | DCCPAV_ECN_MARKED = 0x40, |
34 | * | 38 | DCCPAV_RESERVED = 0x80, |
35 | * @av_buf_head - circular buffer head | 39 | DCCPAV_NOT_RECEIVED = 0xC0 |
36 | * @av_buf_tail - circular buffer tail | 40 | }; |
37 | * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the | 41 | #define DCCPAV_MAX_RUNLEN 0x3F |
38 | * buffer (i.e. %av_buf_head) | 42 | |
39 | * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked | 43 | static inline u8 dccp_ackvec_runlen(const u8 *cell) |
40 | * by the buffer with State 0 | 44 | { |
41 | * | 45 | return *cell & DCCPAV_MAX_RUNLEN; |
42 | * Additionally, the HC-Receiver must keep some information about the | 46 | } |
43 | * Ack Vectors it has recently sent. For each packet sent carrying an | 47 | |
44 | * Ack Vector, it remembers four variables: | 48 | static inline u8 dccp_ackvec_state(const u8 *cell) |
49 | { | ||
50 | return *cell & ~DCCPAV_MAX_RUNLEN; | ||
51 | } | ||
52 | |||
53 | /** struct dccp_ackvec - Ack Vector main data structure | ||
45 | * | 54 | * |
46 | * @av_records - list of dccp_ackvec_record | 55 | * This implements a fixed-size circular buffer within an array and is largely |
47 | * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. | 56 | * based on Appendix A of RFC 4340. |
48 | * | 57 | * |
49 | * @av_time - the time in usecs | 58 | * @av_buf: circular buffer storage area |
50 | * @av_buf - circular buffer of acknowledgeable packets | 59 | * @av_buf_head: head index; begin of live portion in @av_buf |
60 | * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf | ||
61 | * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf | ||
62 | * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf | ||
63 | * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to | ||
64 | * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf | ||
65 | * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound | ||
66 | * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) | ||
51 | */ | 67 | */ |
52 | struct dccp_ackvec { | 68 | struct dccp_ackvec { |
53 | u64 av_buf_ackno; | 69 | u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; |
54 | struct list_head av_records; | ||
55 | ktime_t av_time; | ||
56 | u16 av_buf_head; | 70 | u16 av_buf_head; |
57 | u16 av_vec_len; | 71 | u16 av_buf_tail; |
58 | u8 av_buf_nonce; | 72 | u64 av_buf_ackno:48; |
59 | u8 av_ack_nonce; | 73 | u64 av_tail_ackno:48; |
60 | u8 av_buf[DCCP_MAX_ACKVEC_LEN]; | 74 | bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; |
75 | u8 av_overflow:1; | ||
76 | struct list_head av_records; | ||
61 | }; | 77 | }; |
62 | 78 | ||
63 | /** struct dccp_ackvec_record - ack vector record | 79 | /** struct dccp_ackvec_record - Records information about sent Ack Vectors |
64 | * | 80 | * |
65 | * ACK vector record as defined in Appendix A of spec. | 81 | * These list entries define the additional information which the HC-Receiver |
82 | * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. | ||
66 | * | 83 | * |
67 | * The list is sorted by avr_ack_seqno | 84 | * @avr_node: the list node in @av_records |
85 | * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on | ||
86 | * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to | ||
87 | * @avr_ack_ptr: pointer into @av_buf where this record starts | ||
88 | * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending | ||
89 | * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent | ||
68 | * | 90 | * |
69 | * @avr_node - node in av_records | 91 | * The list as a whole is sorted in descending order by @avr_ack_seqno. |
70 | * @avr_ack_seqno - sequence number of the packet this record was sent on | ||
71 | * @avr_ack_ackno - sequence number being acknowledged | ||
72 | * @avr_ack_ptr - pointer into av_buf where this record starts | ||
73 | * @avr_ack_nonce - av_ack_nonce at the time this record was sent | ||
74 | * @avr_sent_len - lenght of the record in av_buf | ||
75 | */ | 92 | */ |
76 | struct dccp_ackvec_record { | 93 | struct dccp_ackvec_record { |
77 | struct list_head avr_node; | 94 | struct list_head avr_node; |
78 | u64 avr_ack_seqno; | 95 | u64 avr_ack_seqno:48; |
79 | u64 avr_ack_ackno; | 96 | u64 avr_ack_ackno:48; |
80 | u16 avr_ack_ptr; | 97 | u16 avr_ack_ptr; |
81 | u16 avr_sent_len; | 98 | u8 avr_ack_runlen; |
82 | u8 avr_ack_nonce; | 99 | u8 avr_ack_nonce:1; |
83 | }; | 100 | }; |
84 | 101 | ||
85 | struct sock; | 102 | extern int dccp_ackvec_init(void); |
86 | struct sk_buff; | ||
87 | |||
88 | #ifdef CONFIG_IP_DCCP_ACKVEC | ||
89 | extern int dccp_ackvec_init(void); | ||
90 | extern void dccp_ackvec_exit(void); | 103 | extern void dccp_ackvec_exit(void); |
91 | 104 | ||
92 | extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); | 105 | extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); |
93 | extern void dccp_ackvec_free(struct dccp_ackvec *av); | 106 | extern void dccp_ackvec_free(struct dccp_ackvec *av); |
94 | 107 | ||
95 | extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, | 108 | extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); |
96 | const u64 ackno, const u8 state); | 109 | extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); |
97 | 110 | extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); | |
98 | extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, | 111 | extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); |
99 | struct sock *sk, const u64 ackno); | ||
100 | extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, | ||
101 | u64 *ackno, const u8 opt, | ||
102 | const u8 *value, const u8 len); | ||
103 | 112 | ||
104 | extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); | 113 | static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) |
105 | |||
106 | static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) | ||
107 | { | ||
108 | return av->av_vec_len; | ||
109 | } | ||
110 | #else /* CONFIG_IP_DCCP_ACKVEC */ | ||
111 | static inline int dccp_ackvec_init(void) | ||
112 | { | 114 | { |
113 | return 0; | 115 | return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; |
114 | } | 116 | } |
115 | 117 | ||
116 | static inline void dccp_ackvec_exit(void) | 118 | /** |
117 | { | 119 | * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb |
118 | } | 120 | * @vec: start of vector (offset into skb) |
119 | 121 | * @len: length of @vec | |
120 | static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) | 122 | * @nonce: whether @vec had an ECN nonce of 0 or 1 |
121 | { | 123 | * @node: FIFO - arranged in descending order of ack_ackno |
122 | return NULL; | 124 | * This structure is used by CCIDs to access Ack Vectors in a received skb. |
123 | } | 125 | */ |
124 | 126 | struct dccp_ackvec_parsed { | |
125 | static inline void dccp_ackvec_free(struct dccp_ackvec *av) | 127 | u8 *vec, |
126 | { | 128 | len, |
127 | } | 129 | nonce:1; |
128 | 130 | struct list_head node; | |
129 | static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, | 131 | }; |
130 | const u64 ackno, const u8 state) | ||
131 | { | ||
132 | return -1; | ||
133 | } | ||
134 | |||
135 | static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, | ||
136 | struct sock *sk, const u64 ackno) | ||
137 | { | ||
138 | } | ||
139 | |||
140 | static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, | ||
141 | const u64 *ackno, const u8 opt, | ||
142 | const u8 *value, const u8 len) | ||
143 | { | ||
144 | return -1; | ||
145 | } | ||
146 | |||
147 | static inline int dccp_insert_option_ackvec(const struct sock *sk, | ||
148 | const struct sk_buff *skb) | ||
149 | { | ||
150 | return -1; | ||
151 | } | ||
152 | 132 | ||
153 | static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) | 133 | extern int dccp_ackvec_parsed_add(struct list_head *head, |
154 | { | 134 | u8 *vec, u8 len, u8 nonce); |
155 | return 0; | 135 | extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); |
156 | } | ||
157 | #endif /* CONFIG_IP_DCCP_ACKVEC */ | ||
158 | #endif /* _ACKVEC_H */ | 136 | #endif /* _ACKVEC_H */ |
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 4809753d12ae..e3fb52b4f5c6 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c | |||
@@ -13,6 +13,13 @@ | |||
13 | 13 | ||
14 | #include "ccid.h" | 14 | #include "ccid.h" |
15 | 15 | ||
16 | static u8 builtin_ccids[] = { | ||
17 | DCCPC_CCID2, /* CCID2 is supported by default */ | ||
18 | #if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) | ||
19 | DCCPC_CCID3, | ||
20 | #endif | ||
21 | }; | ||
22 | |||
16 | static struct ccid_operations *ccids[CCID_MAX]; | 23 | static struct ccid_operations *ccids[CCID_MAX]; |
17 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 24 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
18 | static atomic_t ccids_lockct = ATOMIC_INIT(0); | 25 | static atomic_t ccids_lockct = ATOMIC_INIT(0); |
@@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) | |||
86 | } | 93 | } |
87 | } | 94 | } |
88 | 95 | ||
96 | /* check that up to @array_len members in @ccid_array are supported */ | ||
97 | bool ccid_support_check(u8 const *ccid_array, u8 array_len) | ||
98 | { | ||
99 | u8 i, j, found; | ||
100 | |||
101 | for (i = 0, found = 0; i < array_len; i++, found = 0) { | ||
102 | for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) | ||
103 | found = (ccid_array[i] == builtin_ccids[j]); | ||
104 | if (!found) | ||
105 | return false; | ||
106 | } | ||
107 | return true; | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array | ||
112 | * @ccid_array: pointer to copy into | ||
113 | * @array_len: pointer to return the array length into | ||
114 | * This function allocates memory - caller must see that it is freed after use. | ||
115 | */ | ||
116 | int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) | ||
117 | { | ||
118 | *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); | ||
119 | if (*ccid_array == NULL) | ||
120 | return -ENOBUFS; | ||
121 | *array_len = ARRAY_SIZE(builtin_ccids); | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, | ||
126 | char __user *optval, int __user *optlen) | ||
127 | { | ||
128 | if (len < sizeof(builtin_ccids)) | ||
129 | return -EINVAL; | ||
130 | |||
131 | if (put_user(sizeof(builtin_ccids), optlen) || | ||
132 | copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) | ||
133 | return -EFAULT; | ||
134 | return 0; | ||
135 | } | ||
136 | |||
89 | int ccid_register(struct ccid_operations *ccid_ops) | 137 | int ccid_register(struct ccid_operations *ccid_ops) |
90 | { | 138 | { |
91 | int err = -ENOBUFS; | 139 | int err = -ENOBUFS; |
@@ -148,22 +196,41 @@ int ccid_unregister(struct ccid_operations *ccid_ops) | |||
148 | 196 | ||
149 | EXPORT_SYMBOL_GPL(ccid_unregister); | 197 | EXPORT_SYMBOL_GPL(ccid_unregister); |
150 | 198 | ||
199 | /** | ||
200 | * ccid_request_module - Pre-load CCID module for later use | ||
201 | * This should be called only from process context (e.g. during connection | ||
202 | * setup) and is necessary for later calls to ccid_new (typically in software | ||
203 | * interrupt), so that it has the modules available when they are needed. | ||
204 | */ | ||
205 | static int ccid_request_module(u8 id) | ||
206 | { | ||
207 | if (!in_atomic()) { | ||
208 | ccids_read_lock(); | ||
209 | if (ccids[id] == NULL) { | ||
210 | ccids_read_unlock(); | ||
211 | return request_module("net-dccp-ccid-%d", id); | ||
212 | } | ||
213 | ccids_read_unlock(); | ||
214 | } | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | int ccid_request_modules(u8 const *ccid_array, u8 array_len) | ||
219 | { | ||
220 | #ifdef CONFIG_KMOD | ||
221 | while (array_len--) | ||
222 | if (ccid_request_module(ccid_array[array_len])) | ||
223 | return -1; | ||
224 | #endif | ||
225 | return 0; | ||
226 | } | ||
227 | |||
151 | struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) | 228 | struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) |
152 | { | 229 | { |
153 | struct ccid_operations *ccid_ops; | 230 | struct ccid_operations *ccid_ops; |
154 | struct ccid *ccid = NULL; | 231 | struct ccid *ccid = NULL; |
155 | 232 | ||
156 | ccids_read_lock(); | 233 | ccids_read_lock(); |
157 | #ifdef CONFIG_KMOD | ||
158 | if (ccids[id] == NULL) { | ||
159 | /* We only try to load if in process context */ | ||
160 | ccids_read_unlock(); | ||
161 | if (gfp & GFP_ATOMIC) | ||
162 | goto out; | ||
163 | request_module("net-dccp-ccid-%d", id); | ||
164 | ccids_read_lock(); | ||
165 | } | ||
166 | #endif | ||
167 | ccid_ops = ccids[id]; | 234 | ccid_ops = ccids[id]; |
168 | if (ccid_ops == NULL) | 235 | if (ccid_ops == NULL) |
169 | goto out_unlock; | 236 | goto out_unlock; |
@@ -205,20 +272,6 @@ out_module_put: | |||
205 | 272 | ||
206 | EXPORT_SYMBOL_GPL(ccid_new); | 273 | EXPORT_SYMBOL_GPL(ccid_new); |
207 | 274 | ||
208 | struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) | ||
209 | { | ||
210 | return ccid_new(id, sk, 1, gfp); | ||
211 | } | ||
212 | |||
213 | EXPORT_SYMBOL_GPL(ccid_hc_rx_new); | ||
214 | |||
215 | struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) | ||
216 | { | ||
217 | return ccid_new(id, sk, 0, gfp); | ||
218 | } | ||
219 | |||
220 | EXPORT_SYMBOL_GPL(ccid_hc_tx_new); | ||
221 | |||
222 | static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) | 275 | static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) |
223 | { | 276 | { |
224 | struct ccid_operations *ccid_ops; | 277 | struct ccid_operations *ccid_ops; |
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index fdeae7b57319..d27054ba2159 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h | |||
@@ -60,22 +60,18 @@ struct ccid_operations { | |||
60 | void (*ccid_hc_tx_exit)(struct sock *sk); | 60 | void (*ccid_hc_tx_exit)(struct sock *sk); |
61 | void (*ccid_hc_rx_packet_recv)(struct sock *sk, | 61 | void (*ccid_hc_rx_packet_recv)(struct sock *sk, |
62 | struct sk_buff *skb); | 62 | struct sk_buff *skb); |
63 | int (*ccid_hc_rx_parse_options)(struct sock *sk, | 63 | int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, |
64 | unsigned char option, | 64 | u8 opt, u8 *val, u8 len); |
65 | unsigned char len, u16 idx, | ||
66 | unsigned char* value); | ||
67 | int (*ccid_hc_rx_insert_options)(struct sock *sk, | 65 | int (*ccid_hc_rx_insert_options)(struct sock *sk, |
68 | struct sk_buff *skb); | 66 | struct sk_buff *skb); |
69 | void (*ccid_hc_tx_packet_recv)(struct sock *sk, | 67 | void (*ccid_hc_tx_packet_recv)(struct sock *sk, |
70 | struct sk_buff *skb); | 68 | struct sk_buff *skb); |
71 | int (*ccid_hc_tx_parse_options)(struct sock *sk, | 69 | int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, |
72 | unsigned char option, | 70 | u8 opt, u8 *val, u8 len); |
73 | unsigned char len, u16 idx, | ||
74 | unsigned char* value); | ||
75 | int (*ccid_hc_tx_send_packet)(struct sock *sk, | 71 | int (*ccid_hc_tx_send_packet)(struct sock *sk, |
76 | struct sk_buff *skb); | 72 | struct sk_buff *skb); |
77 | void (*ccid_hc_tx_packet_sent)(struct sock *sk, | 73 | void (*ccid_hc_tx_packet_sent)(struct sock *sk, |
78 | int more, unsigned int len); | 74 | unsigned int len); |
79 | void (*ccid_hc_rx_get_info)(struct sock *sk, | 75 | void (*ccid_hc_rx_get_info)(struct sock *sk, |
80 | struct tcp_info *info); | 76 | struct tcp_info *info); |
81 | void (*ccid_hc_tx_get_info)(struct sock *sk, | 77 | void (*ccid_hc_tx_get_info)(struct sock *sk, |
@@ -103,31 +99,78 @@ static inline void *ccid_priv(const struct ccid *ccid) | |||
103 | return (void *)ccid->ccid_priv; | 99 | return (void *)ccid->ccid_priv; |
104 | } | 100 | } |
105 | 101 | ||
102 | extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); | ||
103 | extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); | ||
104 | extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, | ||
105 | char __user *, int __user *); | ||
106 | |||
107 | extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); | ||
106 | extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, | 108 | extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, |
107 | gfp_t gfp); | 109 | gfp_t gfp); |
108 | 110 | ||
109 | extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, | 111 | static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) |
110 | gfp_t gfp); | 112 | { |
111 | extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, | 113 | struct ccid *ccid = dp->dccps_hc_rx_ccid; |
112 | gfp_t gfp); | 114 | |
115 | if (ccid == NULL || ccid->ccid_ops == NULL) | ||
116 | return -1; | ||
117 | return ccid->ccid_ops->ccid_id; | ||
118 | } | ||
119 | |||
120 | static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) | ||
121 | { | ||
122 | struct ccid *ccid = dp->dccps_hc_tx_ccid; | ||
123 | |||
124 | if (ccid == NULL || ccid->ccid_ops == NULL) | ||
125 | return -1; | ||
126 | return ccid->ccid_ops->ccid_id; | ||
127 | } | ||
113 | 128 | ||
114 | extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); | 129 | extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); |
115 | extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); | 130 | extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); |
116 | 131 | ||
132 | /* | ||
133 | * Congestion control of queued data packets via CCID decision. | ||
134 | * | ||
135 | * The TX CCID performs its congestion-control by indicating whether and when a | ||
136 | * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). | ||
137 | * The following modes are supported via the symbolic constants below: | ||
138 | * - timer-based pacing (CCID returns a delay value in milliseconds); | ||
139 | * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). | ||
140 | */ | ||
141 | |||
142 | enum ccid_dequeueing_decision { | ||
143 | CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ | ||
144 | CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ | ||
145 | CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ | ||
146 | CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ | ||
147 | CCID_PACKET_ERR = 0xF0000, /* error condition */ | ||
148 | }; | ||
149 | |||
150 | static inline int ccid_packet_dequeue_eval(const int return_code) | ||
151 | { | ||
152 | if (return_code < 0) | ||
153 | return CCID_PACKET_ERR; | ||
154 | if (return_code == 0) | ||
155 | return CCID_PACKET_SEND_AT_ONCE; | ||
156 | if (return_code <= CCID_PACKET_DELAY_MAX) | ||
157 | return CCID_PACKET_DELAY; | ||
158 | return return_code; | ||
159 | } | ||
160 | |||
117 | static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, | 161 | static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, |
118 | struct sk_buff *skb) | 162 | struct sk_buff *skb) |
119 | { | 163 | { |
120 | int rc = 0; | ||
121 | if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) | 164 | if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) |
122 | rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); | 165 | return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); |
123 | return rc; | 166 | return CCID_PACKET_SEND_AT_ONCE; |
124 | } | 167 | } |
125 | 168 | ||
126 | static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, | 169 | static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, |
127 | int more, unsigned int len) | 170 | unsigned int len) |
128 | { | 171 | { |
129 | if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) | 172 | if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) |
130 | ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); | 173 | ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); |
131 | } | 174 | } |
132 | 175 | ||
133 | static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, | 176 | static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, |
@@ -144,27 +187,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, | |||
144 | ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); | 187 | ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); |
145 | } | 188 | } |
146 | 189 | ||
190 | /** | ||
191 | * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver | ||
192 | * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) | ||
193 | * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) | ||
194 | * @val: value of @opt | ||
195 | * @len: length of @val in bytes | ||
196 | */ | ||
147 | static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, | 197 | static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, |
148 | unsigned char option, | 198 | u8 pkt, u8 opt, u8 *val, u8 len) |
149 | unsigned char len, u16 idx, | ||
150 | unsigned char* value) | ||
151 | { | 199 | { |
152 | int rc = 0; | 200 | if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) |
153 | if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) | 201 | return 0; |
154 | rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, | 202 | return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); |
155 | value); | ||
156 | return rc; | ||
157 | } | 203 | } |
158 | 204 | ||
205 | /** | ||
206 | * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender | ||
207 | * Arguments are analogous to ccid_hc_tx_parse_options() | ||
208 | */ | ||
159 | static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, | 209 | static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, |
160 | unsigned char option, | 210 | u8 pkt, u8 opt, u8 *val, u8 len) |
161 | unsigned char len, u16 idx, | ||
162 | unsigned char* value) | ||
163 | { | 211 | { |
164 | int rc = 0; | 212 | if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) |
165 | if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) | 213 | return 0; |
166 | rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); | 214 | return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); |
167 | return rc; | ||
168 | } | 215 | } |
169 | 216 | ||
170 | static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, | 217 | static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, |
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 12275943eab8..fb168be2cb43 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig | |||
@@ -1,10 +1,8 @@ | |||
1 | menu "DCCP CCIDs Configuration (EXPERIMENTAL)" | 1 | menu "DCCP CCIDs Configuration (EXPERIMENTAL)" |
2 | depends on EXPERIMENTAL | ||
3 | 2 | ||
4 | config IP_DCCP_CCID2 | 3 | config IP_DCCP_CCID2 |
5 | tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" | 4 | tristate "CCID2 (TCP-Like)" |
6 | def_tristate IP_DCCP | 5 | def_tristate IP_DCCP |
7 | select IP_DCCP_ACKVEC | ||
8 | ---help--- | 6 | ---help--- |
9 | CCID 2, TCP-like Congestion Control, denotes Additive Increase, | 7 | CCID 2, TCP-like Congestion Control, denotes Additive Increase, |
10 | Multiplicative Decrease (AIMD) congestion control with behavior | 8 | Multiplicative Decrease (AIMD) congestion control with behavior |
@@ -36,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG | |||
36 | If in doubt, say N. | 34 | If in doubt, say N. |
37 | 35 | ||
38 | config IP_DCCP_CCID3 | 36 | config IP_DCCP_CCID3 |
39 | tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" | 37 | tristate "CCID3 (TCP-Friendly)" |
40 | def_tristate IP_DCCP | 38 | def_tristate IP_DCCP |
41 | select IP_DCCP_TFRC_LIB | 39 | select IP_DCCP_TFRC_LIB |
42 | ---help--- | 40 | ---help--- |
@@ -64,9 +62,9 @@ config IP_DCCP_CCID3 | |||
64 | 62 | ||
65 | If in doubt, say M. | 63 | If in doubt, say M. |
66 | 64 | ||
65 | if IP_DCCP_CCID3 | ||
67 | config IP_DCCP_CCID3_DEBUG | 66 | config IP_DCCP_CCID3_DEBUG |
68 | bool "CCID3 debugging messages" | 67 | bool "CCID3 debugging messages" |
69 | depends on IP_DCCP_CCID3 | ||
70 | ---help--- | 68 | ---help--- |
71 | Enable CCID3-specific debugging messages. | 69 | Enable CCID3-specific debugging messages. |
72 | 70 | ||
@@ -76,10 +74,29 @@ config IP_DCCP_CCID3_DEBUG | |||
76 | 74 | ||
77 | If in doubt, say N. | 75 | If in doubt, say N. |
78 | 76 | ||
77 | choice | ||
78 | prompt "Select method for measuring the packet size s" | ||
79 | default IP_DCCP_CCID3_MEASURE_S_AS_MPS | ||
80 | |||
81 | config IP_DCCP_CCID3_MEASURE_S_AS_MPS | ||
82 | bool "Always use MPS in place of s" | ||
83 | ---help--- | ||
84 | This use is recommended as it is consistent with the initialisation | ||
85 | of X and suggested when s varies (rfc3448bis, (1) in section 4.1). | ||
86 | config IP_DCCP_CCID3_MEASURE_S_AS_AVG | ||
87 | bool "Use moving average" | ||
88 | ---help--- | ||
89 | An alternative way of tracking s, also supported by rfc3448bis. | ||
90 | This used to be the default for CCID-3 in previous kernels. | ||
91 | config IP_DCCP_CCID3_MEASURE_S_AS_MAX | ||
92 | bool "Track the maximum payload length" | ||
93 | ---help--- | ||
94 | An experimental method based on tracking the maximum packet size. | ||
95 | endchoice | ||
96 | |||
79 | config IP_DCCP_CCID3_RTO | 97 | config IP_DCCP_CCID3_RTO |
80 | int "Use higher bound for nofeedback timer" | 98 | int "Use higher bound for nofeedback timer" |
81 | default 100 | 99 | default 100 |
82 | depends on IP_DCCP_CCID3 && EXPERIMENTAL | ||
83 | ---help--- | 100 | ---help--- |
84 | Use a higher lower bound for the nofeedback timer expiration. | 101 | Use a higher lower bound for the nofeedback timer expiration. |
85 | 102 | ||
@@ -106,6 +123,7 @@ config IP_DCCP_CCID3_RTO | |||
106 | The purpose of the nofeedback timer is to slow DCCP down when there | 123 | The purpose of the nofeedback timer is to slow DCCP down when there |
107 | is serious network congestion: experimenting with larger values should | 124 | is serious network congestion: experimenting with larger values should |
108 | therefore not be performed on WANs. | 125 | therefore not be performed on WANs. |
126 | endif # IP_DCCP_CCID3 | ||
109 | 127 | ||
110 | config IP_DCCP_TFRC_LIB | 128 | config IP_DCCP_TFRC_LIB |
111 | tristate | 129 | tristate |
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9a430734530c..fa713227c66f 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c | |||
@@ -25,7 +25,7 @@ | |||
25 | /* | 25 | /* |
26 | * This implementation should follow RFC 4341 | 26 | * This implementation should follow RFC 4341 |
27 | */ | 27 | */ |
28 | 28 | #include "../feat.h" | |
29 | #include "../ccid.h" | 29 | #include "../ccid.h" |
30 | #include "../dccp.h" | 30 | #include "../dccp.h" |
31 | #include "ccid2.h" | 31 | #include "ccid2.h" |
@@ -34,51 +34,8 @@ | |||
34 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 34 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
35 | static int ccid2_debug; | 35 | static int ccid2_debug; |
36 | #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) | 36 | #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) |
37 | |||
38 | static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) | ||
39 | { | ||
40 | int len = 0; | ||
41 | int pipe = 0; | ||
42 | struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; | ||
43 | |||
44 | /* there is data in the chain */ | ||
45 | if (seqp != hctx->ccid2hctx_seqt) { | ||
46 | seqp = seqp->ccid2s_prev; | ||
47 | len++; | ||
48 | if (!seqp->ccid2s_acked) | ||
49 | pipe++; | ||
50 | |||
51 | while (seqp != hctx->ccid2hctx_seqt) { | ||
52 | struct ccid2_seq *prev = seqp->ccid2s_prev; | ||
53 | |||
54 | len++; | ||
55 | if (!prev->ccid2s_acked) | ||
56 | pipe++; | ||
57 | |||
58 | /* packets are sent sequentially */ | ||
59 | BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, | ||
60 | prev->ccid2s_seq ) >= 0); | ||
61 | BUG_ON(time_before(seqp->ccid2s_sent, | ||
62 | prev->ccid2s_sent)); | ||
63 | |||
64 | seqp = prev; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | BUG_ON(pipe != hctx->ccid2hctx_pipe); | ||
69 | ccid2_pr_debug("len of chain=%d\n", len); | ||
70 | |||
71 | do { | ||
72 | seqp = seqp->ccid2s_prev; | ||
73 | len++; | ||
74 | } while (seqp != hctx->ccid2hctx_seqh); | ||
75 | |||
76 | ccid2_pr_debug("total len=%d\n", len); | ||
77 | BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); | ||
78 | } | ||
79 | #else | 37 | #else |
80 | #define ccid2_pr_debug(format, a...) | 38 | #define ccid2_pr_debug(format, a...) |
81 | #define ccid2_hc_tx_check_sanity(hctx) | ||
82 | #endif | 39 | #endif |
83 | 40 | ||
84 | static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | 41 | static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) |
@@ -87,8 +44,7 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | |||
87 | int i; | 44 | int i; |
88 | 45 | ||
89 | /* check if we have space to preserve the pointer to the buffer */ | 46 | /* check if we have space to preserve the pointer to the buffer */ |
90 | if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / | 47 | if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) |
91 | sizeof(struct ccid2_seq*))) | ||
92 | return -ENOMEM; | 48 | return -ENOMEM; |
93 | 49 | ||
94 | /* allocate buffer and initialize linked list */ | 50 | /* allocate buffer and initialize linked list */ |
@@ -104,38 +60,35 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | |||
104 | seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; | 60 | seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; |
105 | 61 | ||
106 | /* This is the first allocation. Initialise the head and tail. */ | 62 | /* This is the first allocation. Initialise the head and tail. */ |
107 | if (hctx->ccid2hctx_seqbufc == 0) | 63 | if (hctx->seqbufc == 0) |
108 | hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; | 64 | hctx->seqh = hctx->seqt = seqp; |
109 | else { | 65 | else { |
110 | /* link the existing list with the one we just created */ | 66 | /* link the existing list with the one we just created */ |
111 | hctx->ccid2hctx_seqh->ccid2s_next = seqp; | 67 | hctx->seqh->ccid2s_next = seqp; |
112 | seqp->ccid2s_prev = hctx->ccid2hctx_seqh; | 68 | seqp->ccid2s_prev = hctx->seqh; |
113 | 69 | ||
114 | hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; | 70 | hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; |
115 | seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; | 71 | seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; |
116 | } | 72 | } |
117 | 73 | ||
118 | /* store the original pointer to the buffer so we can free it */ | 74 | /* store the original pointer to the buffer so we can free it */ |
119 | hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; | 75 | hctx->seqbuf[hctx->seqbufc] = seqp; |
120 | hctx->ccid2hctx_seqbufc++; | 76 | hctx->seqbufc++; |
121 | 77 | ||
122 | return 0; | 78 | return 0; |
123 | } | 79 | } |
124 | 80 | ||
125 | static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | 81 | static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) |
126 | { | 82 | { |
127 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 83 | if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) |
128 | 84 | return CCID_PACKET_WILL_DEQUEUE_LATER; | |
129 | if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) | 85 | return CCID_PACKET_SEND_AT_ONCE; |
130 | return 0; | ||
131 | |||
132 | return 1; /* XXX CCID should dequeue when ready instead of polling */ | ||
133 | } | 86 | } |
134 | 87 | ||
135 | static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | 88 | static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) |
136 | { | 89 | { |
137 | struct dccp_sock *dp = dccp_sk(sk); | 90 | struct dccp_sock *dp = dccp_sk(sk); |
138 | u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); | 91 | u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); |
139 | 92 | ||
140 | /* | 93 | /* |
141 | * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from | 94 | * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from |
@@ -147,8 +100,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | |||
147 | DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); | 100 | DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); |
148 | val = max_ratio; | 101 | val = max_ratio; |
149 | } | 102 | } |
150 | if (val > 0xFFFF) /* RFC 4340, 11.3 */ | 103 | if (val > DCCPF_ACK_RATIO_MAX) |
151 | val = 0xFFFF; | 104 | val = DCCPF_ACK_RATIO_MAX; |
152 | 105 | ||
153 | if (val == dp->dccps_l_ack_ratio) | 106 | if (val == dp->dccps_l_ack_ratio) |
154 | return; | 107 | return; |
@@ -157,99 +110,77 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | |||
157 | dp->dccps_l_ack_ratio = val; | 110 | dp->dccps_l_ack_ratio = val; |
158 | } | 111 | } |
159 | 112 | ||
160 | static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) | ||
161 | { | ||
162 | ccid2_pr_debug("change SRTT to %ld\n", val); | ||
163 | hctx->ccid2hctx_srtt = val; | ||
164 | } | ||
165 | |||
166 | static void ccid2_start_rto_timer(struct sock *sk); | ||
167 | |||
168 | static void ccid2_hc_tx_rto_expire(unsigned long data) | 113 | static void ccid2_hc_tx_rto_expire(unsigned long data) |
169 | { | 114 | { |
170 | struct sock *sk = (struct sock *)data; | 115 | struct sock *sk = (struct sock *)data; |
171 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 116 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
172 | long s; | 117 | const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); |
173 | 118 | ||
174 | bh_lock_sock(sk); | 119 | bh_lock_sock(sk); |
175 | if (sock_owned_by_user(sk)) { | 120 | if (sock_owned_by_user(sk)) { |
176 | sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, | 121 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); |
177 | jiffies + HZ / 5); | ||
178 | goto out; | 122 | goto out; |
179 | } | 123 | } |
180 | 124 | ||
181 | ccid2_pr_debug("RTO_EXPIRE\n"); | 125 | ccid2_pr_debug("RTO_EXPIRE\n"); |
182 | 126 | ||
183 | ccid2_hc_tx_check_sanity(hctx); | ||
184 | |||
185 | /* back-off timer */ | 127 | /* back-off timer */ |
186 | hctx->ccid2hctx_rto <<= 1; | 128 | hctx->rto <<= 1; |
187 | 129 | if (hctx->rto > DCCP_RTO_MAX) | |
188 | s = hctx->ccid2hctx_rto / HZ; | 130 | hctx->rto = DCCP_RTO_MAX; |
189 | if (s > 60) | ||
190 | hctx->ccid2hctx_rto = 60 * HZ; | ||
191 | |||
192 | ccid2_start_rto_timer(sk); | ||
193 | 131 | ||
194 | /* adjust pipe, cwnd etc */ | 132 | /* adjust pipe, cwnd etc */ |
195 | hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; | 133 | hctx->ssthresh = hctx->cwnd / 2; |
196 | if (hctx->ccid2hctx_ssthresh < 2) | 134 | if (hctx->ssthresh < 2) |
197 | hctx->ccid2hctx_ssthresh = 2; | 135 | hctx->ssthresh = 2; |
198 | hctx->ccid2hctx_cwnd = 1; | 136 | hctx->cwnd = 1; |
199 | hctx->ccid2hctx_pipe = 0; | 137 | hctx->pipe = 0; |
200 | 138 | ||
201 | /* clear state about stuff we sent */ | 139 | /* clear state about stuff we sent */ |
202 | hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; | 140 | hctx->seqt = hctx->seqh; |
203 | hctx->ccid2hctx_packets_acked = 0; | 141 | hctx->packets_acked = 0; |
204 | 142 | ||
205 | /* clear ack ratio state. */ | 143 | /* clear ack ratio state. */ |
206 | hctx->ccid2hctx_rpseq = 0; | 144 | hctx->rpseq = 0; |
207 | hctx->ccid2hctx_rpdupack = -1; | 145 | hctx->rpdupack = -1; |
208 | ccid2_change_l_ack_ratio(sk, 1); | 146 | ccid2_change_l_ack_ratio(sk, 1); |
209 | ccid2_hc_tx_check_sanity(hctx); | 147 | |
148 | /* if we were blocked before, we may now send cwnd=1 packet */ | ||
149 | if (sender_was_blocked) | ||
150 | tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); | ||
151 | /* restart backed-off timer */ | ||
152 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); | ||
210 | out: | 153 | out: |
211 | bh_unlock_sock(sk); | 154 | bh_unlock_sock(sk); |
212 | sock_put(sk); | 155 | sock_put(sk); |
213 | } | 156 | } |
214 | 157 | ||
215 | static void ccid2_start_rto_timer(struct sock *sk) | 158 | static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) |
216 | { | ||
217 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
218 | |||
219 | ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); | ||
220 | |||
221 | BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); | ||
222 | sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, | ||
223 | jiffies + hctx->ccid2hctx_rto); | ||
224 | } | ||
225 | |||
226 | static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) | ||
227 | { | 159 | { |
228 | struct dccp_sock *dp = dccp_sk(sk); | 160 | struct dccp_sock *dp = dccp_sk(sk); |
229 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 161 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
230 | struct ccid2_seq *next; | 162 | struct ccid2_seq *next; |
231 | 163 | ||
232 | hctx->ccid2hctx_pipe++; | 164 | hctx->pipe++; |
233 | 165 | ||
234 | hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; | 166 | hctx->seqh->ccid2s_seq = dp->dccps_gss; |
235 | hctx->ccid2hctx_seqh->ccid2s_acked = 0; | 167 | hctx->seqh->ccid2s_acked = 0; |
236 | hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; | 168 | hctx->seqh->ccid2s_sent = jiffies; |
237 | 169 | ||
238 | next = hctx->ccid2hctx_seqh->ccid2s_next; | 170 | next = hctx->seqh->ccid2s_next; |
239 | /* check if we need to alloc more space */ | 171 | /* check if we need to alloc more space */ |
240 | if (next == hctx->ccid2hctx_seqt) { | 172 | if (next == hctx->seqt) { |
241 | if (ccid2_hc_tx_alloc_seq(hctx)) { | 173 | if (ccid2_hc_tx_alloc_seq(hctx)) { |
242 | DCCP_CRIT("packet history - out of memory!"); | 174 | DCCP_CRIT("packet history - out of memory!"); |
243 | /* FIXME: find a more graceful way to bail out */ | 175 | /* FIXME: find a more graceful way to bail out */ |
244 | return; | 176 | return; |
245 | } | 177 | } |
246 | next = hctx->ccid2hctx_seqh->ccid2s_next; | 178 | next = hctx->seqh->ccid2s_next; |
247 | BUG_ON(next == hctx->ccid2hctx_seqt); | 179 | BUG_ON(next == hctx->seqt); |
248 | } | 180 | } |
249 | hctx->ccid2hctx_seqh = next; | 181 | hctx->seqh = next; |
250 | 182 | ||
251 | ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, | 183 | ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); |
252 | hctx->ccid2hctx_pipe); | ||
253 | 184 | ||
254 | /* | 185 | /* |
255 | * FIXME: The code below is broken and the variables have been removed | 186 | * FIXME: The code below is broken and the variables have been removed |
@@ -272,12 +203,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) | |||
272 | */ | 203 | */ |
273 | #if 0 | 204 | #if 0 |
274 | /* Ack Ratio. Need to maintain a concept of how many windows we sent */ | 205 | /* Ack Ratio. Need to maintain a concept of how many windows we sent */ |
275 | hctx->ccid2hctx_arsent++; | 206 | hctx->arsent++; |
276 | /* We had an ack loss in this window... */ | 207 | /* We had an ack loss in this window... */ |
277 | if (hctx->ccid2hctx_ackloss) { | 208 | if (hctx->ackloss) { |
278 | if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { | 209 | if (hctx->arsent >= hctx->cwnd) { |
279 | hctx->ccid2hctx_arsent = 0; | 210 | hctx->arsent = 0; |
280 | hctx->ccid2hctx_ackloss = 0; | 211 | hctx->ackloss = 0; |
281 | } | 212 | } |
282 | } else { | 213 | } else { |
283 | /* No acks lost up to now... */ | 214 | /* No acks lost up to now... */ |
@@ -287,28 +218,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) | |||
287 | int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - | 218 | int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - |
288 | dp->dccps_l_ack_ratio; | 219 | dp->dccps_l_ack_ratio; |
289 | 220 | ||
290 | denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; | 221 | denom = hctx->cwnd * hctx->cwnd / denom; |
291 | 222 | ||
292 | if (hctx->ccid2hctx_arsent >= denom) { | 223 | if (hctx->arsent >= denom) { |
293 | ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); | 224 | ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); |
294 | hctx->ccid2hctx_arsent = 0; | 225 | hctx->arsent = 0; |
295 | } | 226 | } |
296 | } else { | 227 | } else { |
297 | /* we can't increase ack ratio further [1] */ | 228 | /* we can't increase ack ratio further [1] */ |
298 | hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ | 229 | hctx->arsent = 0; /* or maybe set it to cwnd*/ |
299 | } | 230 | } |
300 | } | 231 | } |
301 | #endif | 232 | #endif |
302 | 233 | ||
303 | /* setup RTO timer */ | 234 | /* setup RTO timer */ |
304 | if (!timer_pending(&hctx->ccid2hctx_rtotimer)) | 235 | if (!timer_pending(&hctx->rtotimer)) |
305 | ccid2_start_rto_timer(sk); | 236 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); |
306 | 237 | ||
307 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 238 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
308 | do { | 239 | do { |
309 | struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; | 240 | struct ccid2_seq *seqp = hctx->seqt; |
310 | 241 | ||
311 | while (seqp != hctx->ccid2hctx_seqh) { | 242 | while (seqp != hctx->seqh) { |
312 | ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", | 243 | ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", |
313 | (unsigned long long)seqp->ccid2s_seq, | 244 | (unsigned long long)seqp->ccid2s_seq, |
314 | seqp->ccid2s_acked, seqp->ccid2s_sent); | 245 | seqp->ccid2s_acked, seqp->ccid2s_sent); |
@@ -316,205 +247,158 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) | |||
316 | } | 247 | } |
317 | } while (0); | 248 | } while (0); |
318 | ccid2_pr_debug("=========\n"); | 249 | ccid2_pr_debug("=========\n"); |
319 | ccid2_hc_tx_check_sanity(hctx); | ||
320 | #endif | 250 | #endif |
321 | } | 251 | } |
322 | 252 | ||
323 | /* XXX Lame code duplication! | 253 | /** |
324 | * returns -1 if none was found. | 254 | * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm |
325 | * else returns the next offset to use in the function call. | 255 | * This code is almost identical with TCP's tcp_rtt_estimator(), since |
256 | * - it has a higher sampling frequency (recommended by RFC 1323), | ||
257 | * - the RTO does not collapse into RTT due to RTTVAR going towards zero, | ||
258 | * - it is simple (cf. more complex proposals such as Eifel timer or research | ||
259 | * which suggests that the gain should be set according to window size), | ||
260 | * - in tests it was found to work well with CCID2 [gerrit]. | ||
326 | */ | 261 | */ |
327 | static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, | 262 | static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) |
328 | unsigned char **vec, unsigned char *veclen) | ||
329 | { | 263 | { |
330 | const struct dccp_hdr *dh = dccp_hdr(skb); | 264 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
331 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); | 265 | long m = mrtt ? : 1; |
332 | unsigned char *opt_ptr; | 266 | |
333 | const unsigned char *opt_end = (unsigned char *)dh + | 267 | if (hctx->srtt == 0) { |
334 | (dh->dccph_doff * 4); | 268 | /* First measurement m */ |
335 | unsigned char opt, len; | 269 | hctx->srtt = m << 3; |
336 | unsigned char *value; | 270 | hctx->mdev = m << 1; |
337 | 271 | ||
338 | BUG_ON(offset < 0); | 272 | hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); |
339 | options += offset; | 273 | hctx->rttvar = hctx->mdev_max; |
340 | opt_ptr = options; | 274 | hctx->rtt_seq = dccp_sk(sk)->dccps_gss; |
341 | if (opt_ptr >= opt_end) | 275 | } else { |
342 | return -1; | 276 | /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ |
343 | 277 | m -= (hctx->srtt >> 3); | |
344 | while (opt_ptr != opt_end) { | 278 | hctx->srtt += m; |
345 | opt = *opt_ptr++; | 279 | |
346 | len = 0; | 280 | /* Similarly, update scaled mdev with regard to |m| */ |
347 | value = NULL; | 281 | if (m < 0) { |
348 | 282 | m = -m; | |
349 | /* Check if this isn't a single byte option */ | 283 | m -= (hctx->mdev >> 2); |
350 | if (opt > DCCPO_MAX_RESERVED) { | ||
351 | if (opt_ptr == opt_end) | ||
352 | goto out_invalid_option; | ||
353 | |||
354 | len = *opt_ptr++; | ||
355 | if (len < 3) | ||
356 | goto out_invalid_option; | ||
357 | /* | 284 | /* |
358 | * Remove the type and len fields, leaving | 285 | * This neutralises RTO increase when RTT < SRTT - mdev |
359 | * just the value size | 286 | * (see P. Sarolahti, A. Kuznetsov,"Congestion Control |
287 | * in Linux TCP", USENIX 2002, pp. 49-62). | ||
360 | */ | 288 | */ |
361 | len -= 2; | 289 | if (m > 0) |
362 | value = opt_ptr; | 290 | m >>= 3; |
363 | opt_ptr += len; | 291 | } else { |
292 | m -= (hctx->mdev >> 2); | ||
293 | } | ||
294 | hctx->mdev += m; | ||
364 | 295 | ||
365 | if (opt_ptr > opt_end) | 296 | if (hctx->mdev > hctx->mdev_max) { |
366 | goto out_invalid_option; | 297 | hctx->mdev_max = hctx->mdev; |
298 | if (hctx->mdev_max > hctx->rttvar) | ||
299 | hctx->rttvar = hctx->mdev_max; | ||
367 | } | 300 | } |
368 | 301 | ||
369 | switch (opt) { | 302 | /* |
370 | case DCCPO_ACK_VECTOR_0: | 303 | * Decay RTTVAR at most once per flight, exploiting that |
371 | case DCCPO_ACK_VECTOR_1: | 304 | * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) |
372 | *vec = value; | 305 | * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) |
373 | *veclen = len; | 306 | * GAR is a useful bound for FlightSize = pipe, AWL is probably |
374 | return offset + (opt_ptr - options); | 307 | * too low as it over-estimates pipe. |
308 | */ | ||
309 | if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { | ||
310 | if (hctx->mdev_max < hctx->rttvar) | ||
311 | hctx->rttvar -= (hctx->rttvar - | ||
312 | hctx->mdev_max) >> 2; | ||
313 | hctx->rtt_seq = dccp_sk(sk)->dccps_gss; | ||
314 | hctx->mdev_max = TCP_RTO_MIN; | ||
375 | } | 315 | } |
376 | } | 316 | } |
377 | 317 | ||
378 | return -1; | 318 | /* |
379 | 319 | * Set RTO from SRTT and RTTVAR | |
380 | out_invalid_option: | 320 | * Clock granularity is ignored since the minimum error for RTTVAR is |
381 | DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); | 321 | * clamped to 50msec (corresponding to HZ=20). This leads to a minimum |
382 | return -1; | 322 | * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP |
383 | } | 323 | * does not retransmit data, DCCP does not require TCP's recommended |
384 | 324 | * minimum timeout of one second". | |
385 | static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) | 325 | */ |
386 | { | 326 | hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; |
387 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
388 | 327 | ||
389 | sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); | 328 | if (hctx->rto > DCCP_RTO_MAX) |
390 | ccid2_pr_debug("deleted RTO timer\n"); | 329 | hctx->rto = DCCP_RTO_MAX; |
391 | } | 330 | } |
392 | 331 | ||
393 | static inline void ccid2_new_ack(struct sock *sk, | 332 | static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, |
394 | struct ccid2_seq *seqp, | 333 | unsigned int *maxincr) |
395 | unsigned int *maxincr) | ||
396 | { | 334 | { |
397 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 335 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
398 | 336 | ||
399 | if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { | 337 | if (hctx->cwnd < hctx->ssthresh) { |
400 | if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { | 338 | if (*maxincr > 0 && ++hctx->packets_acked == 2) { |
401 | hctx->ccid2hctx_cwnd += 1; | 339 | hctx->cwnd += 1; |
402 | *maxincr -= 1; | 340 | *maxincr -= 1; |
403 | hctx->ccid2hctx_packets_acked = 0; | 341 | hctx->packets_acked = 0; |
404 | } | 342 | } |
405 | } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { | 343 | } else if (++hctx->packets_acked >= hctx->cwnd) { |
406 | hctx->ccid2hctx_cwnd += 1; | 344 | hctx->cwnd += 1; |
407 | hctx->ccid2hctx_packets_acked = 0; | 345 | hctx->packets_acked = 0; |
408 | } | 346 | } |
409 | 347 | /* | |
410 | /* update RTO */ | 348 | * FIXME: RTT is sampled several times per acknowledgment (for each |
411 | if (hctx->ccid2hctx_srtt == -1 || | 349 | * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). |
412 | time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { | 350 | * This causes the RTT to be over-estimated, since the older entries |
413 | unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; | 351 | * in the Ack Vector have earlier sending times. |
414 | int s; | 352 | * The cleanest solution is to not use the ccid2s_sent field at all |
415 | 353 | * and instead use DCCP timestamps - need to be resolved at some time. | |
416 | /* first measurement */ | 354 | */ |
417 | if (hctx->ccid2hctx_srtt == -1) { | 355 | ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); |
418 | ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", | ||
419 | r, jiffies, | ||
420 | (unsigned long long)seqp->ccid2s_seq); | ||
421 | ccid2_change_srtt(hctx, r); | ||
422 | hctx->ccid2hctx_rttvar = r >> 1; | ||
423 | } else { | ||
424 | /* RTTVAR */ | ||
425 | long tmp = hctx->ccid2hctx_srtt - r; | ||
426 | long srtt; | ||
427 | |||
428 | if (tmp < 0) | ||
429 | tmp *= -1; | ||
430 | |||
431 | tmp >>= 2; | ||
432 | hctx->ccid2hctx_rttvar *= 3; | ||
433 | hctx->ccid2hctx_rttvar >>= 2; | ||
434 | hctx->ccid2hctx_rttvar += tmp; | ||
435 | |||
436 | /* SRTT */ | ||
437 | srtt = hctx->ccid2hctx_srtt; | ||
438 | srtt *= 7; | ||
439 | srtt >>= 3; | ||
440 | tmp = r >> 3; | ||
441 | srtt += tmp; | ||
442 | ccid2_change_srtt(hctx, srtt); | ||
443 | } | ||
444 | s = hctx->ccid2hctx_rttvar << 2; | ||
445 | /* clock granularity is 1 when based on jiffies */ | ||
446 | if (!s) | ||
447 | s = 1; | ||
448 | hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; | ||
449 | |||
450 | /* must be at least a second */ | ||
451 | s = hctx->ccid2hctx_rto / HZ; | ||
452 | /* DCCP doesn't require this [but I like it cuz my code sux] */ | ||
453 | #if 1 | ||
454 | if (s < 1) | ||
455 | hctx->ccid2hctx_rto = HZ; | ||
456 | #endif | ||
457 | /* max 60 seconds */ | ||
458 | if (s > 60) | ||
459 | hctx->ccid2hctx_rto = HZ * 60; | ||
460 | |||
461 | hctx->ccid2hctx_lastrtt = jiffies; | ||
462 | |||
463 | ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", | ||
464 | hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, | ||
465 | hctx->ccid2hctx_rto, HZ, r); | ||
466 | } | ||
467 | |||
468 | /* we got a new ack, so re-start RTO timer */ | ||
469 | ccid2_hc_tx_kill_rto_timer(sk); | ||
470 | ccid2_start_rto_timer(sk); | ||
471 | } | ||
472 | |||
473 | static void ccid2_hc_tx_dec_pipe(struct sock *sk) | ||
474 | { | ||
475 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
476 | |||
477 | if (hctx->ccid2hctx_pipe == 0) | ||
478 | DCCP_BUG("pipe == 0"); | ||
479 | else | ||
480 | hctx->ccid2hctx_pipe--; | ||
481 | |||
482 | if (hctx->ccid2hctx_pipe == 0) | ||
483 | ccid2_hc_tx_kill_rto_timer(sk); | ||
484 | } | 356 | } |
485 | 357 | ||
486 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) | 358 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) |
487 | { | 359 | { |
488 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 360 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
489 | 361 | ||
490 | if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { | 362 | if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { |
491 | ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); | 363 | ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); |
492 | return; | 364 | return; |
493 | } | 365 | } |
494 | 366 | ||
495 | hctx->ccid2hctx_last_cong = jiffies; | 367 | hctx->last_cong = jiffies; |
496 | 368 | ||
497 | hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; | 369 | hctx->cwnd = hctx->cwnd / 2 ? : 1U; |
498 | hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); | 370 | hctx->ssthresh = max(hctx->cwnd, 2U); |
499 | 371 | ||
500 | /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ | 372 | /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ |
501 | if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) | 373 | if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) |
502 | ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); | 374 | ccid2_change_l_ack_ratio(sk, hctx->cwnd); |
375 | } | ||
376 | |||
377 | static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, | ||
378 | u8 option, u8 *optval, u8 optlen) | ||
379 | { | ||
380 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
381 | |||
382 | switch (option) { | ||
383 | case DCCPO_ACK_VECTOR_0: | ||
384 | case DCCPO_ACK_VECTOR_1: | ||
385 | return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, | ||
386 | option - DCCPO_ACK_VECTOR_0); | ||
387 | } | ||
388 | return 0; | ||
503 | } | 389 | } |
504 | 390 | ||
505 | static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | 391 | static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) |
506 | { | 392 | { |
507 | struct dccp_sock *dp = dccp_sk(sk); | 393 | struct dccp_sock *dp = dccp_sk(sk); |
508 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 394 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
395 | const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); | ||
396 | struct dccp_ackvec_parsed *avp; | ||
509 | u64 ackno, seqno; | 397 | u64 ackno, seqno; |
510 | struct ccid2_seq *seqp; | 398 | struct ccid2_seq *seqp; |
511 | unsigned char *vector; | ||
512 | unsigned char veclen; | ||
513 | int offset = 0; | ||
514 | int done = 0; | 399 | int done = 0; |
515 | unsigned int maxincr = 0; | 400 | unsigned int maxincr = 0; |
516 | 401 | ||
517 | ccid2_hc_tx_check_sanity(hctx); | ||
518 | /* check reverse path congestion */ | 402 | /* check reverse path congestion */ |
519 | seqno = DCCP_SKB_CB(skb)->dccpd_seq; | 403 | seqno = DCCP_SKB_CB(skb)->dccpd_seq; |
520 | 404 | ||
@@ -523,21 +407,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
523 | * -sorbo. | 407 | * -sorbo. |
524 | */ | 408 | */ |
525 | /* need to bootstrap */ | 409 | /* need to bootstrap */ |
526 | if (hctx->ccid2hctx_rpdupack == -1) { | 410 | if (hctx->rpdupack == -1) { |
527 | hctx->ccid2hctx_rpdupack = 0; | 411 | hctx->rpdupack = 0; |
528 | hctx->ccid2hctx_rpseq = seqno; | 412 | hctx->rpseq = seqno; |
529 | } else { | 413 | } else { |
530 | /* check if packet is consecutive */ | 414 | /* check if packet is consecutive */ |
531 | if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) | 415 | if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) |
532 | hctx->ccid2hctx_rpseq = seqno; | 416 | hctx->rpseq = seqno; |
533 | /* it's a later packet */ | 417 | /* it's a later packet */ |
534 | else if (after48(seqno, hctx->ccid2hctx_rpseq)) { | 418 | else if (after48(seqno, hctx->rpseq)) { |
535 | hctx->ccid2hctx_rpdupack++; | 419 | hctx->rpdupack++; |
536 | 420 | ||
537 | /* check if we got enough dupacks */ | 421 | /* check if we got enough dupacks */ |
538 | if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { | 422 | if (hctx->rpdupack >= NUMDUPACK) { |
539 | hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ | 423 | hctx->rpdupack = -1; /* XXX lame */ |
540 | hctx->ccid2hctx_rpseq = 0; | 424 | hctx->rpseq = 0; |
541 | 425 | ||
542 | ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); | 426 | ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); |
543 | } | 427 | } |
@@ -545,27 +429,22 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
545 | } | 429 | } |
546 | 430 | ||
547 | /* check forward path congestion */ | 431 | /* check forward path congestion */ |
548 | /* still didn't send out new data packets */ | 432 | if (dccp_packet_without_ack(skb)) |
549 | if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) | ||
550 | return; | 433 | return; |
551 | 434 | ||
552 | switch (DCCP_SKB_CB(skb)->dccpd_type) { | 435 | /* still didn't send out new data packets */ |
553 | case DCCP_PKT_ACK: | 436 | if (hctx->seqh == hctx->seqt) |
554 | case DCCP_PKT_DATAACK: | 437 | goto done; |
555 | break; | ||
556 | default: | ||
557 | return; | ||
558 | } | ||
559 | 438 | ||
560 | ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; | 439 | ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; |
561 | if (after48(ackno, hctx->ccid2hctx_high_ack)) | 440 | if (after48(ackno, hctx->high_ack)) |
562 | hctx->ccid2hctx_high_ack = ackno; | 441 | hctx->high_ack = ackno; |
563 | 442 | ||
564 | seqp = hctx->ccid2hctx_seqt; | 443 | seqp = hctx->seqt; |
565 | while (before48(seqp->ccid2s_seq, ackno)) { | 444 | while (before48(seqp->ccid2s_seq, ackno)) { |
566 | seqp = seqp->ccid2s_next; | 445 | seqp = seqp->ccid2s_next; |
567 | if (seqp == hctx->ccid2hctx_seqh) { | 446 | if (seqp == hctx->seqh) { |
568 | seqp = hctx->ccid2hctx_seqh->ccid2s_prev; | 447 | seqp = hctx->seqh->ccid2s_prev; |
569 | break; | 448 | break; |
570 | } | 449 | } |
571 | } | 450 | } |
@@ -575,26 +454,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
575 | * packets per acknowledgement. Rounding up avoids that cwnd is not | 454 | * packets per acknowledgement. Rounding up avoids that cwnd is not |
576 | * advanced when Ack Ratio is 1 and gives a slight edge otherwise. | 455 | * advanced when Ack Ratio is 1 and gives a slight edge otherwise. |
577 | */ | 456 | */ |
578 | if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) | 457 | if (hctx->cwnd < hctx->ssthresh) |
579 | maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); | 458 | maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); |
580 | 459 | ||
581 | /* go through all ack vectors */ | 460 | /* go through all ack vectors */ |
582 | while ((offset = ccid2_ackvector(sk, skb, offset, | 461 | list_for_each_entry(avp, &hctx->av_chunks, node) { |
583 | &vector, &veclen)) != -1) { | ||
584 | /* go through this ack vector */ | 462 | /* go through this ack vector */ |
585 | while (veclen--) { | 463 | for (; avp->len--; avp->vec++) { |
586 | const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; | 464 | u64 ackno_end_rl = SUB48(ackno, |
587 | u64 ackno_end_rl = SUB48(ackno, rl); | 465 | dccp_ackvec_runlen(avp->vec)); |
588 | 466 | ||
589 | ccid2_pr_debug("ackvec start:%llu end:%llu\n", | 467 | ccid2_pr_debug("ackvec %llu |%u,%u|\n", |
590 | (unsigned long long)ackno, | 468 | (unsigned long long)ackno, |
591 | (unsigned long long)ackno_end_rl); | 469 | dccp_ackvec_state(avp->vec) >> 6, |
470 | dccp_ackvec_runlen(avp->vec)); | ||
592 | /* if the seqno we are analyzing is larger than the | 471 | /* if the seqno we are analyzing is larger than the |
593 | * current ackno, then move towards the tail of our | 472 | * current ackno, then move towards the tail of our |
594 | * seqnos. | 473 | * seqnos. |
595 | */ | 474 | */ |
596 | while (after48(seqp->ccid2s_seq, ackno)) { | 475 | while (after48(seqp->ccid2s_seq, ackno)) { |
597 | if (seqp == hctx->ccid2hctx_seqt) { | 476 | if (seqp == hctx->seqt) { |
598 | done = 1; | 477 | done = 1; |
599 | break; | 478 | break; |
600 | } | 479 | } |
@@ -607,26 +486,24 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
607 | * run length | 486 | * run length |
608 | */ | 487 | */ |
609 | while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { | 488 | while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { |
610 | const u8 state = *vector & | 489 | const u8 state = dccp_ackvec_state(avp->vec); |
611 | DCCP_ACKVEC_STATE_MASK; | ||
612 | 490 | ||
613 | /* new packet received or marked */ | 491 | /* new packet received or marked */ |
614 | if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && | 492 | if (state != DCCPAV_NOT_RECEIVED && |
615 | !seqp->ccid2s_acked) { | 493 | !seqp->ccid2s_acked) { |
616 | if (state == | 494 | if (state == DCCPAV_ECN_MARKED) |
617 | DCCP_ACKVEC_STATE_ECN_MARKED) { | ||
618 | ccid2_congestion_event(sk, | 495 | ccid2_congestion_event(sk, |
619 | seqp); | 496 | seqp); |
620 | } else | 497 | else |
621 | ccid2_new_ack(sk, seqp, | 498 | ccid2_new_ack(sk, seqp, |
622 | &maxincr); | 499 | &maxincr); |
623 | 500 | ||
624 | seqp->ccid2s_acked = 1; | 501 | seqp->ccid2s_acked = 1; |
625 | ccid2_pr_debug("Got ack for %llu\n", | 502 | ccid2_pr_debug("Got ack for %llu\n", |
626 | (unsigned long long)seqp->ccid2s_seq); | 503 | (unsigned long long)seqp->ccid2s_seq); |
627 | ccid2_hc_tx_dec_pipe(sk); | 504 | hctx->pipe--; |
628 | } | 505 | } |
629 | if (seqp == hctx->ccid2hctx_seqt) { | 506 | if (seqp == hctx->seqt) { |
630 | done = 1; | 507 | done = 1; |
631 | break; | 508 | break; |
632 | } | 509 | } |
@@ -636,7 +513,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
636 | break; | 513 | break; |
637 | 514 | ||
638 | ackno = SUB48(ackno_end_rl, 1); | 515 | ackno = SUB48(ackno_end_rl, 1); |
639 | vector++; | ||
640 | } | 516 | } |
641 | if (done) | 517 | if (done) |
642 | break; | 518 | break; |
@@ -645,11 +521,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
645 | /* The state about what is acked should be correct now | 521 | /* The state about what is acked should be correct now |
646 | * Check for NUMDUPACK | 522 | * Check for NUMDUPACK |
647 | */ | 523 | */ |
648 | seqp = hctx->ccid2hctx_seqt; | 524 | seqp = hctx->seqt; |
649 | while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { | 525 | while (before48(seqp->ccid2s_seq, hctx->high_ack)) { |
650 | seqp = seqp->ccid2s_next; | 526 | seqp = seqp->ccid2s_next; |
651 | if (seqp == hctx->ccid2hctx_seqh) { | 527 | if (seqp == hctx->seqh) { |
652 | seqp = hctx->ccid2hctx_seqh->ccid2s_prev; | 528 | seqp = hctx->seqh->ccid2s_prev; |
653 | break; | 529 | break; |
654 | } | 530 | } |
655 | } | 531 | } |
@@ -660,7 +536,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
660 | if (done == NUMDUPACK) | 536 | if (done == NUMDUPACK) |
661 | break; | 537 | break; |
662 | } | 538 | } |
663 | if (seqp == hctx->ccid2hctx_seqt) | 539 | if (seqp == hctx->seqt) |
664 | break; | 540 | break; |
665 | seqp = seqp->ccid2s_prev; | 541 | seqp = seqp->ccid2s_prev; |
666 | } | 542 | } |
@@ -681,25 +557,34 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
681 | * one ack vector. | 557 | * one ack vector. |
682 | */ | 558 | */ |
683 | ccid2_congestion_event(sk, seqp); | 559 | ccid2_congestion_event(sk, seqp); |
684 | ccid2_hc_tx_dec_pipe(sk); | 560 | hctx->pipe--; |
685 | } | 561 | } |
686 | if (seqp == hctx->ccid2hctx_seqt) | 562 | if (seqp == hctx->seqt) |
687 | break; | 563 | break; |
688 | seqp = seqp->ccid2s_prev; | 564 | seqp = seqp->ccid2s_prev; |
689 | } | 565 | } |
690 | 566 | ||
691 | hctx->ccid2hctx_seqt = last_acked; | 567 | hctx->seqt = last_acked; |
692 | } | 568 | } |
693 | 569 | ||
694 | /* trim acked packets in tail */ | 570 | /* trim acked packets in tail */ |
695 | while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { | 571 | while (hctx->seqt != hctx->seqh) { |
696 | if (!hctx->ccid2hctx_seqt->ccid2s_acked) | 572 | if (!hctx->seqt->ccid2s_acked) |
697 | break; | 573 | break; |
698 | 574 | ||
699 | hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; | 575 | hctx->seqt = hctx->seqt->ccid2s_next; |
700 | } | 576 | } |
701 | 577 | ||
702 | ccid2_hc_tx_check_sanity(hctx); | 578 | /* restart RTO timer if not all outstanding data has been acked */ |
579 | if (hctx->pipe == 0) | ||
580 | sk_stop_timer(sk, &hctx->rtotimer); | ||
581 | else | ||
582 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); | ||
583 | done: | ||
584 | /* check if incoming Acks allow pending packets to be sent */ | ||
585 | if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) | ||
586 | tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); | ||
587 | dccp_ackvec_parsed_cleanup(&hctx->av_chunks); | ||
703 | } | 588 | } |
704 | 589 | ||
705 | static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | 590 | static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) |
@@ -709,17 +594,13 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | |||
709 | u32 max_ratio; | 594 | u32 max_ratio; |
710 | 595 | ||
711 | /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ | 596 | /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ |
712 | hctx->ccid2hctx_ssthresh = ~0U; | 597 | hctx->ssthresh = ~0U; |
713 | 598 | ||
714 | /* | 599 | /* Use larger initial windows (RFC 3390, rfc2581bis) */ |
715 | * RFC 4341, 5: "The cwnd parameter is initialized to at most four | 600 | hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); |
716 | * packets for new connections, following the rules from [RFC3390]". | ||
717 | * We need to convert the bytes of RFC3390 into the packets of RFC 4341. | ||
718 | */ | ||
719 | hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); | ||
720 | 601 | ||
721 | /* Make sure that Ack Ratio is enabled and within bounds. */ | 602 | /* Make sure that Ack Ratio is enabled and within bounds. */ |
722 | max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); | 603 | max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); |
723 | if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) | 604 | if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) |
724 | dp->dccps_l_ack_ratio = max_ratio; | 605 | dp->dccps_l_ack_ratio = max_ratio; |
725 | 606 | ||
@@ -727,15 +608,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | |||
727 | if (ccid2_hc_tx_alloc_seq(hctx)) | 608 | if (ccid2_hc_tx_alloc_seq(hctx)) |
728 | return -ENOMEM; | 609 | return -ENOMEM; |
729 | 610 | ||
730 | hctx->ccid2hctx_rto = 3 * HZ; | 611 | hctx->rto = DCCP_TIMEOUT_INIT; |
731 | ccid2_change_srtt(hctx, -1); | 612 | hctx->rpdupack = -1; |
732 | hctx->ccid2hctx_rttvar = -1; | 613 | hctx->last_cong = jiffies; |
733 | hctx->ccid2hctx_rpdupack = -1; | 614 | setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); |
734 | hctx->ccid2hctx_last_cong = jiffies; | 615 | INIT_LIST_HEAD(&hctx->av_chunks); |
735 | setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, | ||
736 | (unsigned long)sk); | ||
737 | |||
738 | ccid2_hc_tx_check_sanity(hctx); | ||
739 | return 0; | 616 | return 0; |
740 | } | 617 | } |
741 | 618 | ||
@@ -744,11 +621,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) | |||
744 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 621 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
745 | int i; | 622 | int i; |
746 | 623 | ||
747 | ccid2_hc_tx_kill_rto_timer(sk); | 624 | sk_stop_timer(sk, &hctx->rtotimer); |
748 | 625 | ||
749 | for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) | 626 | for (i = 0; i < hctx->seqbufc; i++) |
750 | kfree(hctx->ccid2hctx_seqbuf[i]); | 627 | kfree(hctx->seqbuf[i]); |
751 | hctx->ccid2hctx_seqbufc = 0; | 628 | hctx->seqbufc = 0; |
752 | } | 629 | } |
753 | 630 | ||
754 | static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | 631 | static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) |
@@ -759,27 +636,28 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
759 | switch (DCCP_SKB_CB(skb)->dccpd_type) { | 636 | switch (DCCP_SKB_CB(skb)->dccpd_type) { |
760 | case DCCP_PKT_DATA: | 637 | case DCCP_PKT_DATA: |
761 | case DCCP_PKT_DATAACK: | 638 | case DCCP_PKT_DATAACK: |
762 | hcrx->ccid2hcrx_data++; | 639 | hcrx->data++; |
763 | if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { | 640 | if (hcrx->data >= dp->dccps_r_ack_ratio) { |
764 | dccp_send_ack(sk); | 641 | dccp_send_ack(sk); |
765 | hcrx->ccid2hcrx_data = 0; | 642 | hcrx->data = 0; |
766 | } | 643 | } |
767 | break; | 644 | break; |
768 | } | 645 | } |
769 | } | 646 | } |
770 | 647 | ||
771 | static struct ccid_operations ccid2 = { | 648 | static struct ccid_operations ccid2 = { |
772 | .ccid_id = DCCPC_CCID2, | 649 | .ccid_id = DCCPC_CCID2, |
773 | .ccid_name = "TCP-like", | 650 | .ccid_name = "TCP-like", |
774 | .ccid_owner = THIS_MODULE, | 651 | .ccid_owner = THIS_MODULE, |
775 | .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), | 652 | .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), |
776 | .ccid_hc_tx_init = ccid2_hc_tx_init, | 653 | .ccid_hc_tx_init = ccid2_hc_tx_init, |
777 | .ccid_hc_tx_exit = ccid2_hc_tx_exit, | 654 | .ccid_hc_tx_exit = ccid2_hc_tx_exit, |
778 | .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, | 655 | .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, |
779 | .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, | 656 | .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, |
780 | .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, | 657 | .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, |
781 | .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), | 658 | .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, |
782 | .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, | 659 | .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), |
660 | .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, | ||
783 | }; | 661 | }; |
784 | 662 | ||
785 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 663 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 2c94ca029010..8b7a2dee2f6d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h | |||
@@ -42,34 +42,49 @@ struct ccid2_seq { | |||
42 | 42 | ||
43 | /** struct ccid2_hc_tx_sock - CCID2 TX half connection | 43 | /** struct ccid2_hc_tx_sock - CCID2 TX half connection |
44 | * | 44 | * |
45 | * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 | 45 | * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 |
46 | * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) | 46 | * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) |
47 | * @ccid2hctx_lastrtt -time RTT was last measured | 47 | * @srtt: smoothed RTT estimate, scaled by 2^3 |
48 | * @ccid2hctx_rpseq - last consecutive seqno | 48 | * @mdev: smoothed RTT variation, scaled by 2^2 |
49 | * @ccid2hctx_rpdupack - dupacks since rpseq | 49 | * @mdev_max: maximum of @mdev during one flight |
50 | */ | 50 | * @rttvar: moving average/maximum of @mdev_max |
51 | * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) | ||
52 | * @rtt_seq: to decay RTTVAR at most once per flight | ||
53 | * @rpseq: last consecutive seqno | ||
54 | * @rpdupack: dupacks since rpseq | ||
55 | * @av_chunks: list of Ack Vectors received on current skb | ||
56 | */ | ||
51 | struct ccid2_hc_tx_sock { | 57 | struct ccid2_hc_tx_sock { |
52 | u32 ccid2hctx_cwnd; | 58 | u32 cwnd; |
53 | u32 ccid2hctx_ssthresh; | 59 | u32 ssthresh; |
54 | u32 ccid2hctx_pipe; | 60 | u32 pipe; |
55 | u32 ccid2hctx_packets_acked; | 61 | u32 packets_acked; |
56 | struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; | 62 | struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; |
57 | int ccid2hctx_seqbufc; | 63 | int seqbufc; |
58 | struct ccid2_seq *ccid2hctx_seqh; | 64 | struct ccid2_seq *seqh; |
59 | struct ccid2_seq *ccid2hctx_seqt; | 65 | struct ccid2_seq *seqt; |
60 | long ccid2hctx_rto; | 66 | /* RTT measurement: variables/principles are the same as in TCP */ |
61 | long ccid2hctx_srtt; | 67 | u32 srtt, |
62 | long ccid2hctx_rttvar; | 68 | mdev, |
63 | unsigned long ccid2hctx_lastrtt; | 69 | mdev_max, |
64 | struct timer_list ccid2hctx_rtotimer; | 70 | rttvar, |
65 | u64 ccid2hctx_rpseq; | 71 | rto; |
66 | int ccid2hctx_rpdupack; | 72 | u64 rtt_seq:48; |
67 | unsigned long ccid2hctx_last_cong; | 73 | struct timer_list rtotimer; |
68 | u64 ccid2hctx_high_ack; | 74 | u64 rpseq; |
75 | int rpdupack; | ||
76 | unsigned long last_cong; | ||
77 | u64 high_ack; | ||
78 | struct list_head av_chunks; | ||
69 | }; | 79 | }; |
70 | 80 | ||
81 | static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) | ||
82 | { | ||
83 | return (hctx->pipe >= hctx->cwnd); | ||
84 | } | ||
85 | |||
71 | struct ccid2_hc_rx_sock { | 86 | struct ccid2_hc_rx_sock { |
72 | int ccid2hcrx_data; | 87 | int data; |
73 | }; | 88 | }; |
74 | 89 | ||
75 | static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) | 90 | static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) |
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3b8bd7ca6761..06cfdad84a6a 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c | |||
@@ -49,75 +49,41 @@ static int ccid3_debug; | |||
49 | /* | 49 | /* |
50 | * Transmitter Half-Connection Routines | 50 | * Transmitter Half-Connection Routines |
51 | */ | 51 | */ |
52 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG | 52 | /* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ |
53 | static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) | 53 | static int do_osc_prev = true; |
54 | { | ||
55 | static char *ccid3_state_names[] = { | ||
56 | [TFRC_SSTATE_NO_SENT] = "NO_SENT", | ||
57 | [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", | ||
58 | [TFRC_SSTATE_FBACK] = "FBACK", | ||
59 | [TFRC_SSTATE_TERM] = "TERM", | ||
60 | }; | ||
61 | |||
62 | return ccid3_state_names[state]; | ||
63 | } | ||
64 | #endif | ||
65 | |||
66 | static void ccid3_hc_tx_set_state(struct sock *sk, | ||
67 | enum ccid3_hc_tx_states state) | ||
68 | { | ||
69 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | ||
70 | enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; | ||
71 | |||
72 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
73 | dccp_role(sk), sk, ccid3_tx_state_name(oldstate), | ||
74 | ccid3_tx_state_name(state)); | ||
75 | WARN_ON(state == oldstate); | ||
76 | hctx->ccid3hctx_state = state; | ||
77 | } | ||
78 | 54 | ||
79 | /* | 55 | /* |
80 | * Compute the initial sending rate X_init in the manner of RFC 3390: | 56 | * Compute the initial sending rate X_init in the manner of RFC 3390: |
81 | * | 57 | * |
82 | * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT | 58 | * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT |
83 | * | 59 | * |
84 | * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis | ||
85 | * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. | ||
86 | * For consistency with other parts of the code, X_init is scaled by 2^6. | 60 | * For consistency with other parts of the code, X_init is scaled by 2^6. |
87 | */ | 61 | */ |
88 | static inline u64 rfc3390_initial_rate(struct sock *sk) | 62 | static inline u64 rfc3390_initial_rate(struct sock *sk) |
89 | { | 63 | { |
90 | const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 64 | const u32 mps = dccp_sk(sk)->dccps_mss_cache, |
91 | const __u32 w_init = clamp_t(__u32, 4380U, | 65 | w_init = clamp(4380U, 2 * mps, 4 * mps); |
92 | 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); | ||
93 | 66 | ||
94 | return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); | 67 | return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); |
95 | } | 68 | } |
96 | 69 | ||
97 | /* | 70 | /** |
98 | * Recalculate t_ipi and delta (should be called whenever X changes) | 71 | * ccid3_update_send_interval - Calculate new t_ipi = s / X |
72 | * This respects the granularity of X (64 * bytes/second) and enforces the | ||
73 | * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. | ||
99 | */ | 74 | */ |
100 | static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) | 75 | static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) |
101 | { | 76 | { |
102 | /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ | 77 | if (unlikely(hctx->x <= hctx->s)) |
103 | hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, | 78 | hctx->x = hctx->s; |
104 | hctx->ccid3hctx_x); | 79 | hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); |
105 | |||
106 | /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ | ||
107 | hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, | ||
108 | TFRC_OPSYS_HALF_TIME_GRAN); | ||
109 | |||
110 | ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", | ||
111 | hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, | ||
112 | hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); | ||
113 | |||
114 | } | 80 | } |
115 | 81 | ||
116 | static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) | 82 | static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) |
117 | { | 83 | { |
118 | u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); | 84 | u32 delta = ktime_us_delta(now, hctx->t_last_win_count); |
119 | 85 | ||
120 | return delta / hctx->ccid3hctx_rtt; | 86 | return delta / hctx->rtt; |
121 | } | 87 | } |
122 | 88 | ||
123 | /** | 89 | /** |
@@ -133,8 +99,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) | |||
133 | static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) | 99 | static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) |
134 | { | 100 | { |
135 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 101 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
136 | __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; | 102 | u64 min_rate = 2 * hctx->x_recv; |
137 | const __u64 old_x = hctx->ccid3hctx_x; | 103 | const u64 old_x = hctx->x; |
138 | ktime_t now = stamp ? *stamp : ktime_get_real(); | 104 | ktime_t now = stamp ? *stamp : ktime_get_real(); |
139 | 105 | ||
140 | /* | 106 | /* |
@@ -145,50 +111,44 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) | |||
145 | */ | 111 | */ |
146 | if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { | 112 | if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { |
147 | min_rate = rfc3390_initial_rate(sk); | 113 | min_rate = rfc3390_initial_rate(sk); |
148 | min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); | 114 | min_rate = max(min_rate, 2 * hctx->x_recv); |
149 | } | 115 | } |
150 | 116 | ||
151 | if (hctx->ccid3hctx_p > 0) { | 117 | if (hctx->p > 0) { |
152 | 118 | ||
153 | hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, | 119 | hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); |
154 | min_rate); | ||
155 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, | ||
156 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
157 | TFRC_T_MBI); | ||
158 | 120 | ||
159 | } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) | 121 | } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { |
160 | - (s64)hctx->ccid3hctx_rtt >= 0) { | ||
161 | 122 | ||
162 | hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); | 123 | hctx->x = min(2 * hctx->x, min_rate); |
163 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, | 124 | hctx->x = max(hctx->x, |
164 | scaled_div(((__u64)hctx->ccid3hctx_s) << 6, | 125 | scaled_div(((u64)hctx->s) << 6, hctx->rtt)); |
165 | hctx->ccid3hctx_rtt)); | 126 | hctx->t_ld = now; |
166 | hctx->ccid3hctx_t_ld = now; | ||
167 | } | 127 | } |
168 | 128 | ||
169 | if (hctx->ccid3hctx_x != old_x) { | 129 | if (hctx->x != old_x) { |
170 | ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " | 130 | ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " |
171 | "X_recv=%u\n", (unsigned)(old_x >> 6), | 131 | "X_recv=%u\n", (unsigned)(old_x >> 6), |
172 | (unsigned)(hctx->ccid3hctx_x >> 6), | 132 | (unsigned)(hctx->x >> 6), hctx->x_calc, |
173 | hctx->ccid3hctx_x_calc, | 133 | (unsigned)(hctx->x_recv >> 6)); |
174 | (unsigned)(hctx->ccid3hctx_x_recv >> 6)); | ||
175 | 134 | ||
176 | ccid3_update_send_interval(hctx); | 135 | ccid3_update_send_interval(hctx); |
177 | } | 136 | } |
178 | } | 137 | } |
179 | 138 | ||
180 | /* | 139 | /* |
181 | * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) | 140 | * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) |
182 | * @len: DCCP packet payload size in bytes | 141 | * @new_len: DCCP payload size in bytes (not used by all methods) |
183 | */ | 142 | */ |
184 | static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) | 143 | static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) |
185 | { | 144 | { |
186 | const u16 old_s = hctx->ccid3hctx_s; | 145 | #if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) |
187 | 146 | return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); | |
188 | hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); | 147 | #elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) |
189 | 148 | return max(ccid3_hc_tx_sk(sk)->s, new_len); | |
190 | if (hctx->ccid3hctx_s != old_s) | 149 | #else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ |
191 | ccid3_update_send_interval(hctx); | 150 | return dccp_sk(sk)->dccps_mss_cache; |
151 | #endif | ||
192 | } | 152 | } |
193 | 153 | ||
194 | /* | 154 | /* |
@@ -198,13 +158,13 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) | |||
198 | static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, | 158 | static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, |
199 | ktime_t now) | 159 | ktime_t now) |
200 | { | 160 | { |
201 | u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), | 161 | u32 delta = ktime_us_delta(now, hctx->t_last_win_count), |
202 | quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; | 162 | quarter_rtts = (4 * delta) / hctx->rtt; |
203 | 163 | ||
204 | if (quarter_rtts > 0) { | 164 | if (quarter_rtts > 0) { |
205 | hctx->ccid3hctx_t_last_win_count = now; | 165 | hctx->t_last_win_count = now; |
206 | hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); | 166 | hctx->last_win_count += min(quarter_rtts, 5U); |
207 | hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ | 167 | hctx->last_win_count &= 0xF; /* mod 16 */ |
208 | } | 168 | } |
209 | } | 169 | } |
210 | 170 | ||
@@ -221,25 +181,26 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
221 | goto restart_timer; | 181 | goto restart_timer; |
222 | } | 182 | } |
223 | 183 | ||
224 | ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, | 184 | ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, |
225 | ccid3_tx_state_name(hctx->ccid3hctx_state)); | 185 | hctx->feedback ? "" : "out"); |
226 | 186 | ||
227 | if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) | 187 | /* Ignore and do not restart after leaving the established state */ |
228 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); | 188 | if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) |
229 | else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) | ||
230 | goto out; | 189 | goto out; |
231 | 190 | ||
191 | /* Reset feedback state to "no feedback received" */ | ||
192 | hctx->feedback = false; | ||
193 | |||
232 | /* | 194 | /* |
233 | * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 | 195 | * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 |
196 | * RTO is 0 if and only if no feedback has been received yet. | ||
234 | */ | 197 | */ |
235 | if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ | 198 | if (hctx->t_rto == 0 || hctx->p == 0) { |
236 | hctx->ccid3hctx_p == 0) { | ||
237 | 199 | ||
238 | /* halve send rate directly */ | 200 | /* halve send rate directly */ |
239 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, | 201 | hctx->x /= 2; |
240 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
241 | TFRC_T_MBI); | ||
242 | ccid3_update_send_interval(hctx); | 202 | ccid3_update_send_interval(hctx); |
203 | |||
243 | } else { | 204 | } else { |
244 | /* | 205 | /* |
245 | * Modify the cached value of X_recv | 206 | * Modify the cached value of X_recv |
@@ -251,44 +212,41 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
251 | * | 212 | * |
252 | * Note that X_recv is scaled by 2^6 while X_calc is not | 213 | * Note that X_recv is scaled by 2^6 while X_calc is not |
253 | */ | 214 | */ |
254 | BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); | 215 | BUG_ON(hctx->p && !hctx->x_calc); |
255 | 216 | ||
256 | if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) | 217 | if (hctx->x_calc > (hctx->x_recv >> 5)) |
257 | hctx->ccid3hctx_x_recv = | 218 | hctx->x_recv /= 2; |
258 | max(hctx->ccid3hctx_x_recv / 2, | ||
259 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
260 | (2 * TFRC_T_MBI)); | ||
261 | else { | 219 | else { |
262 | hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; | 220 | hctx->x_recv = hctx->x_calc; |
263 | hctx->ccid3hctx_x_recv <<= 4; | 221 | hctx->x_recv <<= 4; |
264 | } | 222 | } |
265 | ccid3_hc_tx_update_x(sk, NULL); | 223 | ccid3_hc_tx_update_x(sk, NULL); |
266 | } | 224 | } |
267 | ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", | 225 | ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", |
268 | (unsigned long long)hctx->ccid3hctx_x); | 226 | (unsigned long long)hctx->x); |
269 | 227 | ||
270 | /* | 228 | /* |
271 | * Set new timeout for the nofeedback timer. | 229 | * Set new timeout for the nofeedback timer. |
272 | * See comments in packet_recv() regarding the value of t_RTO. | 230 | * See comments in packet_recv() regarding the value of t_RTO. |
273 | */ | 231 | */ |
274 | if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ | 232 | if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ |
275 | t_nfb = TFRC_INITIAL_TIMEOUT; | 233 | t_nfb = TFRC_INITIAL_TIMEOUT; |
276 | else | 234 | else |
277 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); | 235 | t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); |
278 | 236 | ||
279 | restart_timer: | 237 | restart_timer: |
280 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | 238 | sk_reset_timer(sk, &hctx->no_feedback_timer, |
281 | jiffies + usecs_to_jiffies(t_nfb)); | 239 | jiffies + usecs_to_jiffies(t_nfb)); |
282 | out: | 240 | out: |
283 | bh_unlock_sock(sk); | 241 | bh_unlock_sock(sk); |
284 | sock_put(sk); | 242 | sock_put(sk); |
285 | } | 243 | } |
286 | 244 | ||
287 | /* | 245 | /** |
288 | * returns | 246 | * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets |
289 | * > 0: delay (in msecs) that should pass before actually sending | 247 | * @skb: next packet candidate to send on @sk |
290 | * = 0: can send immediately | 248 | * This function uses the convention of ccid_packet_dequeue_eval() and |
291 | * < 0: error condition; do not send packet | 249 | * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. |
292 | */ | 250 | */ |
293 | static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | 251 | static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) |
294 | { | 252 | { |
@@ -305,18 +263,14 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
305 | if (unlikely(skb->len == 0)) | 263 | if (unlikely(skb->len == 0)) |
306 | return -EBADMSG; | 264 | return -EBADMSG; |
307 | 265 | ||
308 | switch (hctx->ccid3hctx_state) { | 266 | if (hctx->s == 0) { |
309 | case TFRC_SSTATE_NO_SENT: | 267 | sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + |
310 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | ||
311 | (jiffies + | ||
312 | usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); | 268 | usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); |
313 | hctx->ccid3hctx_last_win_count = 0; | 269 | hctx->last_win_count = 0; |
314 | hctx->ccid3hctx_t_last_win_count = now; | 270 | hctx->t_last_win_count = now; |
315 | 271 | ||
316 | /* Set t_0 for initial packet */ | 272 | /* Set t_0 for initial packet */ |
317 | hctx->ccid3hctx_t_nom = now; | 273 | hctx->t_nom = now; |
318 | |||
319 | hctx->ccid3hctx_s = skb->len; | ||
320 | 274 | ||
321 | /* | 275 | /* |
322 | * Use initial RTT sample when available: recommended by erratum | 276 | * Use initial RTT sample when available: recommended by erratum |
@@ -325,9 +279,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
325 | */ | 279 | */ |
326 | if (dp->dccps_syn_rtt) { | 280 | if (dp->dccps_syn_rtt) { |
327 | ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); | 281 | ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); |
328 | hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; | 282 | hctx->rtt = dp->dccps_syn_rtt; |
329 | hctx->ccid3hctx_x = rfc3390_initial_rate(sk); | 283 | hctx->x = rfc3390_initial_rate(sk); |
330 | hctx->ccid3hctx_t_ld = now; | 284 | hctx->t_ld = now; |
331 | } else { | 285 | } else { |
332 | /* | 286 | /* |
333 | * Sender does not have RTT sample: | 287 | * Sender does not have RTT sample: |
@@ -335,17 +289,20 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
335 | * is needed in several parts (e.g. window counter); | 289 | * is needed in several parts (e.g. window counter); |
336 | * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. | 290 | * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. |
337 | */ | 291 | */ |
338 | hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; | 292 | hctx->rtt = DCCP_FALLBACK_RTT; |
339 | hctx->ccid3hctx_x = hctx->ccid3hctx_s; | 293 | hctx->x = dp->dccps_mss_cache; |
340 | hctx->ccid3hctx_x <<= 6; | 294 | hctx->x <<= 6; |
341 | } | 295 | } |
296 | |||
297 | /* Compute t_ipi = s / X */ | ||
298 | hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); | ||
342 | ccid3_update_send_interval(hctx); | 299 | ccid3_update_send_interval(hctx); |
343 | 300 | ||
344 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); | 301 | /* Seed value for Oscillation Prevention (sec. 4.5) */ |
345 | break; | 302 | hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); |
346 | case TFRC_SSTATE_NO_FBACK: | 303 | |
347 | case TFRC_SSTATE_FBACK: | 304 | } else { |
348 | delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); | 305 | delay = ktime_us_delta(hctx->t_nom, now); |
349 | ccid3_pr_debug("delay=%ld\n", (long)delay); | 306 | ccid3_pr_debug("delay=%ld\n", (long)delay); |
350 | /* | 307 | /* |
351 | * Scheduling of packet transmissions [RFC 3448, 4.6] | 308 | * Scheduling of packet transmissions [RFC 3448, 4.6] |
@@ -355,99 +312,80 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
355 | * else | 312 | * else |
356 | * // send the packet in (t_nom - t_now) milliseconds. | 313 | * // send the packet in (t_nom - t_now) milliseconds. |
357 | */ | 314 | */ |
358 | if (delay - (s64)hctx->ccid3hctx_delta >= 1000) | 315 | if (delay >= TFRC_T_DELTA) |
359 | return (u32)delay / 1000L; | 316 | return (u32)delay / USEC_PER_MSEC; |
360 | 317 | ||
361 | ccid3_hc_tx_update_win_count(hctx, now); | 318 | ccid3_hc_tx_update_win_count(hctx, now); |
362 | break; | ||
363 | case TFRC_SSTATE_TERM: | ||
364 | DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); | ||
365 | return -EINVAL; | ||
366 | } | 319 | } |
367 | 320 | ||
368 | /* prepare to send now (add options etc.) */ | 321 | /* prepare to send now (add options etc.) */ |
369 | dp->dccps_hc_tx_insert_options = 1; | 322 | dp->dccps_hc_tx_insert_options = 1; |
370 | DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; | 323 | DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; |
371 | 324 | ||
372 | /* set the nominal send time for the next following packet */ | 325 | /* set the nominal send time for the next following packet */ |
373 | hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, | 326 | hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); |
374 | hctx->ccid3hctx_t_ipi); | 327 | return CCID_PACKET_SEND_AT_ONCE; |
375 | return 0; | ||
376 | } | 328 | } |
377 | 329 | ||
378 | static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, | 330 | static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) |
379 | unsigned int len) | ||
380 | { | 331 | { |
381 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 332 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
382 | 333 | ||
383 | ccid3_hc_tx_update_s(hctx, len); | 334 | /* Changes to s will become effective the next time X is computed */ |
335 | hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); | ||
384 | 336 | ||
385 | if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) | 337 | if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) |
386 | DCCP_CRIT("packet history - out of memory!"); | 338 | DCCP_CRIT("packet history - out of memory!"); |
387 | } | 339 | } |
388 | 340 | ||
389 | static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | 341 | static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) |
390 | { | 342 | { |
391 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 343 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
392 | struct ccid3_options_received *opt_recv; | 344 | struct tfrc_tx_hist_entry *acked; |
393 | ktime_t now; | 345 | ktime_t now; |
394 | unsigned long t_nfb; | 346 | unsigned long t_nfb; |
395 | u32 pinv, r_sample; | 347 | u32 r_sample; |
396 | 348 | ||
397 | /* we are only interested in ACKs */ | 349 | /* we are only interested in ACKs */ |
398 | if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || | 350 | if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || |
399 | DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) | 351 | DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) |
400 | return; | 352 | return; |
401 | /* ... and only in the established state */ | 353 | /* |
402 | if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && | 354 | * Locate the acknowledged packet in the TX history. |
403 | hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) | 355 | * |
404 | return; | 356 | * Returning "entry not found" here can for instance happen when |
405 | 357 | * - the host has not sent out anything (e.g. a passive server), | |
406 | opt_recv = &hctx->ccid3hctx_options_received; | 358 | * - the Ack is outdated (packet with higher Ack number was received), |
407 | now = ktime_get_real(); | 359 | * - it is a bogus Ack (for a packet not sent on this connection). |
408 | 360 | */ | |
409 | /* Estimate RTT from history if ACK number is valid */ | 361 | acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); |
410 | r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, | 362 | if (acked == NULL) |
411 | DCCP_SKB_CB(skb)->dccpd_ack_seq, now); | ||
412 | if (r_sample == 0) { | ||
413 | DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, | ||
414 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), | ||
415 | (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
416 | return; | 363 | return; |
417 | } | 364 | /* For the sake of RTT sampling, ignore/remove all older entries */ |
365 | tfrc_tx_hist_purge(&acked->next); | ||
418 | 366 | ||
419 | /* Update receive rate in units of 64 * bytes/second */ | 367 | /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ |
420 | hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; | 368 | now = ktime_get_real(); |
421 | hctx->ccid3hctx_x_recv <<= 6; | 369 | r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); |
370 | hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); | ||
422 | 371 | ||
423 | /* Update loss event rate (which is scaled by 1e6) */ | ||
424 | pinv = opt_recv->ccid3or_loss_event_rate; | ||
425 | if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ | ||
426 | hctx->ccid3hctx_p = 0; | ||
427 | else /* can not exceed 100% */ | ||
428 | hctx->ccid3hctx_p = scaled_div(1, pinv); | ||
429 | /* | ||
430 | * Validate new RTT sample and update moving average | ||
431 | */ | ||
432 | r_sample = dccp_sample_rtt(sk, r_sample); | ||
433 | hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); | ||
434 | /* | 372 | /* |
435 | * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 | 373 | * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 |
436 | */ | 374 | */ |
437 | if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { | 375 | if (!hctx->feedback) { |
438 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); | 376 | hctx->feedback = true; |
439 | 377 | ||
440 | if (hctx->ccid3hctx_t_rto == 0) { | 378 | if (hctx->t_rto == 0) { |
441 | /* | 379 | /* |
442 | * Initial feedback packet: Larger Initial Windows (4.2) | 380 | * Initial feedback packet: Larger Initial Windows (4.2) |
443 | */ | 381 | */ |
444 | hctx->ccid3hctx_x = rfc3390_initial_rate(sk); | 382 | hctx->x = rfc3390_initial_rate(sk); |
445 | hctx->ccid3hctx_t_ld = now; | 383 | hctx->t_ld = now; |
446 | 384 | ||
447 | ccid3_update_send_interval(hctx); | 385 | ccid3_update_send_interval(hctx); |
448 | 386 | ||
449 | goto done_computing_x; | 387 | goto done_computing_x; |
450 | } else if (hctx->ccid3hctx_p == 0) { | 388 | } else if (hctx->p == 0) { |
451 | /* | 389 | /* |
452 | * First feedback after nofeedback timer expiry (4.3) | 390 | * First feedback after nofeedback timer expiry (4.3) |
453 | */ | 391 | */ |
@@ -456,25 +394,52 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
456 | } | 394 | } |
457 | 395 | ||
458 | /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ | 396 | /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ |
459 | if (hctx->ccid3hctx_p > 0) | 397 | if (hctx->p > 0) |
460 | hctx->ccid3hctx_x_calc = | 398 | hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); |
461 | tfrc_calc_x(hctx->ccid3hctx_s, | ||
462 | hctx->ccid3hctx_rtt, | ||
463 | hctx->ccid3hctx_p); | ||
464 | ccid3_hc_tx_update_x(sk, &now); | 399 | ccid3_hc_tx_update_x(sk, &now); |
465 | 400 | ||
466 | done_computing_x: | 401 | done_computing_x: |
467 | ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " | 402 | ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " |
468 | "p=%u, X_calc=%u, X_recv=%u, X=%u\n", | 403 | "p=%u, X_calc=%u, X_recv=%u, X=%u\n", |
469 | dccp_role(sk), | 404 | dccp_role(sk), sk, hctx->rtt, r_sample, |
470 | sk, hctx->ccid3hctx_rtt, r_sample, | 405 | hctx->s, hctx->p, hctx->x_calc, |
471 | hctx->ccid3hctx_s, hctx->ccid3hctx_p, | 406 | (unsigned)(hctx->x_recv >> 6), |
472 | hctx->ccid3hctx_x_calc, | 407 | (unsigned)(hctx->x >> 6)); |
473 | (unsigned)(hctx->ccid3hctx_x_recv >> 6), | 408 | /* |
474 | (unsigned)(hctx->ccid3hctx_x >> 6)); | 409 | * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to |
410 | * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This | ||
411 | * can be useful if few connections share a link, avoiding that buffer | ||
412 | * fill levels (RTT) oscillate as a result of frequent adjustments to X. | ||
413 | * A useful presentation with background information is in | ||
414 | * Joerg Widmer, "Equation-Based Congestion Control", | ||
415 | * MSc Thesis, University of Mannheim, Germany, 2000 | ||
416 | * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). | ||
417 | */ | ||
418 | if (do_osc_prev) { | ||
419 | r_sample = tfrc_scaled_sqrt(r_sample); | ||
420 | /* | ||
421 | * The modulation can work in both ways: increase/decrease t_ipi | ||
422 | * according to long-term increases/decreases of the RTT. The | ||
423 | * former is a useful measure, since it works against queue | ||
424 | * build-up. The latter temporarily increases the sending rate, | ||
425 | * so that buffers fill up more quickly. This in turn causes | ||
426 | * the RTT to increase, so that either later reduction becomes | ||
427 | * necessary or the RTT stays at a very high level. Decreasing | ||
428 | * t_ipi is therefore not supported. | ||
429 | * Furthermore, during the initial slow-start phase the RTT | ||
430 | * naturally increases, where using the algorithm would cause | ||
431 | * delays. Hence it is disabled during the initial slow-start. | ||
432 | */ | ||
433 | if (r_sample > hctx->r_sqmean && hctx->p > 0) | ||
434 | hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, | ||
435 | hctx->r_sqmean); | ||
436 | hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); | ||
437 | /* update R_sqmean _after_ computing the modulation factor */ | ||
438 | hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); | ||
439 | } | ||
475 | 440 | ||
476 | /* unschedule no feedback timer */ | 441 | /* unschedule no feedback timer */ |
477 | sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); | 442 | sk_stop_timer(sk, &hctx->no_feedback_timer); |
478 | 443 | ||
479 | /* | 444 | /* |
480 | * As we have calculated new ipi, delta, t_nom it is possible | 445 | * As we have calculated new ipi, delta, t_nom it is possible |
@@ -488,95 +453,66 @@ done_computing_x: | |||
488 | * This can help avoid triggering the nofeedback timer too | 453 | * This can help avoid triggering the nofeedback timer too |
489 | * often ('spinning') on LANs with small RTTs. | 454 | * often ('spinning') on LANs with small RTTs. |
490 | */ | 455 | */ |
491 | hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, | 456 | hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * |
492 | (CONFIG_IP_DCCP_CCID3_RTO * | 457 | (USEC_PER_SEC / 1000))); |
493 | (USEC_PER_SEC / 1000))); | ||
494 | /* | 458 | /* |
495 | * Schedule no feedback timer to expire in | 459 | * Schedule no feedback timer to expire in |
496 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) | 460 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) |
497 | */ | 461 | */ |
498 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); | 462 | t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); |
499 | 463 | ||
500 | ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " | 464 | ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " |
501 | "expire in %lu jiffies (%luus)\n", | 465 | "expire in %lu jiffies (%luus)\n", |
502 | dccp_role(sk), | 466 | dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); |
503 | sk, usecs_to_jiffies(t_nfb), t_nfb); | ||
504 | 467 | ||
505 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | 468 | sk_reset_timer(sk, &hctx->no_feedback_timer, |
506 | jiffies + usecs_to_jiffies(t_nfb)); | 469 | jiffies + usecs_to_jiffies(t_nfb)); |
507 | } | 470 | } |
508 | 471 | ||
509 | static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, | 472 | static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, |
510 | unsigned char len, u16 idx, | 473 | u8 option, u8 *optval, u8 optlen) |
511 | unsigned char *value) | ||
512 | { | 474 | { |
513 | int rc = 0; | ||
514 | const struct dccp_sock *dp = dccp_sk(sk); | ||
515 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 475 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
516 | struct ccid3_options_received *opt_recv; | ||
517 | __be32 opt_val; | 476 | __be32 opt_val; |
518 | 477 | ||
519 | opt_recv = &hctx->ccid3hctx_options_received; | ||
520 | |||
521 | if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { | ||
522 | opt_recv->ccid3or_seqno = dp->dccps_gsr; | ||
523 | opt_recv->ccid3or_loss_event_rate = ~0; | ||
524 | opt_recv->ccid3or_loss_intervals_idx = 0; | ||
525 | opt_recv->ccid3or_loss_intervals_len = 0; | ||
526 | opt_recv->ccid3or_receive_rate = 0; | ||
527 | } | ||
528 | |||
529 | switch (option) { | 478 | switch (option) { |
479 | case TFRC_OPT_RECEIVE_RATE: | ||
530 | case TFRC_OPT_LOSS_EVENT_RATE: | 480 | case TFRC_OPT_LOSS_EVENT_RATE: |
531 | if (unlikely(len != 4)) { | 481 | /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ |
532 | DCCP_WARN("%s(%p), invalid len %d " | 482 | if (packet_type == DCCP_PKT_DATA) |
533 | "for TFRC_OPT_LOSS_EVENT_RATE\n", | 483 | break; |
534 | dccp_role(sk), sk, len); | 484 | if (unlikely(optlen != 4)) { |
535 | rc = -EINVAL; | 485 | DCCP_WARN("%s(%p), invalid len %d for %u\n", |
536 | } else { | 486 | dccp_role(sk), sk, optlen, option); |
537 | opt_val = get_unaligned((__be32 *)value); | 487 | return -EINVAL; |
538 | opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); | ||
539 | ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", | ||
540 | dccp_role(sk), sk, | ||
541 | opt_recv->ccid3or_loss_event_rate); | ||
542 | } | 488 | } |
543 | break; | 489 | opt_val = ntohl(get_unaligned((__be32 *)optval)); |
544 | case TFRC_OPT_LOSS_INTERVALS: | 490 | |
545 | opt_recv->ccid3or_loss_intervals_idx = idx; | 491 | if (option == TFRC_OPT_RECEIVE_RATE) { |
546 | opt_recv->ccid3or_loss_intervals_len = len; | 492 | /* Receive Rate is kept in units of 64 bytes/second */ |
547 | ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", | 493 | hctx->x_recv = opt_val; |
548 | dccp_role(sk), sk, | 494 | hctx->x_recv <<= 6; |
549 | opt_recv->ccid3or_loss_intervals_idx, | 495 | |
550 | opt_recv->ccid3or_loss_intervals_len); | ||
551 | break; | ||
552 | case TFRC_OPT_RECEIVE_RATE: | ||
553 | if (unlikely(len != 4)) { | ||
554 | DCCP_WARN("%s(%p), invalid len %d " | ||
555 | "for TFRC_OPT_RECEIVE_RATE\n", | ||
556 | dccp_role(sk), sk, len); | ||
557 | rc = -EINVAL; | ||
558 | } else { | ||
559 | opt_val = get_unaligned((__be32 *)value); | ||
560 | opt_recv->ccid3or_receive_rate = ntohl(opt_val); | ||
561 | ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", | 496 | ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", |
562 | dccp_role(sk), sk, | 497 | dccp_role(sk), sk, opt_val); |
563 | opt_recv->ccid3or_receive_rate); | 498 | } else { |
499 | /* Update the fixpoint Loss Event Rate fraction */ | ||
500 | hctx->p = tfrc_invert_loss_event_rate(opt_val); | ||
501 | |||
502 | ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", | ||
503 | dccp_role(sk), sk, opt_val); | ||
564 | } | 504 | } |
565 | break; | ||
566 | } | 505 | } |
567 | 506 | return 0; | |
568 | return rc; | ||
569 | } | 507 | } |
570 | 508 | ||
571 | static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) | 509 | static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) |
572 | { | 510 | { |
573 | struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); | 511 | struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); |
574 | 512 | ||
575 | hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; | 513 | hctx->hist = NULL; |
576 | hctx->ccid3hctx_hist = NULL; | 514 | setup_timer(&hctx->no_feedback_timer, |
577 | setup_timer(&hctx->ccid3hctx_no_feedback_timer, | 515 | ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); |
578 | ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); | ||
579 | |||
580 | return 0; | 516 | return 0; |
581 | } | 517 | } |
582 | 518 | ||
@@ -584,42 +520,36 @@ static void ccid3_hc_tx_exit(struct sock *sk) | |||
584 | { | 520 | { |
585 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 521 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
586 | 522 | ||
587 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); | 523 | sk_stop_timer(sk, &hctx->no_feedback_timer); |
588 | sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); | 524 | tfrc_tx_hist_purge(&hctx->hist); |
589 | |||
590 | tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); | ||
591 | } | 525 | } |
592 | 526 | ||
593 | static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) | 527 | static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) |
594 | { | 528 | { |
595 | struct ccid3_hc_tx_sock *hctx; | 529 | info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; |
596 | 530 | info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; | |
597 | /* Listen socks doesn't have a private CCID block */ | ||
598 | if (sk->sk_state == DCCP_LISTEN) | ||
599 | return; | ||
600 | |||
601 | hctx = ccid3_hc_tx_sk(sk); | ||
602 | info->tcpi_rto = hctx->ccid3hctx_t_rto; | ||
603 | info->tcpi_rtt = hctx->ccid3hctx_rtt; | ||
604 | } | 531 | } |
605 | 532 | ||
606 | static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, | 533 | static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, |
607 | u32 __user *optval, int __user *optlen) | 534 | u32 __user *optval, int __user *optlen) |
608 | { | 535 | { |
609 | const struct ccid3_hc_tx_sock *hctx; | 536 | const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
537 | struct tfrc_tx_info tfrc; | ||
610 | const void *val; | 538 | const void *val; |
611 | 539 | ||
612 | /* Listen socks doesn't have a private CCID block */ | ||
613 | if (sk->sk_state == DCCP_LISTEN) | ||
614 | return -EINVAL; | ||
615 | |||
616 | hctx = ccid3_hc_tx_sk(sk); | ||
617 | switch (optname) { | 540 | switch (optname) { |
618 | case DCCP_SOCKOPT_CCID_TX_INFO: | 541 | case DCCP_SOCKOPT_CCID_TX_INFO: |
619 | if (len < sizeof(hctx->ccid3hctx_tfrc)) | 542 | if (len < sizeof(tfrc)) |
620 | return -EINVAL; | 543 | return -EINVAL; |
621 | len = sizeof(hctx->ccid3hctx_tfrc); | 544 | tfrc.tfrctx_x = hctx->x; |
622 | val = &hctx->ccid3hctx_tfrc; | 545 | tfrc.tfrctx_x_recv = hctx->x_recv; |
546 | tfrc.tfrctx_x_calc = hctx->x_calc; | ||
547 | tfrc.tfrctx_rtt = hctx->rtt; | ||
548 | tfrc.tfrctx_p = hctx->p; | ||
549 | tfrc.tfrctx_rto = hctx->t_rto; | ||
550 | tfrc.tfrctx_ipi = hctx->t_ipi; | ||
551 | len = sizeof(tfrc); | ||
552 | val = &tfrc; | ||
623 | break; | 553 | break; |
624 | default: | 554 | default: |
625 | return -ENOPROTOOPT; | 555 | return -ENOPROTOOPT; |
@@ -634,112 +564,82 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, | |||
634 | /* | 564 | /* |
635 | * Receiver Half-Connection Routines | 565 | * Receiver Half-Connection Routines |
636 | */ | 566 | */ |
637 | |||
638 | /* CCID3 feedback types */ | ||
639 | enum ccid3_fback_type { | ||
640 | CCID3_FBACK_NONE = 0, | ||
641 | CCID3_FBACK_INITIAL, | ||
642 | CCID3_FBACK_PERIODIC, | ||
643 | CCID3_FBACK_PARAM_CHANGE | ||
644 | }; | ||
645 | |||
646 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG | ||
647 | static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) | ||
648 | { | ||
649 | static char *ccid3_rx_state_names[] = { | ||
650 | [TFRC_RSTATE_NO_DATA] = "NO_DATA", | ||
651 | [TFRC_RSTATE_DATA] = "DATA", | ||
652 | [TFRC_RSTATE_TERM] = "TERM", | ||
653 | }; | ||
654 | |||
655 | return ccid3_rx_state_names[state]; | ||
656 | } | ||
657 | #endif | ||
658 | |||
659 | static void ccid3_hc_rx_set_state(struct sock *sk, | ||
660 | enum ccid3_hc_rx_states state) | ||
661 | { | ||
662 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | ||
663 | enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; | ||
664 | |||
665 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
666 | dccp_role(sk), sk, ccid3_rx_state_name(oldstate), | ||
667 | ccid3_rx_state_name(state)); | ||
668 | WARN_ON(state == oldstate); | ||
669 | hcrx->ccid3hcrx_state = state; | ||
670 | } | ||
671 | |||
672 | static void ccid3_hc_rx_send_feedback(struct sock *sk, | 567 | static void ccid3_hc_rx_send_feedback(struct sock *sk, |
673 | const struct sk_buff *skb, | 568 | const struct sk_buff *skb, |
674 | enum ccid3_fback_type fbtype) | 569 | enum ccid3_fback_type fbtype) |
675 | { | 570 | { |
676 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 571 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
677 | struct dccp_sock *dp = dccp_sk(sk); | ||
678 | ktime_t now; | ||
679 | s64 delta = 0; | ||
680 | |||
681 | if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) | ||
682 | return; | ||
683 | |||
684 | now = ktime_get_real(); | ||
685 | 572 | ||
686 | switch (fbtype) { | 573 | switch (fbtype) { |
687 | case CCID3_FBACK_INITIAL: | 574 | case CCID3_FBACK_INITIAL: |
688 | hcrx->ccid3hcrx_x_recv = 0; | 575 | hcrx->x_recv = 0; |
689 | hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ | 576 | hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ |
690 | break; | 577 | break; |
691 | case CCID3_FBACK_PARAM_CHANGE: | 578 | case CCID3_FBACK_PARAM_CHANGE: |
579 | if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { | ||
580 | /* | ||
581 | * rfc3448bis-06, 6.3.1: First packet(s) lost or marked | ||
582 | * FIXME: in rfc3448bis the receiver returns X_recv=0 | ||
583 | * here as it normally would in the first feedback packet. | ||
584 | * However this is not possible yet, since the code still | ||
585 | * uses RFC 3448, i.e. | ||
586 | * If (p > 0) | ||
587 | * Calculate X_calc using the TCP throughput equation. | ||
588 | * X = max(min(X_calc, 2*X_recv), s/t_mbi); | ||
589 | * would bring X down to s/t_mbi. That is why we return | ||
590 | * X_recv according to rfc3448bis-06 for the moment. | ||
591 | */ | ||
592 | u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), | ||
593 | rtt = tfrc_rx_hist_rtt(&hcrx->hist); | ||
594 | |||
595 | hcrx->x_recv = scaled_div32(s, 2 * rtt); | ||
596 | break; | ||
597 | } | ||
692 | /* | 598 | /* |
693 | * When parameters change (new loss or p > p_prev), we do not | 599 | * When parameters change (new loss or p > p_prev), we do not |
694 | * have a reliable estimate for R_m of [RFC 3448, 6.2] and so | 600 | * have a reliable estimate for R_m of [RFC 3448, 6.2] and so |
695 | * need to reuse the previous value of X_recv. However, when | 601 | * always check whether at least RTT time units were covered. |
696 | * X_recv was 0 (due to early loss), this would kill X down to | ||
697 | * s/t_mbi (i.e. one packet in 64 seconds). | ||
698 | * To avoid such drastic reduction, we approximate X_recv as | ||
699 | * the number of bytes since last feedback. | ||
700 | * This is a safe fallback, since X is bounded above by X_calc. | ||
701 | */ | 602 | */ |
702 | if (hcrx->ccid3hcrx_x_recv > 0) | 603 | hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); |
703 | break; | 604 | break; |
704 | /* fall through */ | ||
705 | case CCID3_FBACK_PERIODIC: | 605 | case CCID3_FBACK_PERIODIC: |
706 | delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); | 606 | /* |
707 | if (delta <= 0) | 607 | * Step (2) of rfc3448bis-06, 6.2: |
708 | DCCP_BUG("delta (%ld) <= 0", (long)delta); | 608 | * - if no data packets have been received, just restart timer |
709 | else | 609 | * - if data packets have been received, re-compute X_recv |
710 | hcrx->ccid3hcrx_x_recv = | 610 | */ |
711 | scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); | 611 | if (hcrx->hist.bytes_recvd == 0) |
612 | goto prepare_for_next_time; | ||
613 | hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); | ||
712 | break; | 614 | break; |
713 | default: | 615 | default: |
714 | return; | 616 | return; |
715 | } | 617 | } |
716 | 618 | ||
717 | ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, | 619 | ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); |
718 | hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); | ||
719 | |||
720 | hcrx->ccid3hcrx_tstamp_last_feedback = now; | ||
721 | hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; | ||
722 | hcrx->ccid3hcrx_bytes_recv = 0; | ||
723 | 620 | ||
724 | dp->dccps_hc_rx_insert_options = 1; | 621 | dccp_sk(sk)->dccps_hc_rx_insert_options = 1; |
725 | dccp_send_ack(sk); | 622 | dccp_send_ack(sk); |
623 | |||
624 | prepare_for_next_time: | ||
625 | tfrc_rx_hist_restart_byte_counter(&hcrx->hist); | ||
626 | hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; | ||
627 | hcrx->feedback = fbtype; | ||
726 | } | 628 | } |
727 | 629 | ||
728 | static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) | 630 | static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) |
729 | { | 631 | { |
730 | const struct ccid3_hc_rx_sock *hcrx; | 632 | const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
731 | __be32 x_recv, pinv; | 633 | __be32 x_recv, pinv; |
732 | 634 | ||
733 | if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) | 635 | if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) |
734 | return 0; | 636 | return 0; |
735 | 637 | ||
736 | hcrx = ccid3_hc_rx_sk(sk); | ||
737 | |||
738 | if (dccp_packet_without_ack(skb)) | 638 | if (dccp_packet_without_ack(skb)) |
739 | return 0; | 639 | return 0; |
740 | 640 | ||
741 | x_recv = htonl(hcrx->ccid3hcrx_x_recv); | 641 | x_recv = htonl(hcrx->x_recv); |
742 | pinv = htonl(hcrx->ccid3hcrx_pinv); | 642 | pinv = htonl(hcrx->p_inverse); |
743 | 643 | ||
744 | if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, | 644 | if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, |
745 | &pinv, sizeof(pinv)) || | 645 | &pinv, sizeof(pinv)) || |
@@ -762,171 +662,95 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) | |||
762 | static u32 ccid3_first_li(struct sock *sk) | 662 | static u32 ccid3_first_li(struct sock *sk) |
763 | { | 663 | { |
764 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 664 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
765 | u32 x_recv, p, delta; | 665 | u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), |
666 | rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; | ||
766 | u64 fval; | 667 | u64 fval; |
767 | 668 | ||
768 | if (hcrx->ccid3hcrx_rtt == 0) { | 669 | /* |
769 | DCCP_WARN("No RTT estimate available, using fallback RTT\n"); | 670 | * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p |
770 | hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; | 671 | * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p |
771 | } | 672 | * is about 20.64%. This yields an interval length of 4.84 (rounded up). |
673 | */ | ||
674 | if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) | ||
675 | return 5; | ||
772 | 676 | ||
773 | delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); | 677 | x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); |
774 | x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); | 678 | if (x_recv == 0) |
775 | if (x_recv == 0) { /* would also trigger divide-by-zero */ | 679 | goto failed; |
776 | DCCP_WARN("X_recv==0\n"); | ||
777 | if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { | ||
778 | DCCP_BUG("stored value of X_recv is zero"); | ||
779 | return ~0U; | ||
780 | } | ||
781 | } | ||
782 | 680 | ||
783 | fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); | 681 | fval = scaled_div32(scaled_div(s, rtt), x_recv); |
784 | fval = scaled_div32(fval, x_recv); | ||
785 | p = tfrc_calc_x_reverse_lookup(fval); | 682 | p = tfrc_calc_x_reverse_lookup(fval); |
786 | 683 | ||
787 | ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " | 684 | ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " |
788 | "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); | 685 | "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); |
789 | 686 | ||
790 | return p == 0 ? ~0U : scaled_div(1, p); | 687 | if (p > 0) |
688 | return scaled_div(1, p); | ||
689 | failed: | ||
690 | return UINT_MAX; | ||
791 | } | 691 | } |
792 | 692 | ||
793 | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | 693 | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) |
794 | { | 694 | { |
795 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 695 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
796 | enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; | ||
797 | const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; | 696 | const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; |
798 | const bool is_data_packet = dccp_data_packet(skb); | 697 | const bool is_data_packet = dccp_data_packet(skb); |
799 | 698 | ||
800 | if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { | ||
801 | if (is_data_packet) { | ||
802 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
803 | do_feedback = CCID3_FBACK_INITIAL; | ||
804 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); | ||
805 | hcrx->ccid3hcrx_s = payload; | ||
806 | /* | ||
807 | * Not necessary to update ccid3hcrx_bytes_recv here, | ||
808 | * since X_recv = 0 for the first feedback packet (cf. | ||
809 | * RFC 3448, 6.3) -- gerrit | ||
810 | */ | ||
811 | } | ||
812 | goto update_records; | ||
813 | } | ||
814 | |||
815 | if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) | ||
816 | return; /* done receiving */ | ||
817 | |||
818 | if (is_data_packet) { | ||
819 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
820 | /* | ||
821 | * Update moving-average of s and the sum of received payload bytes | ||
822 | */ | ||
823 | hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); | ||
824 | hcrx->ccid3hcrx_bytes_recv += payload; | ||
825 | } | ||
826 | |||
827 | /* | 699 | /* |
828 | * Perform loss detection and handle pending losses | 700 | * Perform loss detection and handle pending losses |
829 | */ | 701 | */ |
830 | if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, | 702 | if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, |
831 | skb, ndp, ccid3_first_li, sk)) { | 703 | skb, ndp, ccid3_first_li, sk)) |
832 | do_feedback = CCID3_FBACK_PARAM_CHANGE; | 704 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); |
833 | goto done_receiving; | ||
834 | } | ||
835 | |||
836 | if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) | ||
837 | return; /* done receiving */ | ||
838 | |||
839 | /* | 705 | /* |
840 | * Handle data packets: RTT sampling and monitoring p | 706 | * Feedback for first non-empty data packet (RFC 3448, 6.3) |
841 | */ | 707 | */ |
842 | if (unlikely(!is_data_packet)) | 708 | else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) |
843 | goto update_records; | 709 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); |
844 | |||
845 | if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { | ||
846 | const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); | ||
847 | /* | ||
848 | * Empty loss history: no loss so far, hence p stays 0. | ||
849 | * Sample RTT values, since an RTT estimate is required for the | ||
850 | * computation of p when the first loss occurs; RFC 3448, 6.3.1. | ||
851 | */ | ||
852 | if (sample != 0) | ||
853 | hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); | ||
854 | |||
855 | } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { | ||
856 | /* | ||
857 | * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean | ||
858 | * has decreased (resp. p has increased), send feedback now. | ||
859 | */ | ||
860 | do_feedback = CCID3_FBACK_PARAM_CHANGE; | ||
861 | } | ||
862 | |||
863 | /* | 710 | /* |
864 | * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 | 711 | * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 |
865 | */ | 712 | */ |
866 | if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) | 713 | else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && |
867 | do_feedback = CCID3_FBACK_PERIODIC; | 714 | SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) |
868 | 715 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); | |
869 | update_records: | ||
870 | tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); | ||
871 | |||
872 | done_receiving: | ||
873 | if (do_feedback) | ||
874 | ccid3_hc_rx_send_feedback(sk, skb, do_feedback); | ||
875 | } | 716 | } |
876 | 717 | ||
877 | static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) | 718 | static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) |
878 | { | 719 | { |
879 | struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); | 720 | struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); |
880 | 721 | ||
881 | hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; | 722 | tfrc_lh_init(&hcrx->li_hist); |
882 | tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); | 723 | return tfrc_rx_hist_init(&hcrx->hist, sk); |
883 | return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); | ||
884 | } | 724 | } |
885 | 725 | ||
886 | static void ccid3_hc_rx_exit(struct sock *sk) | 726 | static void ccid3_hc_rx_exit(struct sock *sk) |
887 | { | 727 | { |
888 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 728 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
889 | 729 | ||
890 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); | 730 | tfrc_rx_hist_purge(&hcrx->hist); |
891 | 731 | tfrc_lh_cleanup(&hcrx->li_hist); | |
892 | tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); | ||
893 | tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); | ||
894 | } | 732 | } |
895 | 733 | ||
896 | static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) | 734 | static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) |
897 | { | 735 | { |
898 | const struct ccid3_hc_rx_sock *hcrx; | ||
899 | |||
900 | /* Listen socks doesn't have a private CCID block */ | ||
901 | if (sk->sk_state == DCCP_LISTEN) | ||
902 | return; | ||
903 | |||
904 | hcrx = ccid3_hc_rx_sk(sk); | ||
905 | info->tcpi_ca_state = hcrx->ccid3hcrx_state; | ||
906 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 736 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
907 | info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; | 737 | info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); |
908 | } | 738 | } |
909 | 739 | ||
910 | static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, | 740 | static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, |
911 | u32 __user *optval, int __user *optlen) | 741 | u32 __user *optval, int __user *optlen) |
912 | { | 742 | { |
913 | const struct ccid3_hc_rx_sock *hcrx; | 743 | const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
914 | struct tfrc_rx_info rx_info; | 744 | struct tfrc_rx_info rx_info; |
915 | const void *val; | 745 | const void *val; |
916 | 746 | ||
917 | /* Listen socks doesn't have a private CCID block */ | ||
918 | if (sk->sk_state == DCCP_LISTEN) | ||
919 | return -EINVAL; | ||
920 | |||
921 | hcrx = ccid3_hc_rx_sk(sk); | ||
922 | switch (optname) { | 747 | switch (optname) { |
923 | case DCCP_SOCKOPT_CCID_RX_INFO: | 748 | case DCCP_SOCKOPT_CCID_RX_INFO: |
924 | if (len < sizeof(rx_info)) | 749 | if (len < sizeof(rx_info)) |
925 | return -EINVAL; | 750 | return -EINVAL; |
926 | rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; | 751 | rx_info.tfrcrx_x_recv = hcrx->x_recv; |
927 | rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; | 752 | rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); |
928 | rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : | 753 | rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); |
929 | scaled_div(1, hcrx->ccid3hcrx_pinv); | ||
930 | len = sizeof(rx_info); | 754 | len = sizeof(rx_info); |
931 | val = &rx_info; | 755 | val = &rx_info; |
932 | break; | 756 | break; |
@@ -962,6 +786,9 @@ static struct ccid_operations ccid3 = { | |||
962 | .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, | 786 | .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, |
963 | }; | 787 | }; |
964 | 788 | ||
789 | module_param(do_osc_prev, bool, 0644); | ||
790 | MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); | ||
791 | |||
965 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG | 792 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG |
966 | module_param(ccid3_debug, bool, 0644); | 793 | module_param(ccid3_debug, bool, 0644); |
967 | MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); | 794 | MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); |
@@ -969,6 +796,19 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); | |||
969 | 796 | ||
970 | static __init int ccid3_module_init(void) | 797 | static __init int ccid3_module_init(void) |
971 | { | 798 | { |
799 | struct timespec tp; | ||
800 | |||
801 | /* | ||
802 | * Without a fine-grained clock resolution, RTTs/X_recv are not sampled | ||
803 | * correctly and feedback is sent either too early or too late. | ||
804 | */ | ||
805 | hrtimer_get_res(CLOCK_MONOTONIC, &tp); | ||
806 | if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { | ||
807 | printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" | ||
808 | " resolution - check your clocksource.\n", __func__, | ||
809 | tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); | ||
810 | return -ESOCKTNOSUPPORT; | ||
811 | } | ||
972 | return ccid_register(&ccid3); | 812 | return ccid_register(&ccid3); |
973 | } | 813 | } |
974 | module_init(ccid3_module_init); | 814 | module_init(ccid3_module_init); |
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 49ca32bd7e79..af6e1bf937d9 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h | |||
@@ -47,11 +47,22 @@ | |||
47 | /* Two seconds as per RFC 3448 4.2 */ | 47 | /* Two seconds as per RFC 3448 4.2 */ |
48 | #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) | 48 | #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) |
49 | 49 | ||
50 | /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ | 50 | /* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ |
51 | #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) | 51 | #define TFRC_T_MBI (64 * USEC_PER_SEC) |
52 | 52 | ||
53 | /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ | 53 | /* |
54 | #define TFRC_T_MBI 64 | 54 | * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are |
55 | * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. | ||
56 | * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse | ||
57 | * resolution of HZ < 500 means that the error is below one timer tick (t_gran) | ||
58 | * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). | ||
59 | */ | ||
60 | #if (HZ >= 500) | ||
61 | # define TFRC_T_DELTA USEC_PER_MSEC | ||
62 | #else | ||
63 | # define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) | ||
64 | #warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. | ||
65 | #endif | ||
55 | 66 | ||
56 | enum ccid3_options { | 67 | enum ccid3_options { |
57 | TFRC_OPT_LOSS_EVENT_RATE = 192, | 68 | TFRC_OPT_LOSS_EVENT_RATE = 192, |
@@ -59,62 +70,43 @@ enum ccid3_options { | |||
59 | TFRC_OPT_RECEIVE_RATE = 194, | 70 | TFRC_OPT_RECEIVE_RATE = 194, |
60 | }; | 71 | }; |
61 | 72 | ||
62 | struct ccid3_options_received { | ||
63 | u64 ccid3or_seqno:48, | ||
64 | ccid3or_loss_intervals_idx:16; | ||
65 | u16 ccid3or_loss_intervals_len; | ||
66 | u32 ccid3or_loss_event_rate; | ||
67 | u32 ccid3or_receive_rate; | ||
68 | }; | ||
69 | |||
70 | /* TFRC sender states */ | ||
71 | enum ccid3_hc_tx_states { | ||
72 | TFRC_SSTATE_NO_SENT = 1, | ||
73 | TFRC_SSTATE_NO_FBACK, | ||
74 | TFRC_SSTATE_FBACK, | ||
75 | TFRC_SSTATE_TERM, | ||
76 | }; | ||
77 | |||
78 | /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket | 73 | /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket |
79 | * | 74 | * |
80 | * @ccid3hctx_x - Current sending rate in 64 * bytes per second | 75 | * @x - Current sending rate in 64 * bytes per second |
81 | * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second | 76 | * @x_recv - Receive rate in 64 * bytes per second |
82 | * @ccid3hctx_x_calc - Calculated rate in bytes per second | 77 | * @x_calc - Calculated rate in bytes per second |
83 | * @ccid3hctx_rtt - Estimate of current round trip time in usecs | 78 | * @rtt - Estimate of current round trip time in usecs |
84 | * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 | 79 | * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) |
85 | * @ccid3hctx_s - Packet size in bytes | 80 | * @p - Current loss event rate (0-1) scaled by 1000000 |
86 | * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs | 81 | * @s - Packet size in bytes |
87 | * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs | 82 | * @t_rto - Nofeedback Timer setting in usecs |
88 | * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states | 83 | * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs |
89 | * @ccid3hctx_last_win_count - Last window counter sent | 84 | * @feedback - Whether feedback has been received or not |
90 | * @ccid3hctx_t_last_win_count - Timestamp of earliest packet | 85 | * @last_win_count - Last window counter sent |
91 | * with last_win_count value sent | 86 | * @t_last_win_count - Timestamp of earliest packet with |
92 | * @ccid3hctx_no_feedback_timer - Handle to no feedback timer | 87 | * last_win_count value sent |
93 | * @ccid3hctx_t_ld - Time last doubled during slow start | 88 | * @no_feedback_timer - Handle to no feedback timer |
94 | * @ccid3hctx_t_nom - Nominal send time of next packet | 89 | * @t_ld - Time last doubled during slow start |
95 | * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs | 90 | * @t_nom - Nominal send time of next packet |
96 | * @ccid3hctx_hist - Packet history | 91 | * @hist - Packet history |
97 | * @ccid3hctx_options_received - Parsed set of retrieved options | ||
98 | */ | 92 | */ |
99 | struct ccid3_hc_tx_sock { | 93 | struct ccid3_hc_tx_sock { |
100 | struct tfrc_tx_info ccid3hctx_tfrc; | 94 | u64 x; |
101 | #define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x | 95 | u64 x_recv; |
102 | #define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv | 96 | u32 x_calc; |
103 | #define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc | 97 | u32 rtt; |
104 | #define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt | 98 | u16 r_sqmean; |
105 | #define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p | 99 | u32 p; |
106 | #define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto | 100 | u32 t_rto; |
107 | #define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi | 101 | u32 t_ipi; |
108 | u16 ccid3hctx_s; | 102 | u16 s; |
109 | enum ccid3_hc_tx_states ccid3hctx_state:8; | 103 | bool feedback:1; |
110 | u8 ccid3hctx_last_win_count; | 104 | u8 last_win_count; |
111 | ktime_t ccid3hctx_t_last_win_count; | 105 | ktime_t t_last_win_count; |
112 | struct timer_list ccid3hctx_no_feedback_timer; | 106 | struct timer_list no_feedback_timer; |
113 | ktime_t ccid3hctx_t_ld; | 107 | ktime_t t_ld; |
114 | ktime_t ccid3hctx_t_nom; | 108 | ktime_t t_nom; |
115 | u32 ccid3hctx_delta; | 109 | struct tfrc_tx_hist_entry *hist; |
116 | struct tfrc_tx_hist_entry *ccid3hctx_hist; | ||
117 | struct ccid3_options_received ccid3hctx_options_received; | ||
118 | }; | 110 | }; |
119 | 111 | ||
120 | static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) | 112 | static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) |
@@ -124,41 +116,32 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) | |||
124 | return hctx; | 116 | return hctx; |
125 | } | 117 | } |
126 | 118 | ||
127 | /* TFRC receiver states */ | 119 | |
128 | enum ccid3_hc_rx_states { | 120 | enum ccid3_fback_type { |
129 | TFRC_RSTATE_NO_DATA = 1, | 121 | CCID3_FBACK_NONE = 0, |
130 | TFRC_RSTATE_DATA, | 122 | CCID3_FBACK_INITIAL, |
131 | TFRC_RSTATE_TERM = 127, | 123 | CCID3_FBACK_PERIODIC, |
124 | CCID3_FBACK_PARAM_CHANGE | ||
132 | }; | 125 | }; |
133 | 126 | ||
134 | /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket | 127 | /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket |
135 | * | 128 | * |
136 | * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) | 129 | * @last_counter - Tracks window counter (RFC 4342, 8.1) |
137 | * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) | 130 | * @feedback - The type of the feedback last sent |
138 | * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) | 131 | * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) |
139 | * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) | 132 | * @tstamp_last_feedback - Time at which last feedback was sent |
140 | * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states | 133 | * @hist - Packet history (loss detection + RTT sampling) |
141 | * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes | 134 | * @li_hist - Loss Interval database |
142 | * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) | 135 | * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) |
143 | * @ccid3hcrx_rtt - Receiver estimate of RTT | ||
144 | * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent | ||
145 | * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent | ||
146 | * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) | ||
147 | * @ccid3hcrx_li_hist - Loss Interval database | ||
148 | * @ccid3hcrx_s - Received packet size in bytes | ||
149 | * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) | ||
150 | */ | 136 | */ |
151 | struct ccid3_hc_rx_sock { | 137 | struct ccid3_hc_rx_sock { |
152 | u8 ccid3hcrx_last_counter:4; | 138 | u8 last_counter:4; |
153 | enum ccid3_hc_rx_states ccid3hcrx_state:8; | 139 | enum ccid3_fback_type feedback:4; |
154 | u32 ccid3hcrx_bytes_recv; | 140 | u32 x_recv; |
155 | u32 ccid3hcrx_x_recv; | 141 | ktime_t tstamp_last_feedback; |
156 | u32 ccid3hcrx_rtt; | 142 | struct tfrc_rx_hist hist; |
157 | ktime_t ccid3hcrx_tstamp_last_feedback; | 143 | struct tfrc_loss_hist li_hist; |
158 | struct tfrc_rx_hist ccid3hcrx_hist; | 144 | #define p_inverse li_hist.i_mean |
159 | struct tfrc_loss_hist ccid3hcrx_li_hist; | ||
160 | u16 ccid3hcrx_s; | ||
161 | #define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean | ||
162 | }; | 145 | }; |
163 | 146 | ||
164 | static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) | 147 | static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) |
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 5b3ce0688c5c..b1ae8f8259e5 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c | |||
@@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) | |||
86 | 86 | ||
87 | /** | 87 | /** |
88 | * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 | 88 | * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 |
89 | * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev | 89 | * This updates I_mean as the sequence numbers increase. As a consequence, the |
90 | * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) | ||
91 | * decreases, and thus there is no need to send renewed feedback. | ||
90 | */ | 92 | */ |
91 | u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) | 93 | void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) |
92 | { | 94 | { |
93 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); | 95 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); |
94 | u32 old_i_mean = lh->i_mean; | ||
95 | s64 len; | 96 | s64 len; |
96 | 97 | ||
97 | if (cur == NULL) /* not initialised */ | 98 | if (cur == NULL) /* not initialised */ |
98 | return 0; | 99 | return; |
100 | |||
101 | /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ | ||
102 | if (!dccp_data_packet(skb)) | ||
103 | return; | ||
99 | 104 | ||
100 | len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; | 105 | len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; |
101 | 106 | ||
102 | if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ | 107 | if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ |
103 | return 0; | 108 | return; |
104 | 109 | ||
105 | if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) | 110 | if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) |
106 | /* | 111 | /* |
@@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) | |||
114 | cur->li_is_closed = 1; | 119 | cur->li_is_closed = 1; |
115 | 120 | ||
116 | if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ | 121 | if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ |
117 | return 0; | 122 | return; |
118 | 123 | ||
119 | cur->li_length = len; | 124 | cur->li_length = len; |
120 | tfrc_lh_calc_i_mean(lh); | 125 | tfrc_lh_calc_i_mean(lh); |
121 | |||
122 | return (lh->i_mean < old_i_mean); | ||
123 | } | 126 | } |
124 | EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); | ||
125 | 127 | ||
126 | /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ | 128 | /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ |
127 | static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, | 129 | static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, |
@@ -138,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, | |||
138 | * @sk: Used by @calc_first_li in caller-specific way (subtyping) | 140 | * @sk: Used by @calc_first_li in caller-specific way (subtyping) |
139 | * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. | 141 | * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. |
140 | */ | 142 | */ |
141 | int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, | 143 | bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, |
142 | u32 (*calc_first_li)(struct sock *), struct sock *sk) | 144 | u32 (*calc_first_li)(struct sock *), struct sock *sk) |
143 | { | 145 | { |
144 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; | 146 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; |
145 | 147 | ||
146 | if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) | 148 | if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) |
147 | return 0; | 149 | return false; |
148 | 150 | ||
149 | new = tfrc_lh_demand_next(lh); | 151 | new = tfrc_lh_demand_next(lh); |
150 | if (unlikely(new == NULL)) { | 152 | if (unlikely(new == NULL)) { |
151 | DCCP_CRIT("Cannot allocate/add loss record."); | 153 | DCCP_CRIT("Cannot allocate/add loss record."); |
152 | return 0; | 154 | return false; |
153 | } | 155 | } |
154 | 156 | ||
155 | new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; | 157 | new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; |
@@ -167,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, | |||
167 | 169 | ||
168 | tfrc_lh_calc_i_mean(lh); | 170 | tfrc_lh_calc_i_mean(lh); |
169 | } | 171 | } |
170 | return 1; | 172 | return true; |
171 | } | 173 | } |
172 | EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); | 174 | EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); |
173 | 175 | ||
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index 246018a3b269..d08a226db43e 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h | |||
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) | |||
67 | 67 | ||
68 | struct tfrc_rx_hist; | 68 | struct tfrc_rx_hist; |
69 | 69 | ||
70 | extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, | 70 | extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, |
71 | u32 (*first_li)(struct sock *), struct sock *); | 71 | u32 (*first_li)(struct sock *), struct sock *); |
72 | extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); | 72 | extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); |
73 | extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); | 73 | extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); |
74 | 74 | ||
75 | #endif /* _DCCP_LI_HIST_ */ | 75 | #endif /* _DCCP_LI_HIST_ */ |
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 6cc108afdc3b..cce9f03bda3e 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c | |||
@@ -40,18 +40,6 @@ | |||
40 | #include "packet_history.h" | 40 | #include "packet_history.h" |
41 | #include "../../dccp.h" | 41 | #include "../../dccp.h" |
42 | 42 | ||
43 | /** | ||
44 | * tfrc_tx_hist_entry - Simple singly-linked TX history list | ||
45 | * @next: next oldest entry (LIFO order) | ||
46 | * @seqno: sequence number of this entry | ||
47 | * @stamp: send time of packet with sequence number @seqno | ||
48 | */ | ||
49 | struct tfrc_tx_hist_entry { | ||
50 | struct tfrc_tx_hist_entry *next; | ||
51 | u64 seqno; | ||
52 | ktime_t stamp; | ||
53 | }; | ||
54 | |||
55 | /* | 43 | /* |
56 | * Transmitter History Routines | 44 | * Transmitter History Routines |
57 | */ | 45 | */ |
@@ -73,15 +61,6 @@ void tfrc_tx_packet_history_exit(void) | |||
73 | } | 61 | } |
74 | } | 62 | } |
75 | 63 | ||
76 | static struct tfrc_tx_hist_entry * | ||
77 | tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) | ||
78 | { | ||
79 | while (head != NULL && head->seqno != seqno) | ||
80 | head = head->next; | ||
81 | |||
82 | return head; | ||
83 | } | ||
84 | |||
85 | int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) | 64 | int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) |
86 | { | 65 | { |
87 | struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); | 66 | struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); |
@@ -111,25 +90,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) | |||
111 | } | 90 | } |
112 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); | 91 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); |
113 | 92 | ||
114 | u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, | ||
115 | const ktime_t now) | ||
116 | { | ||
117 | u32 rtt = 0; | ||
118 | struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); | ||
119 | |||
120 | if (packet != NULL) { | ||
121 | rtt = ktime_us_delta(now, packet->stamp); | ||
122 | /* | ||
123 | * Garbage-collect older (irrelevant) entries: | ||
124 | */ | ||
125 | tfrc_tx_hist_purge(&packet->next); | ||
126 | } | ||
127 | |||
128 | return rtt; | ||
129 | } | ||
130 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); | ||
131 | |||
132 | |||
133 | /* | 93 | /* |
134 | * Receiver History Routines | 94 | * Receiver History Routines |
135 | */ | 95 | */ |
@@ -191,14 +151,31 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) | |||
191 | } | 151 | } |
192 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); | 152 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); |
193 | 153 | ||
154 | |||
155 | static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) | ||
156 | { | ||
157 | struct tfrc_rx_hist_entry *tmp = h->ring[a]; | ||
158 | |||
159 | h->ring[a] = h->ring[b]; | ||
160 | h->ring[b] = tmp; | ||
161 | } | ||
162 | |||
194 | static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) | 163 | static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) |
195 | { | 164 | { |
196 | const u8 idx_a = tfrc_rx_hist_index(h, a), | 165 | __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), |
197 | idx_b = tfrc_rx_hist_index(h, b); | 166 | tfrc_rx_hist_index(h, b)); |
198 | struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; | 167 | } |
199 | 168 | ||
200 | h->ring[idx_a] = h->ring[idx_b]; | 169 | /** |
201 | h->ring[idx_b] = tmp; | 170 | * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling |
171 | * This is called after loss detection has finished, when the history entry | ||
172 | * with the index of `loss_count' holds the highest-received sequence number. | ||
173 | * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). | ||
174 | */ | ||
175 | static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) | ||
176 | { | ||
177 | __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); | ||
178 | h->loss_count = h->loss_start = 0; | ||
202 | } | 179 | } |
203 | 180 | ||
204 | /* | 181 | /* |
@@ -215,10 +192,8 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) | |||
215 | u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, | 192 | u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, |
216 | s1 = DCCP_SKB_CB(skb)->dccpd_seq; | 193 | s1 = DCCP_SKB_CB(skb)->dccpd_seq; |
217 | 194 | ||
218 | if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ | 195 | if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ |
219 | h->loss_count = 1; | 196 | h->loss_count = 1; |
220 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); | ||
221 | } | ||
222 | } | 197 | } |
223 | 198 | ||
224 | static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) | 199 | static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) |
@@ -240,8 +215,7 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 | |||
240 | 215 | ||
241 | if (dccp_loss_free(s2, s1, n1)) { | 216 | if (dccp_loss_free(s2, s1, n1)) { |
242 | /* hole is filled: S0, S2, and S1 are consecutive */ | 217 | /* hole is filled: S0, S2, and S1 are consecutive */ |
243 | h->loss_count = 0; | 218 | tfrc_rx_hist_resume_rtt_sampling(h); |
244 | h->loss_start = tfrc_rx_hist_index(h, 1); | ||
245 | } else | 219 | } else |
246 | /* gap between S2 and S1: just update loss_prev */ | 220 | /* gap between S2 and S1: just update loss_prev */ |
247 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); | 221 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); |
@@ -294,8 +268,7 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) | |||
294 | 268 | ||
295 | if (dccp_loss_free(s1, s2, n2)) { | 269 | if (dccp_loss_free(s1, s2, n2)) { |
296 | /* entire hole filled by S0, S3, S1, S2 */ | 270 | /* entire hole filled by S0, S3, S1, S2 */ |
297 | h->loss_start = tfrc_rx_hist_index(h, 2); | 271 | tfrc_rx_hist_resume_rtt_sampling(h); |
298 | h->loss_count = 0; | ||
299 | } else { | 272 | } else { |
300 | /* gap remains between S1 and S2 */ | 273 | /* gap remains between S1 and S2 */ |
301 | h->loss_start = tfrc_rx_hist_index(h, 1); | 274 | h->loss_start = tfrc_rx_hist_index(h, 1); |
@@ -339,8 +312,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
339 | 312 | ||
340 | if (dccp_loss_free(s2, s3, n3)) { | 313 | if (dccp_loss_free(s2, s3, n3)) { |
341 | /* no gap between S2 and S3: entire hole is filled */ | 314 | /* no gap between S2 and S3: entire hole is filled */ |
342 | h->loss_start = tfrc_rx_hist_index(h, 3); | 315 | tfrc_rx_hist_resume_rtt_sampling(h); |
343 | h->loss_count = 0; | ||
344 | } else { | 316 | } else { |
345 | /* gap between S2 and S3 */ | 317 | /* gap between S2 and S3 */ |
346 | h->loss_start = tfrc_rx_hist_index(h, 2); | 318 | h->loss_start = tfrc_rx_hist_index(h, 2); |
@@ -354,13 +326,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
354 | } | 326 | } |
355 | 327 | ||
356 | /** | 328 | /** |
357 | * tfrc_rx_handle_loss - Loss detection and further processing | 329 | * tfrc_rx_congestion_event - Loss detection and further processing |
358 | * @h: The non-empty RX history object | 330 | * @h: The non-empty RX history object |
359 | * @lh: Loss Intervals database to update | 331 | * @lh: Loss Intervals database to update |
360 | * @skb: Currently received packet | 332 | * @skb: Currently received packet |
361 | * @ndp: The NDP count belonging to @skb | 333 | * @ndp: The NDP count belonging to @skb |
362 | * @calc_first_li: Caller-dependent computation of first loss interval in @lh | 334 | * @first_li: Caller-dependent computation of first loss interval in @lh |
363 | * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) | 335 | * @sk: Used by @first_li (see tfrc_lh_interval_add) |
364 | * Chooses action according to pending loss, updates LI database when a new | 336 | * Chooses action according to pending loss, updates LI database when a new |
365 | * loss was detected, and does required post-processing. Returns 1 when caller | 337 | * loss was detected, and does required post-processing. Returns 1 when caller |
366 | * should send feedback, 0 otherwise. | 338 | * should send feedback, 0 otherwise. |
@@ -368,15 +340,20 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
368 | * records accordingly, the caller should not perform any more RX history | 340 | * records accordingly, the caller should not perform any more RX history |
369 | * operations when loss_count is greater than 0 after calling this function. | 341 | * operations when loss_count is greater than 0 after calling this function. |
370 | */ | 342 | */ |
371 | int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, | 343 | bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, |
372 | struct tfrc_loss_hist *lh, | 344 | struct tfrc_loss_hist *lh, |
373 | struct sk_buff *skb, const u64 ndp, | 345 | struct sk_buff *skb, const u64 ndp, |
374 | u32 (*calc_first_li)(struct sock *), struct sock *sk) | 346 | u32 (*first_li)(struct sock *), struct sock *sk) |
375 | { | 347 | { |
376 | int is_new_loss = 0; | 348 | bool new_event = false; |
349 | |||
350 | if (tfrc_rx_hist_duplicate(h, skb)) | ||
351 | return 0; | ||
377 | 352 | ||
378 | if (h->loss_count == 0) { | 353 | if (h->loss_count == 0) { |
379 | __do_track_loss(h, skb, ndp); | 354 | __do_track_loss(h, skb, ndp); |
355 | tfrc_rx_hist_sample_rtt(h, skb); | ||
356 | tfrc_rx_hist_add_packet(h, skb, ndp); | ||
380 | } else if (h->loss_count == 1) { | 357 | } else if (h->loss_count == 1) { |
381 | __one_after_loss(h, skb, ndp); | 358 | __one_after_loss(h, skb, ndp); |
382 | } else if (h->loss_count != 2) { | 359 | } else if (h->loss_count != 2) { |
@@ -385,34 +362,57 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, | |||
385 | /* | 362 | /* |
386 | * Update Loss Interval database and recycle RX records | 363 | * Update Loss Interval database and recycle RX records |
387 | */ | 364 | */ |
388 | is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); | 365 | new_event = tfrc_lh_interval_add(lh, h, first_li, sk); |
389 | __three_after_loss(h); | 366 | __three_after_loss(h); |
390 | } | 367 | } |
391 | return is_new_loss; | 368 | |
369 | /* | ||
370 | * Update moving-average of `s' and the sum of received payload bytes. | ||
371 | */ | ||
372 | if (dccp_data_packet(skb)) { | ||
373 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
374 | |||
375 | h->packet_size = tfrc_ewma(h->packet_size, payload, 9); | ||
376 | h->bytes_recvd += payload; | ||
377 | } | ||
378 | |||
379 | /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ | ||
380 | if (!new_event) | ||
381 | tfrc_lh_update_i_mean(lh, skb); | ||
382 | |||
383 | return new_event; | ||
392 | } | 384 | } |
393 | EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); | 385 | EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); |
394 | 386 | ||
395 | int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) | 387 | /* Compute the receive rate X_recv measured between feedback intervals */ |
388 | u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) | ||
396 | { | 389 | { |
397 | int i; | 390 | u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; |
391 | s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); | ||
398 | 392 | ||
399 | for (i = 0; i <= TFRC_NDUPACK; i++) { | 393 | WARN_ON(delta <= 0); |
400 | h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); | 394 | /* |
401 | if (h->ring[i] == NULL) | 395 | * Ensure that the sampling interval for X_recv is at least one RTT, |
402 | goto out_free; | 396 | * by extending the sampling interval backwards in time, over the last |
403 | } | 397 | * R_(m-1) seconds, as per rfc3448bis-06, 6.2. |
398 | * To reduce noise (e.g. when the RTT changes often), this is only | ||
399 | * done when delta is smaller than RTT/2. | ||
400 | */ | ||
401 | if (last_x_recv > 0 && delta < last_rtt/2) { | ||
402 | tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", | ||
403 | (long)delta, (unsigned)last_rtt); | ||
404 | 404 | ||
405 | h->loss_count = h->loss_start = 0; | 405 | delta = (bytes ? delta : 0) + last_rtt; |
406 | return 0; | 406 | bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); |
407 | } | ||
407 | 408 | ||
408 | out_free: | 409 | if (unlikely(bytes == 0)) { |
409 | while (i-- != 0) { | 410 | DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); |
410 | kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); | 411 | return last_x_recv; |
411 | h->ring[i] = NULL; | ||
412 | } | 412 | } |
413 | return -ENOBUFS; | 413 | return scaled_div32(bytes, delta); |
414 | } | 414 | } |
415 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); | 415 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); |
416 | 416 | ||
417 | void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) | 417 | void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) |
418 | { | 418 | { |
@@ -426,73 +426,81 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) | |||
426 | } | 426 | } |
427 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); | 427 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); |
428 | 428 | ||
429 | /** | 429 | static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) |
430 | * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against | ||
431 | */ | ||
432 | static inline struct tfrc_rx_hist_entry * | ||
433 | tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) | ||
434 | { | 430 | { |
435 | return h->ring[0]; | 431 | int i; |
432 | |||
433 | memset(h, 0, sizeof(*h)); | ||
434 | |||
435 | for (i = 0; i <= TFRC_NDUPACK; i++) { | ||
436 | h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); | ||
437 | if (h->ring[i] == NULL) { | ||
438 | tfrc_rx_hist_purge(h); | ||
439 | return -ENOBUFS; | ||
440 | } | ||
441 | } | ||
442 | return 0; | ||
436 | } | 443 | } |
437 | 444 | ||
438 | /** | 445 | int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) |
439 | * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry | ||
440 | */ | ||
441 | static inline struct tfrc_rx_hist_entry * | ||
442 | tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) | ||
443 | { | 446 | { |
444 | return h->ring[h->rtt_sample_prev]; | 447 | if (tfrc_rx_hist_alloc(h)) |
448 | return -ENOBUFS; | ||
449 | /* | ||
450 | * Initialise first entry with GSR to start loss detection as early as | ||
451 | * possible. Code using this must not use any other fields. The entry | ||
452 | * will be overwritten once the CCID updates its received packets. | ||
453 | */ | ||
454 | tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; | ||
455 | return 0; | ||
445 | } | 456 | } |
457 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); | ||
446 | 458 | ||
447 | /** | 459 | /** |
448 | * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal | 460 | * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal |
449 | * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able | 461 | * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss |
450 | * to compute a sample with given data - calling function should check this. | 462 | * is pending and uses the following history entries (via rtt_sample_prev): |
463 | * - h->ring[0] contains the most recent history entry prior to @skb; | ||
464 | * - h->ring[1] is an unused `dummy' entry when the current difference is 0. | ||
451 | */ | 465 | */ |
452 | u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) | 466 | void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) |
453 | { | 467 | { |
454 | u32 sample = 0, | 468 | struct tfrc_rx_hist_entry *last = h->ring[0]; |
455 | delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, | 469 | u32 sample, delta_v; |
456 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); | ||
457 | |||
458 | if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ | ||
459 | if (h->rtt_sample_prev == 2) { /* previous candidate stored */ | ||
460 | sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, | ||
461 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); | ||
462 | if (sample) | ||
463 | sample = 4 / sample * | ||
464 | ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, | ||
465 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); | ||
466 | else /* | ||
467 | * FIXME: This condition is in principle not | ||
468 | * possible but occurs when CCID is used for | ||
469 | * two-way data traffic. I have tried to trace | ||
470 | * it, but the cause does not seem to be here. | ||
471 | */ | ||
472 | DCCP_BUG("please report to dccp@vger.kernel.org" | ||
473 | " => prev = %u, last = %u", | ||
474 | tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, | ||
475 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); | ||
476 | } else if (delta_v < 1) { | ||
477 | h->rtt_sample_prev = 1; | ||
478 | goto keep_ref_for_next_time; | ||
479 | } | ||
480 | 470 | ||
481 | } else if (delta_v == 4) /* optimal match */ | 471 | /* |
482 | sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); | 472 | * When not to sample: |
483 | else { /* suboptimal match */ | 473 | * - on non-data packets |
484 | h->rtt_sample_prev = 2; | 474 | * (RFC 4342, 8.1: CCVal only fully defined for data packets); |
485 | goto keep_ref_for_next_time; | 475 | * - when no data packets have been received yet |
486 | } | 476 | * (FIXME: using sampled packet size as indicator here); |
477 | * - as long as there are gaps in the sequence space (pending loss). | ||
478 | */ | ||
479 | if (!dccp_data_packet(skb) || h->packet_size == 0 || | ||
480 | tfrc_rx_hist_loss_pending(h)) | ||
481 | return; | ||
487 | 482 | ||
488 | if (unlikely(sample > DCCP_SANE_RTT_MAX)) { | 483 | h->rtt_sample_prev = 0; /* reset previous candidate */ |
489 | DCCP_WARN("RTT sample %u too large, using max\n", sample); | 484 | |
490 | sample = DCCP_SANE_RTT_MAX; | 485 | delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); |
486 | if (delta_v == 0) { /* less than RTT/4 difference */ | ||
487 | h->rtt_sample_prev = 1; | ||
488 | return; | ||
491 | } | 489 | } |
490 | sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); | ||
492 | 491 | ||
493 | h->rtt_sample_prev = 0; /* use current entry as next reference */ | 492 | if (delta_v <= 4) /* between RTT/4 and RTT */ |
494 | keep_ref_for_next_time: | 493 | sample *= 4 / delta_v; |
494 | else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) | ||
495 | /* | ||
496 | * Optimisation: CCVal difference is greater than 1 RTT, yet the | ||
497 | * sample is less than the local RTT estimate, which means that | ||
498 | * the RTT estimate is too high. | ||
499 | * To avoid noise, the estimate is left unchanged if the sample is below RTT/2. | ||
500 | */ | ||
501 | return; | ||
495 | 502 | ||
496 | return sample; | 503 | /* Use a lower weight than usual to increase responsiveness */ |
504 | h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); | ||
497 | } | 505 | } |
498 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); | 506 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); |
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 461cc91cce88..555e65cd73a0 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h | |||
@@ -40,12 +40,28 @@ | |||
40 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
41 | #include "tfrc.h" | 41 | #include "tfrc.h" |
42 | 42 | ||
43 | struct tfrc_tx_hist_entry; | 43 | /** |
44 | * tfrc_tx_hist_entry - Simple singly-linked TX history list | ||
45 | * @next: next oldest entry (LIFO order) | ||
46 | * @seqno: sequence number of this entry | ||
47 | * @stamp: send time of packet with sequence number @seqno | ||
48 | */ | ||
49 | struct tfrc_tx_hist_entry { | ||
50 | struct tfrc_tx_hist_entry *next; | ||
51 | u64 seqno; | ||
52 | ktime_t stamp; | ||
53 | }; | ||
54 | |||
55 | static inline struct tfrc_tx_hist_entry * | ||
56 | tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) | ||
57 | { | ||
58 | while (head != NULL && head->seqno != seqno) | ||
59 | head = head->next; | ||
60 | return head; | ||
61 | } | ||
44 | 62 | ||
45 | extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); | 63 | extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); |
46 | extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); | 64 | extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); |
47 | extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, | ||
48 | const u64 seqno, const ktime_t now); | ||
49 | 65 | ||
50 | /* Subtraction a-b modulo-16, respects circular wrap-around */ | 66 | /* Subtraction a-b modulo-16, respects circular wrap-around */ |
51 | #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) | 67 | #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) |
@@ -75,12 +91,22 @@ struct tfrc_rx_hist_entry { | |||
75 | * @loss_count: Number of entries in circular history | 91 | * @loss_count: Number of entries in circular history |
76 | * @loss_start: Movable index (for loss detection) | 92 | * @loss_start: Movable index (for loss detection) |
77 | * @rtt_sample_prev: Used during RTT sampling, points to candidate entry | 93 | * @rtt_sample_prev: Used during RTT sampling, points to candidate entry |
94 | * @rtt_estimate: Receiver RTT estimate | ||
95 | * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) | ||
96 | * @bytes_recvd: Number of bytes received since @bytes_start | ||
97 | * @bytes_start: Start time for counting @bytes_recvd | ||
78 | */ | 98 | */ |
79 | struct tfrc_rx_hist { | 99 | struct tfrc_rx_hist { |
80 | struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; | 100 | struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; |
81 | u8 loss_count:2, | 101 | u8 loss_count:2, |
82 | loss_start:2; | 102 | loss_start:2; |
103 | /* Receiver RTT sampling */ | ||
83 | #define rtt_sample_prev loss_start | 104 | #define rtt_sample_prev loss_start |
105 | u32 rtt_estimate; | ||
106 | /* Receiver sampling of application payload lengths */ | ||
107 | u32 packet_size, | ||
108 | bytes_recvd; | ||
109 | ktime_t bytes_start; | ||
84 | }; | 110 | }; |
85 | 111 | ||
86 | /** | 112 | /** |
@@ -124,20 +150,50 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) | |||
124 | return h->loss_count > 0; | 150 | return h->loss_count > 0; |
125 | } | 151 | } |
126 | 152 | ||
153 | /* | ||
154 | * Accessor functions to retrieve parameters sampled by the RX history | ||
155 | */ | ||
156 | static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) | ||
157 | { | ||
158 | if (h->packet_size == 0) { | ||
159 | DCCP_WARN("No sample for s, using fallback\n"); | ||
160 | return TCP_MIN_RCVMSS; | ||
161 | } | ||
162 | return h->packet_size; | ||
163 | |||
164 | } | ||
165 | static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) | ||
166 | { | ||
167 | if (h->rtt_estimate == 0) { | ||
168 | DCCP_WARN("No RTT estimate available, using fallback RTT\n"); | ||
169 | return DCCP_FALLBACK_RTT; | ||
170 | } | ||
171 | return h->rtt_estimate; | ||
172 | } | ||
173 | |||
174 | static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) | ||
175 | { | ||
176 | h->bytes_recvd = 0; | ||
177 | h->bytes_start = ktime_get_real(); | ||
178 | } | ||
179 | |||
180 | extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); | ||
181 | |||
182 | |||
127 | extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, | 183 | extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, |
128 | const struct sk_buff *skb, const u64 ndp); | 184 | const struct sk_buff *skb, const u64 ndp); |
129 | 185 | ||
130 | extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); | 186 | extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); |
131 | 187 | ||
132 | struct tfrc_loss_hist; | 188 | struct tfrc_loss_hist; |
133 | extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, | 189 | extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, |
134 | struct tfrc_loss_hist *lh, | 190 | struct tfrc_loss_hist *lh, |
135 | struct sk_buff *skb, const u64 ndp, | 191 | struct sk_buff *skb, const u64 ndp, |
136 | u32 (*first_li)(struct sock *sk), | 192 | u32 (*first_li)(struct sock *sk), |
137 | struct sock *sk); | 193 | struct sock *sk); |
138 | extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, | 194 | extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, |
139 | const struct sk_buff *skb); | 195 | const struct sk_buff *skb); |
140 | extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); | 196 | extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); |
141 | extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); | 197 | extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); |
142 | 198 | ||
143 | #endif /* _DCCP_PKT_HIST_ */ | 199 | #endif /* _DCCP_PKT_HIST_ */ |
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ed9857527acf..ede12f53de5a 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h | |||
@@ -48,6 +48,21 @@ static inline u32 scaled_div32(u64 a, u64 b) | |||
48 | } | 48 | } |
49 | 49 | ||
50 | /** | 50 | /** |
51 | * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 | ||
52 | * Uses scaling to improve accuracy of the integer approximation of sqrt(). The | ||
53 | * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for | ||
54 | * clamped RTT samples (dccp_sample_rtt). | ||
55 | * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the | ||
56 | * scaling factor is neutralised. For this purpose, it avoids returning zero. | ||
57 | */ | ||
58 | static inline u16 tfrc_scaled_sqrt(const u32 sample) | ||
59 | { | ||
60 | const unsigned long non_zero_sample = sample ? : 1; | ||
61 | |||
62 | return int_sqrt(non_zero_sample << 10); | ||
63 | } | ||
64 | |||
65 | /** | ||
51 | * tfrc_ewma - Exponentially weighted moving average | 66 | * tfrc_ewma - Exponentially weighted moving average |
52 | * @weight: Weight to be used as damping factor, in units of 1/10 | 67 | * @weight: Weight to be used as damping factor, in units of 1/10 |
53 | */ | 68 | */ |
@@ -58,6 +73,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) | |||
58 | 73 | ||
59 | extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); | 74 | extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); |
60 | extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); | 75 | extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); |
76 | extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); | ||
61 | 77 | ||
62 | extern int tfrc_tx_packet_history_init(void); | 78 | extern int tfrc_tx_packet_history_init(void); |
63 | extern void tfrc_tx_packet_history_exit(void); | 79 | extern void tfrc_tx_packet_history_exit(void); |
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 2f20a29cffe4..38239c4d5e14 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c | |||
@@ -632,8 +632,16 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) | |||
632 | 632 | ||
633 | if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ | 633 | if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ |
634 | if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ | 634 | if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ |
635 | DCCP_WARN("Value of p (%d) below resolution. " | 635 | /* |
636 | "Substituting %d\n", p, TFRC_SMALLEST_P); | 636 | * In the congestion-avoidance phase p decays towards 0 |
637 | * when there are no further losses, so this case is | ||
638 | * natural. Truncating to p_min = 0.01% means that the | ||
639 | * maximum achievable throughput is limited to about | ||
640 | * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. | ||
641 | * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. | ||
642 | */ | ||
643 | tfrc_pr_debug("Value of p (%d) below resolution. " | ||
644 | "Substituting %d\n", p, TFRC_SMALLEST_P); | ||
637 | index = 0; | 645 | index = 0; |
638 | } else /* 0.0001 <= p <= 0.05 */ | 646 | } else /* 0.0001 <= p <= 0.05 */ |
639 | index = p/TFRC_SMALLEST_P - 1; | 647 | index = p/TFRC_SMALLEST_P - 1; |
@@ -658,7 +666,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) | |||
658 | result = scaled_div(s, R); | 666 | result = scaled_div(s, R); |
659 | return scaled_div32(result, f); | 667 | return scaled_div32(result, f); |
660 | } | 668 | } |
661 | |||
662 | EXPORT_SYMBOL_GPL(tfrc_calc_x); | 669 | EXPORT_SYMBOL_GPL(tfrc_calc_x); |
663 | 670 | ||
664 | /** | 671 | /** |
@@ -693,5 +700,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) | |||
693 | index = tfrc_binsearch(fvalue, 0); | 700 | index = tfrc_binsearch(fvalue, 0); |
694 | return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; | 701 | return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; |
695 | } | 702 | } |
696 | |||
697 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); | 703 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); |
704 | |||
705 | /** | ||
706 | * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% | ||
707 | * When @loss_event_rate is large, there is a chance that p is truncated to 0. | ||
708 | * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. | ||
709 | */ | ||
710 | u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) | ||
711 | { | ||
712 | if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ | ||
713 | return 0; | ||
714 | if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ | ||
715 | return 1000000; | ||
716 | return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); | ||
717 | } | ||
718 | EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); | ||
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b4bc6e095a0e..5281190aa19c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h | |||
@@ -42,9 +42,11 @@ | |||
42 | extern int dccp_debug; | 42 | extern int dccp_debug; |
43 | #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) | 43 | #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) |
44 | #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) | 44 | #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) |
45 | #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) | ||
45 | #else | 46 | #else |
46 | #define dccp_pr_debug(format, a...) | 47 | #define dccp_pr_debug(format, a...) |
47 | #define dccp_pr_debug_cat(format, a...) | 48 | #define dccp_pr_debug_cat(format, a...) |
49 | #define dccp_debug(format, a...) | ||
48 | #endif | 50 | #endif |
49 | 51 | ||
50 | extern struct inet_hashinfo dccp_hashinfo; | 52 | extern struct inet_hashinfo dccp_hashinfo; |
@@ -61,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
61 | * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields | 63 | * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields |
62 | * Hence a safe upper bound for the maximum option length is 1020-28 = 992 | 64 | * Hence a safe upper bound for the maximum option length is 1020-28 = 992 |
63 | */ | 65 | */ |
64 | #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) | 66 | #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) |
65 | #define DCCP_MAX_PACKET_HDR 28 | 67 | #define DCCP_MAX_PACKET_HDR 28 |
66 | #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) | 68 | #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) |
67 | #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) | 69 | #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) |
68 | 70 | ||
71 | /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ | ||
72 | #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) | ||
73 | |||
69 | #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT | 74 | #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT |
70 | * state, about 60 seconds */ | 75 | * state, about 60 seconds */ |
71 | 76 | ||
@@ -81,10 +86,13 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
81 | */ | 86 | */ |
82 | #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) | 87 | #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) |
83 | 88 | ||
89 | /* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */ | ||
90 | #define DCCP_TIME_RESOLUTION 10 | ||
91 | |||
84 | /* | 92 | /* |
85 | * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 | 93 | * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 |
86 | */ | 94 | */ |
87 | #define DCCP_SANE_RTT_MIN 100 | 95 | #define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) |
88 | #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) | 96 | #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) |
89 | #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) | 97 | #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) |
90 | 98 | ||
@@ -95,12 +103,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
95 | extern int sysctl_dccp_request_retries; | 103 | extern int sysctl_dccp_request_retries; |
96 | extern int sysctl_dccp_retries1; | 104 | extern int sysctl_dccp_retries1; |
97 | extern int sysctl_dccp_retries2; | 105 | extern int sysctl_dccp_retries2; |
98 | extern int sysctl_dccp_feat_sequence_window; | ||
99 | extern int sysctl_dccp_feat_rx_ccid; | ||
100 | extern int sysctl_dccp_feat_tx_ccid; | ||
101 | extern int sysctl_dccp_feat_ack_ratio; | ||
102 | extern int sysctl_dccp_feat_send_ack_vector; | ||
103 | extern int sysctl_dccp_feat_send_ndp_count; | ||
104 | extern int sysctl_dccp_tx_qlen; | 106 | extern int sysctl_dccp_tx_qlen; |
105 | extern int sysctl_dccp_sync_ratelimit; | 107 | extern int sysctl_dccp_sync_ratelimit; |
106 | 108 | ||
@@ -235,8 +237,22 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
235 | extern void dccp_send_sync(struct sock *sk, const u64 seq, | 237 | extern void dccp_send_sync(struct sock *sk, const u64 seq, |
236 | const enum dccp_pkt_type pkt_type); | 238 | const enum dccp_pkt_type pkt_type); |
237 | 239 | ||
238 | extern void dccp_write_xmit(struct sock *sk, int block); | 240 | /* |
241 | * TX Packet Dequeueing Interface | ||
242 | */ | ||
243 | extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); | ||
244 | extern bool dccp_qpolicy_full(struct sock *sk); | ||
245 | extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); | ||
246 | extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); | ||
247 | extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); | ||
248 | extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); | ||
249 | |||
250 | /* | ||
251 | * TX Packet Output and TX Timers | ||
252 | */ | ||
253 | extern void dccp_write_xmit(struct sock *sk); | ||
239 | extern void dccp_write_space(struct sock *sk); | 254 | extern void dccp_write_space(struct sock *sk); |
255 | extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); | ||
240 | 256 | ||
241 | extern void dccp_init_xmit_timers(struct sock *sk); | 257 | extern void dccp_init_xmit_timers(struct sock *sk); |
242 | static inline void dccp_clear_xmit_timers(struct sock *sk) | 258 | static inline void dccp_clear_xmit_timers(struct sock *sk) |
@@ -252,7 +268,8 @@ extern const char *dccp_state_name(const int state); | |||
252 | extern void dccp_set_state(struct sock *sk, const int state); | 268 | extern void dccp_set_state(struct sock *sk, const int state); |
253 | extern void dccp_done(struct sock *sk); | 269 | extern void dccp_done(struct sock *sk); |
254 | 270 | ||
255 | extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); | 271 | extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, |
272 | struct sk_buff const *skb); | ||
256 | 273 | ||
257 | extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); | 274 | extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); |
258 | 275 | ||
@@ -317,7 +334,14 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, | |||
317 | extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); | 334 | extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); |
318 | extern void dccp_send_close(struct sock *sk, const int active); | 335 | extern void dccp_send_close(struct sock *sk, const int active); |
319 | extern int dccp_invalid_packet(struct sk_buff *skb); | 336 | extern int dccp_invalid_packet(struct sk_buff *skb); |
320 | extern u32 dccp_sample_rtt(struct sock *sk, long delta); | 337 | |
338 | static inline u32 dccp_sane_rtt(long usec_sample) | ||
339 | { | ||
340 | if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) | ||
341 | DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); | ||
342 | return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); | ||
343 | } | ||
344 | extern u32 dccp_sample_rtt(struct sock *sk, long delta); | ||
321 | 345 | ||
322 | static inline int dccp_bad_service_code(const struct sock *sk, | 346 | static inline int dccp_bad_service_code(const struct sock *sk, |
323 | const __be32 service) | 347 | const __be32 service) |
@@ -411,36 +435,62 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, | |||
411 | static inline void dccp_update_gsr(struct sock *sk, u64 seq) | 435 | static inline void dccp_update_gsr(struct sock *sk, u64 seq) |
412 | { | 436 | { |
413 | struct dccp_sock *dp = dccp_sk(sk); | 437 | struct dccp_sock *dp = dccp_sk(sk); |
414 | const struct dccp_minisock *dmsk = dccp_msk(sk); | ||
415 | 438 | ||
416 | dp->dccps_gsr = seq; | 439 | dp->dccps_gsr = seq; |
417 | dccp_set_seqno(&dp->dccps_swl, | 440 | /* Sequence validity window depends on remote Sequence Window (7.5.1) */ |
418 | dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); | 441 | dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); |
419 | dccp_set_seqno(&dp->dccps_swh, | 442 | /* |
420 | dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); | 443 | * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, |
444 | * 7.5.1 we perform this check beyond the initial handshake: W/W' are | ||
445 | * always > 32, so for the first W/W' packets in the lifetime of a | ||
446 | * connection we always have to adjust SWL. | ||
447 | * A second reason why we are doing this is that the window depends on | ||
448 | * the feature-remote value of Sequence Window: nothing stops the peer | ||
449 | * from updating this value while we are busy adjusting SWL for the | ||
450 | * first W packets (we would have to count from scratch again then). | ||
451 | * Therefore it is safer to always make sure that the Sequence Window | ||
452 | * is not artificially extended by a peer who grows SWL downwards by | ||
453 | * continually updating the feature-remote Sequence-Window. | ||
454 | * If sequence numbers wrap it is bad luck. But that will take a while | ||
455 | * (48 bit), and this measure prevents Sequence-number attacks. | ||
456 | */ | ||
457 | if (before48(dp->dccps_swl, dp->dccps_isr)) | ||
458 | dp->dccps_swl = dp->dccps_isr; | ||
459 | dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); | ||
421 | } | 460 | } |
422 | 461 | ||
423 | static inline void dccp_update_gss(struct sock *sk, u64 seq) | 462 | static inline void dccp_update_gss(struct sock *sk, u64 seq) |
424 | { | 463 | { |
425 | struct dccp_sock *dp = dccp_sk(sk); | 464 | struct dccp_sock *dp = dccp_sk(sk); |
426 | 465 | ||
427 | dp->dccps_awh = dp->dccps_gss = seq; | 466 | dp->dccps_gss = seq; |
428 | dccp_set_seqno(&dp->dccps_awl, | 467 | /* Ack validity window depends on local Sequence Window value (7.5.1) */ |
429 | (dp->dccps_gss - | 468 | dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); |
430 | dccp_msk(sk)->dccpms_sequence_window + 1)); | 469 | /* Adjust AWL so that it is not below ISS - see comment above for SWL */ |
470 | if (before48(dp->dccps_awl, dp->dccps_iss)) | ||
471 | dp->dccps_awl = dp->dccps_iss; | ||
472 | dp->dccps_awh = dp->dccps_gss; | ||
473 | } | ||
474 | |||
475 | static inline int dccp_ackvec_pending(const struct sock *sk) | ||
476 | { | ||
477 | return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && | ||
478 | !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); | ||
431 | } | 479 | } |
432 | 480 | ||
433 | static inline int dccp_ack_pending(const struct sock *sk) | 481 | static inline int dccp_ack_pending(const struct sock *sk) |
434 | { | 482 | { |
435 | const struct dccp_sock *dp = dccp_sk(sk); | 483 | return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); |
436 | return dp->dccps_timestamp_echo != 0 || | ||
437 | #ifdef CONFIG_IP_DCCP_ACKVEC | ||
438 | (dccp_msk(sk)->dccpms_send_ack_vector && | ||
439 | dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || | ||
440 | #endif | ||
441 | inet_csk_ack_scheduled(sk); | ||
442 | } | 484 | } |
443 | 485 | ||
486 | extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); | ||
487 | extern int dccp_feat_finalise_settings(struct dccp_sock *dp); | ||
488 | extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); | ||
489 | extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, | ||
490 | struct sk_buff *skb); | ||
491 | extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); | ||
492 | extern void dccp_feat_list_purge(struct list_head *fn_list); | ||
493 | |||
444 | extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); | 494 | extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); |
445 | extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); | 495 | extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); |
446 | extern int dccp_insert_option_elapsed_time(struct sock *sk, | 496 | extern int dccp_insert_option_elapsed_time(struct sock *sk, |
diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f6..93aae7c95550 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c | |||
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) | |||
29 | info->tcpi_backoff = icsk->icsk_backoff; | 29 | info->tcpi_backoff = icsk->icsk_backoff; |
30 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; | 30 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
31 | 31 | ||
32 | if (dccp_msk(sk)->dccpms_send_ack_vector) | 32 | if (dp->dccps_hc_rx_ackvec != NULL) |
33 | info->tcpi_options |= TCPI_OPT_SACK; | 33 | info->tcpi_options |= TCPI_OPT_SACK; |
34 | 34 | ||
35 | ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); | 35 | ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); |
diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 933a0ecf8d46..f94c7c9d1a7f 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c | |||
@@ -1,11 +1,19 @@ | |||
1 | /* | 1 | /* |
2 | * net/dccp/feat.c | 2 | * net/dccp/feat.c |
3 | * | 3 | * |
4 | * An implementation of the DCCP protocol | 4 | * Feature negotiation for the DCCP protocol (RFC 4340, section 6) |
5 | * Andrea Bittau <a.bittau@cs.ucl.ac.uk> | 5 | * |
6 | * Copyright (c) 2008 The University of Aberdeen, Scotland, UK | ||
7 | * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> | ||
8 | * Rewrote from scratch, some bits from earlier code by | ||
9 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> | ||
10 | * | ||
6 | * | 11 | * |
7 | * ASSUMPTIONS | 12 | * ASSUMPTIONS |
8 | * ----------- | 13 | * ----------- |
14 | * o Feature negotiation is coordinated with connection setup (as in TCP), wild | ||
15 | * changes of parameters of an established connection are not supported. | ||
16 | * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. | ||
9 | * o All currently known SP features have 1-byte quantities. If in the future | 17 | * o All currently known SP features have 1-byte quantities. If in the future |
10 | * extensions of RFCs 4340..42 define features with item lengths larger than | 18 | * extensions of RFCs 4340..42 define features with item lengths larger than |
11 | * one byte, a feature-specific extension of the code will be required. | 19 | * one byte, a feature-specific extension of the code will be required. |
@@ -15,635 +23,1510 @@ | |||
15 | * as published by the Free Software Foundation; either version | 23 | * as published by the Free Software Foundation; either version |
16 | * 2 of the License, or (at your option) any later version. | 24 | * 2 of the License, or (at your option) any later version. |
17 | */ | 25 | */ |
18 | |||
19 | #include <linux/module.h> | 26 | #include <linux/module.h> |
20 | |||
21 | #include "ccid.h" | 27 | #include "ccid.h" |
22 | #include "feat.h" | 28 | #include "feat.h" |
23 | 29 | ||
24 | #define DCCP_FEAT_SP_NOAGREE (-123) | 30 | /* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ |
25 | 31 | unsigned long sysctl_dccp_sequence_window __read_mostly = 100; | |
26 | int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, | 32 | int sysctl_dccp_rx_ccid __read_mostly = 2, |
27 | u8 *val, u8 len, gfp_t gfp) | 33 | sysctl_dccp_tx_ccid __read_mostly = 2; |
28 | { | ||
29 | struct dccp_opt_pend *opt; | ||
30 | |||
31 | dccp_feat_debug(type, feature, *val); | ||
32 | |||
33 | if (len > 3) { | ||
34 | DCCP_WARN("invalid length %d\n", len); | ||
35 | return -EINVAL; | ||
36 | } | ||
37 | /* XXX add further sanity checks */ | ||
38 | |||
39 | /* check if that feature is already being negotiated */ | ||
40 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | ||
41 | /* ok we found a negotiation for this option already */ | ||
42 | if (opt->dccpop_feat == feature && opt->dccpop_type == type) { | ||
43 | dccp_pr_debug("Replacing old\n"); | ||
44 | /* replace */ | ||
45 | BUG_ON(opt->dccpop_val == NULL); | ||
46 | kfree(opt->dccpop_val); | ||
47 | opt->dccpop_val = val; | ||
48 | opt->dccpop_len = len; | ||
49 | opt->dccpop_conf = 0; | ||
50 | return 0; | ||
51 | } | ||
52 | } | ||
53 | |||
54 | /* negotiation for a new feature */ | ||
55 | opt = kmalloc(sizeof(*opt), gfp); | ||
56 | if (opt == NULL) | ||
57 | return -ENOMEM; | ||
58 | |||
59 | opt->dccpop_type = type; | ||
60 | opt->dccpop_feat = feature; | ||
61 | opt->dccpop_len = len; | ||
62 | opt->dccpop_val = val; | ||
63 | opt->dccpop_conf = 0; | ||
64 | opt->dccpop_sc = NULL; | ||
65 | |||
66 | BUG_ON(opt->dccpop_val == NULL); | ||
67 | |||
68 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); | ||
69 | return 0; | ||
70 | } | ||
71 | 34 | ||
72 | EXPORT_SYMBOL_GPL(dccp_feat_change); | 35 | /* |
73 | 36 | * Feature activation handlers. | |
74 | static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) | 37 | * |
38 | * These all use a u64 argument, to provide enough room for NN/SP features. At | ||
39 | * this stage the negotiated values have been checked to be within their range. | ||
40 | */ | ||
41 | static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) | ||
75 | { | 42 | { |
76 | struct dccp_sock *dp = dccp_sk(sk); | 43 | struct dccp_sock *dp = dccp_sk(sk); |
77 | struct dccp_minisock *dmsk = dccp_msk(sk); | 44 | struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); |
78 | /* figure out if we are changing our CCID or the peer's */ | ||
79 | const int rx = type == DCCPO_CHANGE_R; | ||
80 | const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; | ||
81 | struct ccid *new_ccid; | ||
82 | |||
83 | /* Check if nothing is being changed. */ | ||
84 | if (ccid_nr == new_ccid_nr) | ||
85 | return 0; | ||
86 | 45 | ||
87 | new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); | ||
88 | if (new_ccid == NULL) | 46 | if (new_ccid == NULL) |
89 | return -ENOMEM; | 47 | return -ENOMEM; |
90 | 48 | ||
91 | if (rx) { | 49 | if (rx) { |
92 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | 50 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); |
93 | dp->dccps_hc_rx_ccid = new_ccid; | 51 | dp->dccps_hc_rx_ccid = new_ccid; |
94 | dmsk->dccpms_rx_ccid = new_ccid_nr; | ||
95 | } else { | 52 | } else { |
96 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | 53 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); |
97 | dp->dccps_hc_tx_ccid = new_ccid; | 54 | dp->dccps_hc_tx_ccid = new_ccid; |
98 | dmsk->dccpms_tx_ccid = new_ccid_nr; | ||
99 | } | 55 | } |
100 | |||
101 | return 0; | 56 | return 0; |
102 | } | 57 | } |
103 | 58 | ||
104 | static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) | 59 | static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) |
105 | { | 60 | { |
106 | dccp_feat_debug(type, feat, val); | 61 | struct dccp_sock *dp = dccp_sk(sk); |
107 | 62 | ||
108 | switch (feat) { | 63 | if (rx) { |
109 | case DCCPF_CCID: | 64 | dp->dccps_r_seq_win = seq_win; |
110 | return dccp_feat_update_ccid(sk, type, val); | 65 | /* propagate changes to update SWL/SWH */ |
111 | default: | 66 | dccp_update_gsr(sk, dp->dccps_gsr); |
112 | dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", | 67 | } else { |
113 | dccp_feat_typename(type), feat); | 68 | dp->dccps_l_seq_win = seq_win; |
114 | break; | 69 | /* propagate changes to update AWL */ |
70 | dccp_update_gss(sk, dp->dccps_gss); | ||
115 | } | 71 | } |
116 | return 0; | 72 | return 0; |
117 | } | 73 | } |
118 | 74 | ||
119 | static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, | 75 | static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) |
120 | u8 *rpref, u8 rlen) | 76 | { |
77 | #ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ | ||
78 | /* | ||
79 | * FIXME: This is required until several problems in the CCID-2 code are | ||
80 | * resolved. The CCID-2 code currently does not cope well; using dynamic | ||
81 | * Ack Ratios greater than 1 caused instabilities. These were manifest | ||
82 | * in hangups and long RTO timeouts (1...3 seconds). Until this has been | ||
83 | * stabilised, it is safer not to activate dynamic Ack Ratio changes. | ||
84 | */ | ||
85 | dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", | ||
86 | rx ? "RX" : "TX", (u16)ratio); | ||
87 | ratio = 1; | ||
88 | #endif | ||
89 | if (rx) | ||
90 | dccp_sk(sk)->dccps_r_ack_ratio = ratio; | ||
91 | else | ||
92 | dccp_sk(sk)->dccps_l_ack_ratio = ratio; | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) | ||
121 | { | 97 | { |
122 | struct dccp_sock *dp = dccp_sk(sk); | 98 | struct dccp_sock *dp = dccp_sk(sk); |
123 | u8 *spref, slen, *res = NULL; | ||
124 | int i, j, rc, agree = 1; | ||
125 | 99 | ||
126 | BUG_ON(rpref == NULL); | 100 | if (rx) { |
101 | if (enable && dp->dccps_hc_rx_ackvec == NULL) { | ||
102 | dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); | ||
103 | if (dp->dccps_hc_rx_ackvec == NULL) | ||
104 | return -ENOMEM; | ||
105 | } else if (!enable) { | ||
106 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | ||
107 | dp->dccps_hc_rx_ackvec = NULL; | ||
108 | } | ||
109 | } | ||
110 | return 0; | ||
111 | } | ||
127 | 112 | ||
128 | /* check if we are the black sheep */ | 113 | static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) |
129 | if (dp->dccps_role == DCCP_ROLE_CLIENT) { | 114 | { |
130 | spref = rpref; | 115 | if (!rx) |
131 | slen = rlen; | 116 | dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); |
132 | rpref = opt->dccpop_val; | 117 | return 0; |
133 | rlen = opt->dccpop_len; | 118 | } |
134 | } else { | 119 | |
135 | spref = opt->dccpop_val; | 120 | /* |
136 | slen = opt->dccpop_len; | 121 | * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that |
122 | * `rx' holds when the sending peer announces its partial coverage via a | ||
123 | * ChangeR() option. In the other case, we are the sender and the receiver | ||
124 | * announces its coverage via ChangeL() options. The policy here is to honour | ||
125 | * such communication by enabling the corresponding partial coverage - but only | ||
126 | * if it has not been set manually before; the warning here means that all | ||
127 | * packets will be dropped. | ||
128 | */ | ||
129 | static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) | ||
130 | { | ||
131 | struct dccp_sock *dp = dccp_sk(sk); | ||
132 | |||
133 | if (rx) | ||
134 | dp->dccps_pcrlen = cscov; | ||
135 | else { | ||
136 | if (dp->dccps_pcslen == 0) | ||
137 | dp->dccps_pcslen = cscov; | ||
138 | else if (cscov > dp->dccps_pcslen) | ||
139 | DCCP_WARN("CsCov %u too small, peer requires >= %u\n", | ||
140 | dp->dccps_pcslen, (u8)cscov); | ||
137 | } | 141 | } |
142 | return 0; | ||
143 | } | ||
144 | |||
145 | static const struct { | ||
146 | u8 feat_num; /* DCCPF_xxx */ | ||
147 | enum dccp_feat_type rxtx; /* RX or TX */ | ||
148 | enum dccp_feat_type reconciliation; /* SP or NN */ | ||
149 | u8 default_value; /* as in 6.4 */ | ||
150 | int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); | ||
151 | /* | ||
152 | * Lookup table for location and type of features (from RFC 4340/4342) | ||
153 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
154 | * | Feature | Location | Reconc. | Initial | Section | | ||
155 | * | | RX | TX | SP | NN | Value | Reference | | ||
156 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
157 | * | DCCPF_CCID | | X | X | | 2 | 10 | | ||
158 | * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | | ||
159 | * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | | ||
160 | * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | | ||
161 | * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | | ||
162 | * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | | ||
163 | * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | | ||
164 | * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | | ||
165 | * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | | ||
166 | * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | | ||
167 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
168 | */ | ||
169 | } dccp_feat_table[] = { | ||
170 | { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, | ||
171 | { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, | ||
172 | { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, | ||
173 | { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
174 | { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, | ||
175 | { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, | ||
176 | { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, | ||
177 | { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, | ||
178 | { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
179 | { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
180 | }; | ||
181 | #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) | ||
182 | |||
183 | /** | ||
184 | * dccp_feat_index - Hash function to map feature number into array position | ||
185 | * Returns consecutive array index or -1 if the feature is not understood. | ||
186 | */ | ||
187 | static int dccp_feat_index(u8 feat_num) | ||
188 | { | ||
189 | /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ | ||
190 | if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) | ||
191 | return feat_num - 1; | ||
192 | |||
138 | /* | 193 | /* |
139 | * Now we have server preference list in spref and client preference in | 194 | * Other features: add cases for new feature types here after adding |
140 | * rpref | 195 | * them to the above table. |
141 | */ | 196 | */ |
142 | BUG_ON(spref == NULL); | 197 | switch (feat_num) { |
143 | BUG_ON(rpref == NULL); | 198 | case DCCPF_SEND_LEV_RATE: |
199 | return DCCP_FEAT_SUPPORTED_MAX - 1; | ||
200 | } | ||
201 | return -1; | ||
202 | } | ||
144 | 203 | ||
145 | /* FIXME sanity check vals */ | 204 | static u8 dccp_feat_type(u8 feat_num) |
205 | { | ||
206 | int idx = dccp_feat_index(feat_num); | ||
146 | 207 | ||
147 | /* Are values in any order? XXX Lame "algorithm" here */ | 208 | if (idx < 0) |
148 | for (i = 0; i < slen; i++) { | 209 | return FEAT_UNKNOWN; |
149 | for (j = 0; j < rlen; j++) { | 210 | return dccp_feat_table[idx].reconciliation; |
150 | if (spref[i] == rpref[j]) { | 211 | } |
151 | res = &spref[i]; | ||
152 | break; | ||
153 | } | ||
154 | } | ||
155 | if (res) | ||
156 | break; | ||
157 | } | ||
158 | 212 | ||
159 | /* we didn't agree on anything */ | 213 | static int dccp_feat_default_value(u8 feat_num) |
160 | if (res == NULL) { | 214 | { |
161 | /* confirm previous value */ | 215 | int idx = dccp_feat_index(feat_num); |
162 | switch (opt->dccpop_feat) { | ||
163 | case DCCPF_CCID: | ||
164 | /* XXX did i get this right? =P */ | ||
165 | if (opt->dccpop_type == DCCPO_CHANGE_L) | ||
166 | res = &dccp_msk(sk)->dccpms_tx_ccid; | ||
167 | else | ||
168 | res = &dccp_msk(sk)->dccpms_rx_ccid; | ||
169 | break; | ||
170 | 216 | ||
171 | default: | 217 | return idx < 0 ? : dccp_feat_table[idx].default_value; |
172 | DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); | 218 | } |
173 | /* XXX implement res */ | ||
174 | return -EFAULT; | ||
175 | } | ||
176 | 219 | ||
177 | dccp_pr_debug("Don't agree... reconfirming %d\n", *res); | 220 | /* |
178 | agree = 0; /* this is used for mandatory options... */ | 221 | * Debugging and verbose-printing section |
222 | */ | ||
223 | static const char *dccp_feat_fname(const u8 feat) | ||
224 | { | ||
225 | static const char *feature_names[] = { | ||
226 | [DCCPF_RESERVED] = "Reserved", | ||
227 | [DCCPF_CCID] = "CCID", | ||
228 | [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", | ||
229 | [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", | ||
230 | [DCCPF_ECN_INCAPABLE] = "ECN Incapable", | ||
231 | [DCCPF_ACK_RATIO] = "Ack Ratio", | ||
232 | [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", | ||
233 | [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", | ||
234 | [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", | ||
235 | [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", | ||
236 | }; | ||
237 | if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) | ||
238 | return feature_names[DCCPF_RESERVED]; | ||
239 | |||
240 | if (feat == DCCPF_SEND_LEV_RATE) | ||
241 | return "Send Loss Event Rate"; | ||
242 | if (feat >= DCCPF_MIN_CCID_SPECIFIC) | ||
243 | return "CCID-specific"; | ||
244 | |||
245 | return feature_names[feat]; | ||
246 | } | ||
247 | |||
248 | static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", | ||
249 | "UNSTABLE", "STABLE" }; | ||
250 | |||
251 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
252 | static const char *dccp_feat_oname(const u8 opt) | ||
253 | { | ||
254 | switch (opt) { | ||
255 | case DCCPO_CHANGE_L: return "Change_L"; | ||
256 | case DCCPO_CONFIRM_L: return "Confirm_L"; | ||
257 | case DCCPO_CHANGE_R: return "Change_R"; | ||
258 | case DCCPO_CONFIRM_R: return "Confirm_R"; | ||
179 | } | 259 | } |
260 | return NULL; | ||
261 | } | ||
180 | 262 | ||
181 | /* need to put result and our preference list */ | 263 | static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) |
182 | rlen = 1 + opt->dccpop_len; | 264 | { |
183 | rpref = kmalloc(rlen, GFP_ATOMIC); | 265 | u8 i, type = dccp_feat_type(feat_num); |
184 | if (rpref == NULL) | 266 | |
185 | return -ENOMEM; | 267 | if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) |
268 | dccp_pr_debug_cat("(NULL)"); | ||
269 | else if (type == FEAT_SP) | ||
270 | for (i = 0; i < val->sp.len; i++) | ||
271 | dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); | ||
272 | else if (type == FEAT_NN) | ||
273 | dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); | ||
274 | else | ||
275 | dccp_pr_debug_cat("unknown type %u", type); | ||
276 | } | ||
277 | |||
278 | static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) | ||
279 | { | ||
280 | u8 type = dccp_feat_type(feat_num); | ||
281 | dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; | ||
282 | |||
283 | if (type == FEAT_NN) | ||
284 | fval.nn = dccp_decode_value_var(list, len); | ||
285 | dccp_feat_printval(feat_num, &fval); | ||
286 | } | ||
287 | |||
288 | static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) | ||
289 | { | ||
290 | dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", | ||
291 | dccp_feat_fname(entry->feat_num)); | ||
292 | dccp_feat_printval(entry->feat_num, &entry->val); | ||
293 | dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], | ||
294 | entry->needs_confirm ? "(Confirm pending)" : ""); | ||
295 | } | ||
296 | |||
297 | #define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ | ||
298 | dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ | ||
299 | dccp_feat_printvals(feat, val, len); \ | ||
300 | dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) | ||
301 | |||
302 | #define dccp_feat_print_fnlist(fn_list) { \ | ||
303 | const struct dccp_feat_entry *___entry; \ | ||
304 | \ | ||
305 | dccp_pr_debug("List Dump:\n"); \ | ||
306 | list_for_each_entry(___entry, fn_list, node) \ | ||
307 | dccp_feat_print_entry(___entry); \ | ||
308 | } | ||
309 | #else /* ! CONFIG_IP_DCCP_DEBUG */ | ||
310 | #define dccp_feat_print_opt(opt, feat, val, len, mandatory) | ||
311 | #define dccp_feat_print_fnlist(fn_list) | ||
312 | #endif | ||
186 | 313 | ||
187 | *rpref = *res; | 314 | static int __dccp_feat_activate(struct sock *sk, const int idx, |
188 | memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); | 315 | const bool is_local, dccp_feat_val const *fval) |
316 | { | ||
317 | bool rx; | ||
318 | u64 val; | ||
319 | |||
320 | if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) | ||
321 | return -1; | ||
322 | if (dccp_feat_table[idx].activation_hdlr == NULL) | ||
323 | return 0; | ||
189 | 324 | ||
190 | /* put it in the "confirm queue" */ | 325 | if (fval == NULL) { |
191 | if (opt->dccpop_sc == NULL) { | 326 | val = dccp_feat_table[idx].default_value; |
192 | opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); | 327 | } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { |
193 | if (opt->dccpop_sc == NULL) { | 328 | if (fval->sp.vec == NULL) { |
194 | kfree(rpref); | 329 | /* |
195 | return -ENOMEM; | 330 | * This can happen when an empty Confirm is sent |
331 | * for an SP (i.e. known) feature. In this case | ||
332 | * we would be using the default anyway. | ||
333 | */ | ||
334 | DCCP_CRIT("Feature #%d undefined: using default", idx); | ||
335 | val = dccp_feat_table[idx].default_value; | ||
336 | } else { | ||
337 | val = fval->sp.vec[0]; | ||
196 | } | 338 | } |
197 | } else { | 339 | } else { |
198 | /* recycle the confirm slot */ | 340 | val = fval->nn; |
199 | BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); | ||
200 | kfree(opt->dccpop_sc->dccpoc_val); | ||
201 | dccp_pr_debug("recycling confirm slot\n"); | ||
202 | } | ||
203 | memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); | ||
204 | |||
205 | opt->dccpop_sc->dccpoc_val = rpref; | ||
206 | opt->dccpop_sc->dccpoc_len = rlen; | ||
207 | |||
208 | /* update the option on our side [we are about to send the confirm] */ | ||
209 | rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); | ||
210 | if (rc) { | ||
211 | kfree(opt->dccpop_sc->dccpoc_val); | ||
212 | kfree(opt->dccpop_sc); | ||
213 | opt->dccpop_sc = NULL; | ||
214 | return rc; | ||
215 | } | 341 | } |
216 | 342 | ||
217 | dccp_pr_debug("Will confirm %d\n", *rpref); | 343 | /* Location is RX if this is a local-RX or remote-TX feature */ |
344 | rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); | ||
218 | 345 | ||
219 | /* say we want to change to X but we just got a confirm X, suppress our | 346 | dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", |
220 | * change | 347 | dccp_feat_fname(dccp_feat_table[idx].feat_num), |
221 | */ | 348 | fval ? "" : "default ", (unsigned long long)val); |
222 | if (!opt->dccpop_conf) { | 349 | |
223 | if (*opt->dccpop_val == *res) | 350 | return dccp_feat_table[idx].activation_hdlr(sk, val, rx); |
224 | opt->dccpop_conf = 1; | 351 | } |
225 | dccp_pr_debug("won't ask for change of same feature\n"); | 352 | |
353 | /** | ||
354 | * dccp_feat_activate - Activate feature value on socket | ||
355 | * @sk: fully connected DCCP socket (after handshake is complete) | ||
356 | * @feat_num: feature to activate, one of %dccp_feature_numbers | ||
357 | * @local: whether local (1) or remote (0) @feat_num is meant | ||
358 | * @fval: the value (SP or NN) to activate, or NULL to use the default value | ||
359 | * For general use this function is preferable over __dccp_feat_activate(). | ||
360 | */ | ||
361 | static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, | ||
362 | dccp_feat_val const *fval) | ||
363 | { | ||
364 | return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); | ||
365 | } | ||
366 | |||
367 | /* Test for "Req'd" feature (RFC 4340, 6.4) */ | ||
368 | static inline int dccp_feat_must_be_understood(u8 feat_num) | ||
369 | { | ||
370 | return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || | ||
371 | feat_num == DCCPF_SEQUENCE_WINDOW; | ||
372 | } | ||
373 | |||
374 | /* copy constructor, fval must not already contain allocated memory */ | ||
375 | static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) | ||
376 | { | ||
377 | fval->sp.len = len; | ||
378 | if (fval->sp.len > 0) { | ||
379 | fval->sp.vec = kmemdup(val, len, gfp_any()); | ||
380 | if (fval->sp.vec == NULL) { | ||
381 | fval->sp.len = 0; | ||
382 | return -ENOBUFS; | ||
383 | } | ||
226 | } | 384 | } |
385 | return 0; | ||
386 | } | ||
227 | 387 | ||
228 | return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ | 388 | static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) |
389 | { | ||
390 | if (unlikely(val == NULL)) | ||
391 | return; | ||
392 | if (dccp_feat_type(feat_num) == FEAT_SP) | ||
393 | kfree(val->sp.vec); | ||
394 | memset(val, 0, sizeof(*val)); | ||
229 | } | 395 | } |
230 | 396 | ||
231 | static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) | 397 | static struct dccp_feat_entry * |
398 | dccp_feat_clone_entry(struct dccp_feat_entry const *original) | ||
232 | { | 399 | { |
233 | struct dccp_minisock *dmsk = dccp_msk(sk); | 400 | struct dccp_feat_entry *new; |
234 | struct dccp_opt_pend *opt; | 401 | u8 type = dccp_feat_type(original->feat_num); |
235 | int rc = 1; | ||
236 | u8 t; | ||
237 | 402 | ||
238 | /* | 403 | if (type == FEAT_UNKNOWN) |
239 | * We received a CHANGE. We gotta match it against our own preference | 404 | return NULL; |
240 | * list. If we got a CHANGE_R it means it's a change for us, so we need | ||
241 | * to compare our CHANGE_L list. | ||
242 | */ | ||
243 | if (type == DCCPO_CHANGE_L) | ||
244 | t = DCCPO_CHANGE_R; | ||
245 | else | ||
246 | t = DCCPO_CHANGE_L; | ||
247 | 405 | ||
248 | /* find our preference list for this feature */ | 406 | new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); |
249 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | 407 | if (new == NULL) |
250 | if (opt->dccpop_type != t || opt->dccpop_feat != feature) | 408 | return NULL; |
251 | continue; | ||
252 | 409 | ||
253 | /* find the winner from the two preference lists */ | 410 | if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, |
254 | rc = dccp_feat_reconcile(sk, opt, val, len); | 411 | original->val.sp.vec, |
255 | break; | 412 | original->val.sp.len)) { |
413 | kfree(new); | ||
414 | return NULL; | ||
256 | } | 415 | } |
416 | return new; | ||
417 | } | ||
257 | 418 | ||
258 | /* We didn't deal with the change. This can happen if we have no | 419 | static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) |
259 | * preference list for the feature. In fact, it just shouldn't | 420 | { |
260 | * happen---if we understand a feature, we should have a preference list | 421 | if (entry != NULL) { |
261 | * with at least the default value. | 422 | dccp_feat_val_destructor(entry->feat_num, &entry->val); |
262 | */ | 423 | kfree(entry); |
263 | BUG_ON(rc == 1); | 424 | } |
425 | } | ||
264 | 426 | ||
265 | return rc; | 427 | /* |
428 | * List management functions | ||
429 | * | ||
430 | * Feature negotiation lists rely on and maintain the following invariants: | ||
431 | * - each feat_num in the list is known, i.e. we know its type and default value | ||
432 | * - each feat_num/is_local combination is unique (old entries are overwritten) | ||
433 | * - SP values are always freshly allocated | ||
434 | * - list is sorted in increasing order of feature number (faster lookup) | ||
435 | */ | ||
436 | static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, | ||
437 | u8 feat_num, bool is_local) | ||
438 | { | ||
439 | struct dccp_feat_entry *entry; | ||
440 | |||
441 | list_for_each_entry(entry, fn_list, node) | ||
442 | if (entry->feat_num == feat_num && entry->is_local == is_local) | ||
443 | return entry; | ||
444 | else if (entry->feat_num > feat_num) | ||
445 | break; | ||
446 | return NULL; | ||
266 | } | 447 | } |
267 | 448 | ||
268 | static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) | 449 | /** |
450 | * dccp_feat_entry_new - Central list update routine (called by all others) | ||
451 | * @head: list to add to | ||
452 | * @feat: feature number | ||
453 | * @local: whether the local (1) or remote feature with number @feat is meant | ||
454 | * This is the only constructor and serves to ensure the above invariants. | ||
455 | */ | ||
456 | static struct dccp_feat_entry * | ||
457 | dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) | ||
269 | { | 458 | { |
270 | struct dccp_opt_pend *opt; | 459 | struct dccp_feat_entry *entry; |
271 | struct dccp_minisock *dmsk = dccp_msk(sk); | 460 | |
272 | u8 *copy; | 461 | list_for_each_entry(entry, head, node) |
273 | int rc; | 462 | if (entry->feat_num == feat && entry->is_local == local) { |
463 | dccp_feat_val_destructor(entry->feat_num, &entry->val); | ||
464 | return entry; | ||
465 | } else if (entry->feat_num > feat) { | ||
466 | head = &entry->node; | ||
467 | break; | ||
468 | } | ||
274 | 469 | ||
275 | /* NN features must be Change L (sec. 6.3.2) */ | 470 | entry = kmalloc(sizeof(*entry), gfp_any()); |
276 | if (type != DCCPO_CHANGE_L) { | 471 | if (entry != NULL) { |
277 | dccp_pr_debug("received %s for NN feature %d\n", | 472 | entry->feat_num = feat; |
278 | dccp_feat_typename(type), feature); | 473 | entry->is_local = local; |
279 | return -EFAULT; | 474 | list_add_tail(&entry->node, head); |
280 | } | 475 | } |
476 | return entry; | ||
477 | } | ||
281 | 478 | ||
282 | /* XXX sanity check opt val */ | 479 | /** |
480 | * dccp_feat_push_change - Add/overwrite a Change option in the list | ||
481 | * @fn_list: feature-negotiation list to update | ||
482 | * @feat: one of %dccp_feature_numbers | ||
483 | * @local: whether local (1) or remote (0) @feat_num is meant | ||
484 | * @needs_mandatory: whether to use Mandatory feature negotiation options | ||
485 | * @fval: pointer to NN/SP value to be inserted (will be copied) | ||
486 | */ | ||
487 | static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, | ||
488 | u8 mandatory, dccp_feat_val *fval) | ||
489 | { | ||
490 | struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); | ||
283 | 491 | ||
284 | /* copy option so we can confirm it */ | 492 | if (new == NULL) |
285 | opt = kzalloc(sizeof(*opt), GFP_ATOMIC); | ||
286 | if (opt == NULL) | ||
287 | return -ENOMEM; | 493 | return -ENOMEM; |
288 | 494 | ||
289 | copy = kmemdup(val, len, GFP_ATOMIC); | 495 | new->feat_num = feat; |
290 | if (copy == NULL) { | 496 | new->is_local = local; |
291 | kfree(opt); | 497 | new->state = FEAT_INITIALISING; |
292 | return -ENOMEM; | 498 | new->needs_confirm = 0; |
293 | } | 499 | new->empty_confirm = 0; |
500 | new->val = *fval; | ||
501 | new->needs_mandatory = mandatory; | ||
294 | 502 | ||
295 | opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ | 503 | return 0; |
296 | opt->dccpop_feat = feature; | 504 | } |
297 | opt->dccpop_val = copy; | ||
298 | opt->dccpop_len = len; | ||
299 | 505 | ||
300 | /* change feature */ | 506 | /** |
301 | rc = dccp_feat_update(sk, type, feature, *val); | 507 | * dccp_feat_push_confirm - Add a Confirm entry to the FN list |
302 | if (rc) { | 508 | * @fn_list: feature-negotiation list to add to |
303 | kfree(opt->dccpop_val); | 509 | * @feat: one of %dccp_feature_numbers |
304 | kfree(opt); | 510 | * @local: whether local (1) or remote (0) @feat_num is being confirmed |
305 | return rc; | 511 | * @fval: pointer to NN/SP value to be inserted or NULL |
306 | } | 512 | * Returns 0 on success, a Reset code for further processing otherwise. |
513 | */ | ||
514 | static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, | ||
515 | dccp_feat_val *fval) | ||
516 | { | ||
517 | struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); | ||
307 | 518 | ||
308 | dccp_feat_debug(type, feature, *copy); | 519 | if (new == NULL) |
520 | return DCCP_RESET_CODE_TOO_BUSY; | ||
309 | 521 | ||
310 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); | 522 | new->feat_num = feat; |
523 | new->is_local = local; | ||
524 | new->state = FEAT_STABLE; /* transition in 6.6.2 */ | ||
525 | new->needs_confirm = 1; | ||
526 | new->empty_confirm = (fval == NULL); | ||
527 | new->val.nn = 0; /* zeroes the whole structure */ | ||
528 | if (!new->empty_confirm) | ||
529 | new->val = *fval; | ||
530 | new->needs_mandatory = 0; | ||
311 | 531 | ||
312 | return 0; | 532 | return 0; |
313 | } | 533 | } |
314 | 534 | ||
315 | static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, | 535 | static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) |
316 | u8 type, u8 feature) | ||
317 | { | 536 | { |
318 | /* XXX check if other confirms for that are queued and recycle slot */ | 537 | return dccp_feat_push_confirm(fn_list, feat, local, NULL); |
319 | struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); | 538 | } |
320 | 539 | ||
321 | if (opt == NULL) { | 540 | static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) |
322 | /* XXX what do we do? Ignoring should be fine. It's a change | 541 | { |
323 | * after all =P | 542 | list_del(&entry->node); |
324 | */ | 543 | dccp_feat_entry_destructor(entry); |
325 | return; | 544 | } |
326 | } | ||
327 | 545 | ||
328 | switch (type) { | 546 | void dccp_feat_list_purge(struct list_head *fn_list) |
329 | case DCCPO_CHANGE_L: | 547 | { |
330 | opt->dccpop_type = DCCPO_CONFIRM_R; | 548 | struct dccp_feat_entry *entry, *next; |
331 | break; | 549 | |
332 | case DCCPO_CHANGE_R: | 550 | list_for_each_entry_safe(entry, next, fn_list, node) |
333 | opt->dccpop_type = DCCPO_CONFIRM_L; | 551 | dccp_feat_entry_destructor(entry); |
334 | break; | 552 | INIT_LIST_HEAD(fn_list); |
335 | default: | 553 | } |
336 | DCCP_WARN("invalid type %d\n", type); | 554 | EXPORT_SYMBOL_GPL(dccp_feat_list_purge); |
337 | kfree(opt); | 555 | |
338 | return; | 556 | /* generate @to as full clone of @from - @to must not contain any nodes */ |
557 | int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) | ||
558 | { | ||
559 | struct dccp_feat_entry *entry, *new; | ||
560 | |||
561 | INIT_LIST_HEAD(to); | ||
562 | list_for_each_entry(entry, from, node) { | ||
563 | new = dccp_feat_clone_entry(entry); | ||
564 | if (new == NULL) | ||
565 | goto cloning_failed; | ||
566 | list_add_tail(&new->node, to); | ||
339 | } | 567 | } |
340 | opt->dccpop_feat = feature; | 568 | return 0; |
341 | opt->dccpop_val = NULL; | ||
342 | opt->dccpop_len = 0; | ||
343 | 569 | ||
344 | /* change feature */ | 570 | cloning_failed: |
345 | dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); | 571 | dccp_feat_list_purge(to); |
572 | return -ENOMEM; | ||
573 | } | ||
346 | 574 | ||
347 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); | 575 | /** |
576 | * dccp_feat_valid_nn_length - Enforce length constraints on NN options | ||
577 | * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, | ||
578 | * incoming options are accepted as long as their values are valid. | ||
579 | */ | ||
580 | static u8 dccp_feat_valid_nn_length(u8 feat_num) | ||
581 | { | ||
582 | if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ | ||
583 | return 2; | ||
584 | if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ | ||
585 | return 6; | ||
586 | return 0; | ||
348 | } | 587 | } |
349 | 588 | ||
350 | static void dccp_feat_flush_confirm(struct sock *sk) | 589 | static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) |
351 | { | 590 | { |
352 | struct dccp_minisock *dmsk = dccp_msk(sk); | 591 | switch (feat_num) { |
353 | /* Check if there is anything to confirm in the first place */ | 592 | case DCCPF_ACK_RATIO: |
354 | int yes = !list_empty(&dmsk->dccpms_conf); | 593 | return val <= DCCPF_ACK_RATIO_MAX; |
594 | case DCCPF_SEQUENCE_WINDOW: | ||
595 | return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; | ||
596 | } | ||
597 | return 0; /* feature unknown - so we can't tell */ | ||
598 | } | ||
355 | 599 | ||
356 | if (!yes) { | 600 | /* check that SP values are within the ranges defined in RFC 4340 */ |
357 | struct dccp_opt_pend *opt; | 601 | static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) |
602 | { | ||
603 | switch (feat_num) { | ||
604 | case DCCPF_CCID: | ||
605 | return val == DCCPC_CCID2 || val == DCCPC_CCID3; | ||
606 | /* Type-check Boolean feature values: */ | ||
607 | case DCCPF_SHORT_SEQNOS: | ||
608 | case DCCPF_ECN_INCAPABLE: | ||
609 | case DCCPF_SEND_ACK_VECTOR: | ||
610 | case DCCPF_SEND_NDP_COUNT: | ||
611 | case DCCPF_DATA_CHECKSUM: | ||
612 | case DCCPF_SEND_LEV_RATE: | ||
613 | return val < 2; | ||
614 | case DCCPF_MIN_CSUM_COVER: | ||
615 | return val < 16; | ||
616 | } | ||
617 | return 0; /* feature unknown */ | ||
618 | } | ||
619 | |||
620 | static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) | ||
621 | { | ||
622 | if (sp_list == NULL || sp_len < 1) | ||
623 | return 0; | ||
624 | while (sp_len--) | ||
625 | if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) | ||
626 | return 0; | ||
627 | return 1; | ||
628 | } | ||
358 | 629 | ||
359 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | 630 | /** |
360 | if (opt->dccpop_conf) { | 631 | * dccp_feat_insert_opts - Generate FN options from current list state |
361 | yes = 1; | 632 | * @skb: next sk_buff to be sent to the peer |
362 | break; | 633 | * @dp: for client during handshake and general negotiation |
634 | * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) | ||
635 | */ | ||
636 | int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, | ||
637 | struct sk_buff *skb) | ||
638 | { | ||
639 | struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; | ||
640 | struct dccp_feat_entry *pos, *next; | ||
641 | u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; | ||
642 | bool rpt; | ||
643 | |||
644 | /* put entries into @skb in the order they appear in the list */ | ||
645 | list_for_each_entry_safe_reverse(pos, next, fn, node) { | ||
646 | opt = dccp_feat_genopt(pos); | ||
647 | type = dccp_feat_type(pos->feat_num); | ||
648 | rpt = false; | ||
649 | |||
650 | if (pos->empty_confirm) { | ||
651 | len = 0; | ||
652 | ptr = NULL; | ||
653 | } else { | ||
654 | if (type == FEAT_SP) { | ||
655 | len = pos->val.sp.len; | ||
656 | ptr = pos->val.sp.vec; | ||
657 | rpt = pos->needs_confirm; | ||
658 | } else if (type == FEAT_NN) { | ||
659 | len = dccp_feat_valid_nn_length(pos->feat_num); | ||
660 | ptr = nn_in_nbo; | ||
661 | dccp_encode_value_var(pos->val.nn, ptr, len); | ||
662 | } else { | ||
663 | DCCP_BUG("unknown feature %u", pos->feat_num); | ||
664 | return -1; | ||
363 | } | 665 | } |
364 | } | 666 | } |
667 | dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); | ||
668 | |||
669 | if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) | ||
670 | return -1; | ||
671 | if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) | ||
672 | return -1; | ||
673 | /* | ||
674 | * Enter CHANGING after transmitting the Change option (6.6.2). | ||
675 | */ | ||
676 | if (pos->state == FEAT_INITIALISING) | ||
677 | pos->state = FEAT_CHANGING; | ||
365 | } | 678 | } |
679 | return 0; | ||
680 | } | ||
366 | 681 | ||
367 | if (!yes) | 682 | /** |
368 | return; | 683 | * __feat_register_nn - Register new NN value on socket |
684 | * @fn: feature-negotiation list to register with | ||
685 | * @feat: an NN feature from %dccp_feature_numbers | ||
686 | * @mandatory: use Mandatory option if 1 | ||
687 | * @nn_val: value to register (restricted to 4 bytes) | ||
688 | * Note that NN features are local by definition (RFC 4340, 6.3.2). | ||
689 | */ | ||
690 | static int __feat_register_nn(struct list_head *fn, u8 feat, | ||
691 | u8 mandatory, u64 nn_val) | ||
692 | { | ||
693 | dccp_feat_val fval = { .nn = nn_val }; | ||
369 | 694 | ||
370 | /* OK there is something to confirm... */ | 695 | if (dccp_feat_type(feat) != FEAT_NN || |
371 | /* XXX check if packet is in flight? Send delayed ack?? */ | 696 | !dccp_feat_is_valid_nn_val(feat, nn_val)) |
372 | if (sk->sk_state == DCCP_OPEN) | 697 | return -EINVAL; |
373 | dccp_send_ack(sk); | 698 | |
699 | /* Don't bother with default values, they will be activated anyway. */ | ||
700 | if (nn_val - (u64)dccp_feat_default_value(feat) == 0) | ||
701 | return 0; | ||
702 | |||
703 | return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); | ||
374 | } | 704 | } |
375 | 705 | ||
376 | int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) | 706 | /** |
707 | * __feat_register_sp - Register new SP value/list on socket | ||
708 | * @fn: feature-negotiation list to register with | ||
709 | * @feat: an SP feature from %dccp_feature_numbers | ||
710 | * @is_local: whether the local (1) or the remote (0) @feat is meant | ||
711 | * @mandatory: use Mandatory option if 1 | ||
712 | * @sp_val: SP value followed by optional preference list | ||
713 | * @sp_len: length of @sp_val in bytes | ||
714 | */ | ||
715 | static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, | ||
716 | u8 mandatory, u8 const *sp_val, u8 sp_len) | ||
377 | { | 717 | { |
378 | int rc; | 718 | dccp_feat_val fval; |
379 | 719 | ||
380 | dccp_feat_debug(type, feature, *val); | 720 | if (dccp_feat_type(feat) != FEAT_SP || |
721 | !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) | ||
722 | return -EINVAL; | ||
381 | 723 | ||
382 | /* figure out if it's SP or NN feature */ | 724 | /* Avoid negotiating alien CCIDs by only advertising supported ones */ |
383 | switch (feature) { | 725 | if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) |
384 | /* deal with SP features */ | 726 | return -EOPNOTSUPP; |
385 | case DCCPF_CCID: | ||
386 | rc = dccp_feat_sp(sk, type, feature, val, len); | ||
387 | break; | ||
388 | 727 | ||
389 | /* deal with NN features */ | 728 | if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) |
390 | case DCCPF_ACK_RATIO: | 729 | return -ENOMEM; |
391 | rc = dccp_feat_nn(sk, type, feature, val, len); | ||
392 | break; | ||
393 | 730 | ||
394 | /* XXX implement other features */ | 731 | return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); |
395 | default: | 732 | } |
396 | dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", | 733 | |
397 | dccp_feat_typename(type), feature); | 734 | /** |
398 | rc = -EFAULT; | 735 | * dccp_feat_register_sp - Register requests to change SP feature values |
399 | break; | 736 | * @sk: client or listening socket |
737 | * @feat: one of %dccp_feature_numbers | ||
738 | * @is_local: whether the local (1) or remote (0) @feat is meant | ||
739 | * @list: array of preferred values, in descending order of preference | ||
740 | * @len: length of @list in bytes | ||
741 | */ | ||
742 | int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, | ||
743 | u8 const *list, u8 len) | ||
744 | { /* any changes must be registered before establishing the connection */ | ||
745 | if (sk->sk_state != DCCP_CLOSED) | ||
746 | return -EISCONN; | ||
747 | if (dccp_feat_type(feat) != FEAT_SP) | ||
748 | return -EINVAL; | ||
749 | return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, | ||
750 | 0, list, len); | ||
751 | } | ||
752 | |||
753 | /* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ | ||
754 | int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) | ||
755 | { | ||
756 | /* any changes must be registered before establishing the connection */ | ||
757 | if (sk->sk_state != DCCP_CLOSED) | ||
758 | return -EISCONN; | ||
759 | if (dccp_feat_type(feat) != FEAT_NN) | ||
760 | return -EINVAL; | ||
761 | return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); | ||
762 | } | ||
763 | |||
764 | /** | ||
765 | * dccp_feat_signal_nn_change - Update NN values for an established connection | ||
766 | * @sk: DCCP socket of an established connection | ||
767 | * @feat: NN feature number from %dccp_feature_numbers | ||
768 | * @nn_val: the new value to use | ||
769 | * This function is used to communicate NN updates out-of-band. The difference | ||
770 | * to feature negotiation during connection setup is that values are activated | ||
771 | * immediately after validation, i.e. we don't wait for the Confirm: either the | ||
772 | * value is accepted by the peer (and then the waiting is futile), or it is not | ||
773 | * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values | ||
774 | * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). | ||
775 | */ | ||
776 | int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) | ||
777 | { | ||
778 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | ||
779 | dccp_feat_val fval = { .nn = nn_val }; | ||
780 | struct dccp_feat_entry *entry; | ||
781 | |||
782 | if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) | ||
783 | return 0; | ||
784 | |||
785 | if (dccp_feat_type(feat) != FEAT_NN || | ||
786 | !dccp_feat_is_valid_nn_val(feat, nn_val)) | ||
787 | return -EINVAL; | ||
788 | |||
789 | entry = dccp_feat_list_lookup(fn, feat, 1); | ||
790 | if (entry != NULL) { | ||
791 | dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", | ||
792 | (unsigned long long)nn_val, | ||
793 | (unsigned long long)entry->val.nn, | ||
794 | dccp_feat_sname[entry->state]); | ||
795 | return 0; | ||
400 | } | 796 | } |
401 | 797 | ||
402 | /* check if there were problems changing features */ | 798 | if (dccp_feat_activate(sk, feat, 1, &fval)) |
403 | if (rc) { | 799 | return -EADV; |
404 | /* If we don't agree on SP, we sent a confirm for old value. | 800 | |
405 | * However we propagate rc to caller in case option was | 801 | inet_csk_schedule_ack(sk); |
406 | * mandatory | 802 | return dccp_feat_push_change(fn, feat, 1, 0, &fval); |
803 | } | ||
804 | EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); | ||
805 | |||
806 | /* | ||
807 | * Tracking features whose value depend on the choice of CCID | ||
808 | * | ||
809 | * This is designed with an extension in mind so that a list walk could be done | ||
810 | * before activating any features. However, the existing framework was found to | ||
811 | * work satisfactorily up until now, the automatic verification is left open. | ||
812 | * When adding new CCIDs, add a corresponding dependency table here. | ||
813 | */ | ||
814 | static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) | ||
815 | { | ||
816 | static const struct ccid_dependency ccid2_dependencies[2][2] = { | ||
817 | /* | ||
818 | * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX | ||
819 | * feature and Send Ack Vector is an RX feature, `is_local' | ||
820 | * needs to be reversed. | ||
407 | */ | 821 | */ |
408 | if (rc != DCCP_FEAT_SP_NOAGREE) | 822 | { /* Dependencies of the receiver-side (remote) CCID2 */ |
409 | dccp_feat_empty_confirm(dccp_msk(sk), type, feature); | 823 | { |
824 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
825 | .is_local = true, | ||
826 | .is_mandatory = true, | ||
827 | .val = 1 | ||
828 | }, | ||
829 | { 0, 0, 0, 0 } | ||
830 | }, | ||
831 | { /* Dependencies of the sender-side (local) CCID2 */ | ||
832 | { | ||
833 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
834 | .is_local = false, | ||
835 | .is_mandatory = true, | ||
836 | .val = 1 | ||
837 | }, | ||
838 | { 0, 0, 0, 0 } | ||
839 | } | ||
840 | }; | ||
841 | static const struct ccid_dependency ccid3_dependencies[2][5] = { | ||
842 | { /* | ||
843 | * Dependencies of the receiver-side CCID3 | ||
844 | */ | ||
845 | { /* locally disable Ack Vectors */ | ||
846 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
847 | .is_local = true, | ||
848 | .is_mandatory = false, | ||
849 | .val = 0 | ||
850 | }, | ||
851 | { /* see below why Send Loss Event Rate is on */ | ||
852 | .dependent_feat = DCCPF_SEND_LEV_RATE, | ||
853 | .is_local = true, | ||
854 | .is_mandatory = true, | ||
855 | .val = 1 | ||
856 | }, | ||
857 | { /* NDP Count is needed as per RFC 4342, 6.1.1 */ | ||
858 | .dependent_feat = DCCPF_SEND_NDP_COUNT, | ||
859 | .is_local = false, | ||
860 | .is_mandatory = true, | ||
861 | .val = 1 | ||
862 | }, | ||
863 | { 0, 0, 0, 0 }, | ||
864 | }, | ||
865 | { /* | ||
866 | * CCID3 at the TX side: we request that the HC-receiver | ||
867 | * will not send Ack Vectors (they will be ignored, so | ||
868 | * Mandatory is not set); we enable Send Loss Event Rate | ||
869 | * (Mandatory since the implementation does not support | ||
870 | * the Loss Intervals option of RFC 4342, 8.6). | ||
871 | * The last two options are for peer's information only. | ||
872 | */ | ||
873 | { | ||
874 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
875 | .is_local = false, | ||
876 | .is_mandatory = false, | ||
877 | .val = 0 | ||
878 | }, | ||
879 | { | ||
880 | .dependent_feat = DCCPF_SEND_LEV_RATE, | ||
881 | .is_local = false, | ||
882 | .is_mandatory = true, | ||
883 | .val = 1 | ||
884 | }, | ||
885 | { /* this CCID does not support Ack Ratio */ | ||
886 | .dependent_feat = DCCPF_ACK_RATIO, | ||
887 | .is_local = true, | ||
888 | .is_mandatory = false, | ||
889 | .val = 0 | ||
890 | }, | ||
891 | { /* tell receiver we are sending NDP counts */ | ||
892 | .dependent_feat = DCCPF_SEND_NDP_COUNT, | ||
893 | .is_local = true, | ||
894 | .is_mandatory = false, | ||
895 | .val = 1 | ||
896 | }, | ||
897 | { 0, 0, 0, 0 } | ||
898 | } | ||
899 | }; | ||
900 | switch (ccid) { | ||
901 | case DCCPC_CCID2: | ||
902 | return ccid2_dependencies[is_local]; | ||
903 | case DCCPC_CCID3: | ||
904 | return ccid3_dependencies[is_local]; | ||
905 | default: | ||
906 | return NULL; | ||
410 | } | 907 | } |
908 | } | ||
411 | 909 | ||
412 | /* generate the confirm [if required] */ | 910 | /** |
413 | dccp_feat_flush_confirm(sk); | 911 | * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID |
414 | 912 | * @fn: feature-negotiation list to update | |
913 | * @id: CCID number to track | ||
914 | * @is_local: whether TX CCID (1) or RX CCID (0) is meant | ||
915 | * This function needs to be called after registering all other features. | ||
916 | */ | ||
917 | static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) | ||
918 | { | ||
919 | const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); | ||
920 | int i, rc = (table == NULL); | ||
921 | |||
922 | for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) | ||
923 | if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) | ||
924 | rc = __feat_register_sp(fn, table[i].dependent_feat, | ||
925 | table[i].is_local, | ||
926 | table[i].is_mandatory, | ||
927 | &table[i].val, 1); | ||
928 | else | ||
929 | rc = __feat_register_nn(fn, table[i].dependent_feat, | ||
930 | table[i].is_mandatory, | ||
931 | table[i].val); | ||
415 | return rc; | 932 | return rc; |
416 | } | 933 | } |
417 | 934 | ||
418 | EXPORT_SYMBOL_GPL(dccp_feat_change_recv); | 935 | /** |
936 | * dccp_feat_finalise_settings - Finalise settings before starting negotiation | ||
937 | * @dp: client or listening socket (settings will be inherited) | ||
938 | * This is called after all registrations (socket initialisation, sysctls, and | ||
939 | * sockopt calls), and before sending the first packet containing Change options | ||
940 | * (ie. client-Request or server-Response), to ensure internal consistency. | ||
941 | */ | ||
942 | int dccp_feat_finalise_settings(struct dccp_sock *dp) | ||
943 | { | ||
944 | struct list_head *fn = &dp->dccps_featneg; | ||
945 | struct dccp_feat_entry *entry; | ||
946 | int i = 2, ccids[2] = { -1, -1 }; | ||
947 | |||
948 | /* | ||
949 | * Propagating CCIDs: | ||
950 | * 1) not useful to propagate CCID settings if this host advertises more | ||
951 | * than one CCID: the choice of CCID may still change - if this is | ||
952 | * the client, or if this is the server and the client sends | ||
953 | * singleton CCID values. | ||
954 | * 2) since propagate_ccid changes the list, we defer changing | ||
955 | * the sorted list until after the traversal. | ||
956 | */ | ||
957 | list_for_each_entry(entry, fn, node) | ||
958 | if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) | ||
959 | ccids[entry->is_local] = entry->val.sp.vec[0]; | ||
960 | while (i--) | ||
961 | if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) | ||
962 | return -1; | ||
963 | dccp_feat_print_fnlist(fn); | ||
964 | return 0; | ||
965 | } | ||
419 | 966 | ||
420 | int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, | 967 | /** |
421 | u8 *val, u8 len) | 968 | * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features |
969 | * It is the server which resolves the dependencies once the CCID has been | ||
970 | * fully negotiated. If no CCID has been negotiated, it uses the default CCID. | ||
971 | */ | ||
972 | int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) | ||
422 | { | 973 | { |
423 | u8 t; | 974 | struct list_head *fn = &dreq->dreq_featneg; |
424 | struct dccp_opt_pend *opt; | 975 | struct dccp_feat_entry *entry; |
425 | struct dccp_minisock *dmsk = dccp_msk(sk); | 976 | u8 is_local, ccid; |
426 | int found = 0; | ||
427 | int all_confirmed = 1; | ||
428 | 977 | ||
429 | dccp_feat_debug(type, feature, *val); | 978 | for (is_local = 0; is_local <= 1; is_local++) { |
979 | entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); | ||
430 | 980 | ||
431 | /* locate our change request */ | 981 | if (entry != NULL && !entry->empty_confirm) |
432 | switch (type) { | 982 | ccid = entry->val.sp.vec[0]; |
433 | case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; | 983 | else |
434 | case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; | 984 | ccid = dccp_feat_default_value(DCCPF_CCID); |
435 | default: DCCP_WARN("invalid type %d\n", type); | ||
436 | return 1; | ||
437 | 985 | ||
986 | if (dccp_feat_propagate_ccid(fn, ccid, is_local)) | ||
987 | return -1; | ||
438 | } | 988 | } |
439 | /* XXX sanity check feature value */ | 989 | return 0; |
990 | } | ||
440 | 991 | ||
441 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | 992 | /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ |
442 | if (!opt->dccpop_conf && opt->dccpop_type == t && | 993 | static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) |
443 | opt->dccpop_feat == feature) { | 994 | { |
444 | found = 1; | 995 | u8 c, s; |
445 | dccp_pr_debug("feature %d found\n", opt->dccpop_feat); | ||
446 | 996 | ||
447 | /* XXX do sanity check */ | 997 | for (s = 0; s < slen; s++) |
998 | for (c = 0; c < clen; c++) | ||
999 | if (servlist[s] == clilist[c]) | ||
1000 | return servlist[s]; | ||
1001 | return -1; | ||
1002 | } | ||
448 | 1003 | ||
449 | opt->dccpop_conf = 1; | 1004 | /** |
1005 | * dccp_feat_prefer - Move preferred entry to the start of array | ||
1006 | * Reorder the @array_len elements in @array so that @preferred_value comes | ||
1007 | * first. Returns >0 to indicate that @preferred_value does occur in @array. | ||
1008 | */ | ||
1009 | static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) | ||
1010 | { | ||
1011 | u8 i, does_occur = 0; | ||
450 | 1012 | ||
451 | /* We got a confirmation---change the option */ | 1013 | if (array != NULL) { |
452 | dccp_feat_update(sk, opt->dccpop_type, | 1014 | for (i = 0; i < array_len; i++) |
453 | opt->dccpop_feat, *val); | 1015 | if (array[i] == preferred_value) { |
1016 | array[i] = array[0]; | ||
1017 | does_occur++; | ||
1018 | } | ||
1019 | if (does_occur) | ||
1020 | array[0] = preferred_value; | ||
1021 | } | ||
1022 | return does_occur; | ||
1023 | } | ||
454 | 1024 | ||
455 | /* XXX check the return value of dccp_feat_update */ | 1025 | /** |
456 | break; | 1026 | * dccp_feat_reconcile - Reconcile SP preference lists |
457 | } | 1027 | * @fv: SP list to reconcile into |
1028 | * @arr: received SP preference list | ||
1029 | * @len: length of @arr in bytes | ||
1030 | * @is_server: whether this side is the server (and @fv is the server's list) | ||
1031 | * @reorder: whether to reorder the list in @fv after reconciling with @arr | ||
1032 | * When successful, > 0 is returned and the reconciled list is in @fv. | ||
1033 | * A value of 0 means that negotiation failed (no shared entry). | ||
1034 | */ | ||
1035 | static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, | ||
1036 | bool is_server, bool reorder) | ||
1037 | { | ||
1038 | int rc; | ||
458 | 1039 | ||
459 | if (!opt->dccpop_conf) | 1040 | if (!fv->sp.vec || !arr) { |
460 | all_confirmed = 0; | 1041 | DCCP_CRIT("NULL feature value or array"); |
1042 | return 0; | ||
461 | } | 1043 | } |
462 | 1044 | ||
463 | /* fix re-transmit timer */ | 1045 | if (is_server) |
464 | /* XXX gotta make sure that no option negotiation occurs during | 1046 | rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); |
465 | * connection shutdown. Consider that the CLOSEREQ is sent and timer is | 1047 | else |
466 | * on. if all options are confirmed it might kill timer which should | 1048 | rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); |
467 | * remain alive until close is received. | ||
468 | */ | ||
469 | if (all_confirmed) { | ||
470 | dccp_pr_debug("clear feat negotiation timer %p\n", sk); | ||
471 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | ||
472 | } | ||
473 | 1049 | ||
474 | if (!found) | 1050 | if (!reorder) |
475 | dccp_pr_debug("%s(%d, ...) never requested\n", | 1051 | return rc; |
476 | dccp_feat_typename(type), feature); | 1052 | if (rc < 0) |
477 | return 0; | 1053 | return 0; |
478 | } | ||
479 | 1054 | ||
480 | EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); | 1055 | /* |
1056 | * Reorder list: used for activating features and in dccp_insert_fn_opt. | ||
1057 | */ | ||
1058 | return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); | ||
1059 | } | ||
481 | 1060 | ||
482 | void dccp_feat_clean(struct dccp_minisock *dmsk) | 1061 | /** |
1062 | * dccp_feat_change_recv - Process incoming ChangeL/R options | ||
1063 | * @fn: feature-negotiation list to update | ||
1064 | * @is_mandatory: whether the Change was preceded by a Mandatory option | ||
1065 | * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R | ||
1066 | * @feat: one of %dccp_feature_numbers | ||
1067 | * @val: NN value or SP value/preference list | ||
1068 | * @len: length of @val in bytes | ||
1069 | * @server: whether this node is the server (1) or the client (0) | ||
1070 | */ | ||
1071 | static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, | ||
1072 | u8 feat, u8 *val, u8 len, const bool server) | ||
483 | { | 1073 | { |
484 | struct dccp_opt_pend *opt, *next; | 1074 | u8 defval, type = dccp_feat_type(feat); |
1075 | const bool local = (opt == DCCPO_CHANGE_R); | ||
1076 | struct dccp_feat_entry *entry; | ||
1077 | dccp_feat_val fval; | ||
1078 | |||
1079 | if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ | ||
1080 | goto unknown_feature_or_value; | ||
1081 | |||
1082 | dccp_feat_print_opt(opt, feat, val, len, is_mandatory); | ||
1083 | |||
1084 | /* | ||
1085 | * Negotiation of NN features: Change R is invalid, so there is no | ||
1086 | * simultaneous negotiation; hence we do not look up in the list. | ||
1087 | */ | ||
1088 | if (type == FEAT_NN) { | ||
1089 | if (local || len > sizeof(fval.nn)) | ||
1090 | goto unknown_feature_or_value; | ||
485 | 1091 | ||
486 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, | 1092 | /* 6.3.2: "The feature remote MUST accept any valid value..." */ |
487 | dccpop_node) { | 1093 | fval.nn = dccp_decode_value_var(val, len); |
488 | BUG_ON(opt->dccpop_val == NULL); | 1094 | if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) |
489 | kfree(opt->dccpop_val); | 1095 | goto unknown_feature_or_value; |
490 | 1096 | ||
491 | if (opt->dccpop_sc != NULL) { | 1097 | return dccp_feat_push_confirm(fn, feat, local, &fval); |
492 | BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); | 1098 | } |
493 | kfree(opt->dccpop_sc->dccpoc_val); | 1099 | |
494 | kfree(opt->dccpop_sc); | 1100 | /* |
1101 | * Unidirectional/simultaneous negotiation of SP features (6.3.1) | ||
1102 | */ | ||
1103 | entry = dccp_feat_list_lookup(fn, feat, local); | ||
1104 | if (entry == NULL) { | ||
1105 | /* | ||
1106 | * No particular preferences have been registered. We deal with | ||
1107 | * this situation by assuming that all valid values are equally | ||
1108 | * acceptable, and apply the following checks: | ||
1109 | * - if the peer's list is a singleton, we accept a valid value; | ||
1110 | * - if we are the server, we first try to see if the peer (the | ||
1111 | * client) advertises the default value. If yes, we use it, | ||
1112 | * otherwise we accept the preferred value; | ||
1113 | * - else if we are the client, we use the first list element. | ||
1114 | */ | ||
1115 | if (dccp_feat_clone_sp_val(&fval, val, 1)) | ||
1116 | return DCCP_RESET_CODE_TOO_BUSY; | ||
1117 | |||
1118 | if (len > 1 && server) { | ||
1119 | defval = dccp_feat_default_value(feat); | ||
1120 | if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) | ||
1121 | fval.sp.vec[0] = defval; | ||
1122 | } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { | ||
1123 | kfree(fval.sp.vec); | ||
1124 | goto unknown_feature_or_value; | ||
1125 | } | ||
1126 | |||
1127 | /* Treat unsupported CCIDs like invalid values */ | ||
1128 | if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { | ||
1129 | kfree(fval.sp.vec); | ||
1130 | goto not_valid_or_not_known; | ||
495 | } | 1131 | } |
496 | 1132 | ||
497 | kfree(opt); | 1133 | return dccp_feat_push_confirm(fn, feat, local, &fval); |
1134 | |||
1135 | } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ | ||
1136 | return 0; | ||
498 | } | 1137 | } |
499 | INIT_LIST_HEAD(&dmsk->dccpms_pending); | ||
500 | 1138 | ||
501 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { | 1139 | if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { |
502 | BUG_ON(opt == NULL); | 1140 | entry->empty_confirm = 0; |
503 | if (opt->dccpop_val != NULL) | 1141 | } else if (is_mandatory) { |
504 | kfree(opt->dccpop_val); | 1142 | return DCCP_RESET_CODE_MANDATORY_ERROR; |
505 | kfree(opt); | 1143 | } else if (entry->state == FEAT_INITIALISING) { |
1144 | /* | ||
1145 | * Failed simultaneous negotiation (server only): try to `save' | ||
1146 | * the connection by checking whether entry contains the default | ||
1147 | * value for @feat. If yes, send an empty Confirm to signal that | ||
1148 | * the received Change was not understood - which implies using | ||
1149 | * the default value. | ||
1150 | * If this also fails, we use Reset as the last resort. | ||
1151 | */ | ||
1152 | WARN_ON(!server); | ||
1153 | defval = dccp_feat_default_value(feat); | ||
1154 | if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) | ||
1155 | return DCCP_RESET_CODE_OPTION_ERROR; | ||
1156 | entry->empty_confirm = 1; | ||
506 | } | 1157 | } |
507 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | 1158 | entry->needs_confirm = 1; |
508 | } | 1159 | entry->needs_mandatory = 0; |
1160 | entry->state = FEAT_STABLE; | ||
1161 | return 0; | ||
509 | 1162 | ||
510 | EXPORT_SYMBOL_GPL(dccp_feat_clean); | 1163 | unknown_feature_or_value: |
1164 | if (!is_mandatory) | ||
1165 | return dccp_push_empty_confirm(fn, feat, local); | ||
511 | 1166 | ||
512 | /* this is to be called only when a listening sock creates its child. It is | 1167 | not_valid_or_not_known: |
513 | * assumed by the function---the confirm is not duplicated, but rather it is | 1168 | return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR |
514 | * "passed on". | 1169 | : DCCP_RESET_CODE_OPTION_ERROR; |
1170 | } | ||
1171 | |||
1172 | /** | ||
1173 | * dccp_feat_confirm_recv - Process received Confirm options | ||
1174 | * @fn: feature-negotiation list to update | ||
1175 | * @is_mandatory: whether @opt was preceded by a Mandatory option | ||
1176 | * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R | ||
1177 | * @feat: one of %dccp_feature_numbers | ||
1178 | * @val: NN value or SP value/preference list | ||
1179 | * @len: length of @val in bytes | ||
1180 | * @server: whether this node is server (1) or client (0) | ||
515 | */ | 1181 | */ |
516 | int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) | 1182 | static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, |
1183 | u8 feat, u8 *val, u8 len, const bool server) | ||
517 | { | 1184 | { |
518 | struct dccp_minisock *olddmsk = dccp_msk(oldsk); | 1185 | u8 *plist, plen, type = dccp_feat_type(feat); |
519 | struct dccp_minisock *newdmsk = dccp_msk(newsk); | 1186 | const bool local = (opt == DCCPO_CONFIRM_R); |
520 | struct dccp_opt_pend *opt; | 1187 | struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); |
521 | int rc = 0; | ||
522 | 1188 | ||
523 | INIT_LIST_HEAD(&newdmsk->dccpms_pending); | 1189 | dccp_feat_print_opt(opt, feat, val, len, is_mandatory); |
524 | INIT_LIST_HEAD(&newdmsk->dccpms_conf); | ||
525 | 1190 | ||
526 | list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { | 1191 | if (entry == NULL) { /* nothing queued: ignore or handle error */ |
527 | struct dccp_opt_pend *newopt; | 1192 | if (is_mandatory && type == FEAT_UNKNOWN) |
528 | /* copy the value of the option */ | 1193 | return DCCP_RESET_CODE_MANDATORY_ERROR; |
529 | u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); | ||
530 | 1194 | ||
531 | if (val == NULL) | 1195 | if (!local && type == FEAT_NN) /* 6.3.2 */ |
532 | goto out_clean; | 1196 | goto confirmation_failed; |
533 | 1197 | return 0; | |
534 | newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); | 1198 | } |
535 | if (newopt == NULL) { | ||
536 | kfree(val); | ||
537 | goto out_clean; | ||
538 | } | ||
539 | 1199 | ||
540 | /* insert the option */ | 1200 | if (entry->state != FEAT_CHANGING) /* 6.6.2 */ |
541 | newopt->dccpop_val = val; | 1201 | return 0; |
542 | list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); | ||
543 | 1202 | ||
544 | /* XXX what happens with backlogs and multiple connections at | 1203 | if (len == 0) { |
545 | * once... | 1204 | if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ |
1205 | goto confirmation_failed; | ||
1206 | /* | ||
1207 | * Empty Confirm during connection setup: this means reverting | ||
1208 | * to the `old' value, which in this case is the default. Since | ||
1209 | * we handle default values automatically when no other values | ||
1210 | * have been set, we revert to the old value by removing this | ||
1211 | * entry from the list. | ||
546 | */ | 1212 | */ |
547 | /* the master socket no longer needs to worry about confirms */ | 1213 | dccp_feat_list_pop(entry); |
548 | opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ | 1214 | return 0; |
1215 | } | ||
1216 | |||
1217 | if (type == FEAT_NN) { | ||
1218 | if (len > sizeof(entry->val.nn)) | ||
1219 | goto confirmation_failed; | ||
1220 | |||
1221 | if (entry->val.nn == dccp_decode_value_var(val, len)) | ||
1222 | goto confirmation_succeeded; | ||
1223 | |||
1224 | DCCP_WARN("Bogus Confirm for non-existing value\n"); | ||
1225 | goto confirmation_failed; | ||
1226 | } | ||
549 | 1227 | ||
550 | /* reset state for a new socket */ | 1228 | /* |
551 | opt->dccpop_conf = 0; | 1229 | * Parsing SP Confirms: the first element of @val is the preferred |
1230 | * SP value which the peer confirms, the remainder depends on @len. | ||
1231 | * Note that only the confirmed value needs to be a valid SP value. | ||
1232 | */ | ||
1233 | if (!dccp_feat_is_valid_sp_val(feat, *val)) | ||
1234 | goto confirmation_failed; | ||
1235 | |||
1236 | if (len == 1) { /* peer didn't supply a preference list */ | ||
1237 | plist = val; | ||
1238 | plen = len; | ||
1239 | } else { /* preferred value + preference list */ | ||
1240 | plist = val + 1; | ||
1241 | plen = len - 1; | ||
552 | } | 1242 | } |
553 | 1243 | ||
554 | /* XXX not doing anything about the conf queue */ | 1244 | /* Check whether the peer got the reconciliation right (6.6.8) */ |
1245 | if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { | ||
1246 | DCCP_WARN("Confirm selected the wrong value %u\n", *val); | ||
1247 | return DCCP_RESET_CODE_OPTION_ERROR; | ||
1248 | } | ||
1249 | entry->val.sp.vec[0] = *val; | ||
555 | 1250 | ||
556 | out: | 1251 | confirmation_succeeded: |
557 | return rc; | 1252 | entry->state = FEAT_STABLE; |
1253 | return 0; | ||
558 | 1254 | ||
559 | out_clean: | 1255 | confirmation_failed: |
560 | dccp_feat_clean(newdmsk); | 1256 | DCCP_WARN("Confirmation failed\n"); |
561 | rc = -ENOMEM; | 1257 | return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR |
562 | goto out; | 1258 | : DCCP_RESET_CODE_OPTION_ERROR; |
563 | } | 1259 | } |
564 | 1260 | ||
565 | EXPORT_SYMBOL_GPL(dccp_feat_clone); | 1261 | /** |
1262 | * dccp_feat_handle_nn_established - Fast-path reception of NN options | ||
1263 | * @sk: socket of an established DCCP connection | ||
1264 | * @mandatory: whether @opt was preceded by a Mandatory option | ||
1265 | * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) | ||
1266 | * @feat: NN number, one of %dccp_feature_numbers | ||
1267 | * @val: NN value | ||
1268 | * @len: length of @val in bytes | ||
1269 | * This function combines the functionality of change_recv/confirm_recv, with | ||
1270 | * the following differences (reset codes are the same): | ||
1271 | * - cleanup after receiving the Confirm; | ||
1272 | * - values are directly activated after successful parsing; | ||
1273 | * - deliberately restricted to NN features. | ||
1274 | * The restriction to NN features is essential since SP features can have non- | ||
1275 | * predictable outcomes (depending on the remote configuration), and are inter- | ||
1276 | * dependent (CCIDs for instance cause further dependencies). | ||
1277 | */ | ||
1278 | static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, | ||
1279 | u8 feat, u8 *val, u8 len) | ||
1280 | { | ||
1281 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | ||
1282 | const bool local = (opt == DCCPO_CONFIRM_R); | ||
1283 | struct dccp_feat_entry *entry; | ||
1284 | u8 type = dccp_feat_type(feat); | ||
1285 | dccp_feat_val fval; | ||
1286 | |||
1287 | dccp_feat_print_opt(opt, feat, val, len, mandatory); | ||
1288 | |||
1289 | /* Ignore non-mandatory unknown and non-NN features */ | ||
1290 | if (type == FEAT_UNKNOWN) { | ||
1291 | if (local && !mandatory) | ||
1292 | return 0; | ||
1293 | goto fast_path_unknown; | ||
1294 | } else if (type != FEAT_NN) { | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * We don't accept empty Confirms, since in fast-path feature | ||
1300 | * negotiation the values are enabled immediately after sending | ||
1301 | * the Change option. | ||
1302 | * Empty Changes on the other hand are invalid (RFC 4340, 6.1). | ||
1303 | */ | ||
1304 | if (len == 0 || len > sizeof(fval.nn)) | ||
1305 | goto fast_path_unknown; | ||
1306 | |||
1307 | if (opt == DCCPO_CHANGE_L) { | ||
1308 | fval.nn = dccp_decode_value_var(val, len); | ||
1309 | if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) | ||
1310 | goto fast_path_unknown; | ||
1311 | |||
1312 | if (dccp_feat_push_confirm(fn, feat, local, &fval) || | ||
1313 | dccp_feat_activate(sk, feat, local, &fval)) | ||
1314 | return DCCP_RESET_CODE_TOO_BUSY; | ||
1315 | |||
1316 | /* set the `Ack Pending' flag to piggyback a Confirm */ | ||
1317 | inet_csk_schedule_ack(sk); | ||
1318 | |||
1319 | } else if (opt == DCCPO_CONFIRM_R) { | ||
1320 | entry = dccp_feat_list_lookup(fn, feat, local); | ||
1321 | if (entry == NULL || entry->state != FEAT_CHANGING) | ||
1322 | return 0; | ||
1323 | |||
1324 | fval.nn = dccp_decode_value_var(val, len); | ||
1325 | if (fval.nn != entry->val.nn) { | ||
1326 | DCCP_WARN("Bogus Confirm for non-existing value\n"); | ||
1327 | goto fast_path_failed; | ||
1328 | } | ||
1329 | |||
1330 | /* It has been confirmed - so remove the entry */ | ||
1331 | dccp_feat_list_pop(entry); | ||
1332 | |||
1333 | } else { | ||
1334 | DCCP_WARN("Received illegal option %u\n", opt); | ||
1335 | goto fast_path_failed; | ||
1336 | } | ||
1337 | return 0; | ||
1338 | |||
1339 | fast_path_unknown: | ||
1340 | if (!mandatory) | ||
1341 | return dccp_push_empty_confirm(fn, feat, local); | ||
1342 | |||
1343 | fast_path_failed: | ||
1344 | return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR | ||
1345 | : DCCP_RESET_CODE_OPTION_ERROR; | ||
1346 | } | ||
566 | 1347 | ||
567 | static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, | 1348 | /** |
568 | u8 *val, u8 len) | 1349 | * dccp_feat_parse_options - Process Feature-Negotiation Options |
1350 | * @sk: for general use and used by the client during connection setup | ||
1351 | * @dreq: used by the server during connection setup | ||
1352 | * @mandatory: whether @opt was preceded by a Mandatory option | ||
1353 | * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R | ||
1354 | * @feat: one of %dccp_feature_numbers | ||
1355 | * @val: value contents of @opt | ||
1356 | * @len: length of @val in bytes | ||
1357 | * Returns 0 on success, a Reset code for ending the connection otherwise. | ||
1358 | */ | ||
1359 | int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | ||
1360 | u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) | ||
569 | { | 1361 | { |
570 | int rc = -ENOMEM; | 1362 | struct dccp_sock *dp = dccp_sk(sk); |
571 | u8 *copy = kmemdup(val, len, GFP_KERNEL); | 1363 | struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; |
1364 | bool server = false; | ||
572 | 1365 | ||
573 | if (copy != NULL) { | 1366 | switch (sk->sk_state) { |
574 | rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); | 1367 | /* |
575 | if (rc) | 1368 | * Negotiation during connection setup |
576 | kfree(copy); | 1369 | */ |
1370 | case DCCP_LISTEN: | ||
1371 | server = true; /* fall through */ | ||
1372 | case DCCP_REQUESTING: | ||
1373 | switch (opt) { | ||
1374 | case DCCPO_CHANGE_L: | ||
1375 | case DCCPO_CHANGE_R: | ||
1376 | return dccp_feat_change_recv(fn, mandatory, opt, feat, | ||
1377 | val, len, server); | ||
1378 | case DCCPO_CONFIRM_R: | ||
1379 | case DCCPO_CONFIRM_L: | ||
1380 | return dccp_feat_confirm_recv(fn, mandatory, opt, feat, | ||
1381 | val, len, server); | ||
1382 | } | ||
1383 | break; | ||
1384 | /* | ||
1385 | * Support for exchanging NN options on an established connection | ||
1386 | * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) | ||
1387 | */ | ||
1388 | case DCCP_OPEN: | ||
1389 | case DCCP_PARTOPEN: | ||
1390 | return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, | ||
1391 | val, len); | ||
577 | } | 1392 | } |
578 | return rc; | 1393 | return 0; /* ignore FN options in all other states */ |
579 | } | 1394 | } |
580 | 1395 | ||
581 | int dccp_feat_init(struct dccp_minisock *dmsk) | 1396 | /** |
1397 | * dccp_feat_init - Seed feature negotiation with host-specific defaults | ||
1398 | * This initialises global defaults, depending on the value of the sysctls. | ||
1399 | * These can later be overridden by registering changes via setsockopt calls. | ||
1400 | * The last link in the chain is finalise_settings, to make sure that between | ||
1401 | * here and the start of actual feature negotiation no inconsistencies enter. | ||
1402 | * | ||
1403 | * All features not appearing below use either defaults or are otherwise | ||
1404 | * later adjusted through dccp_feat_finalise_settings(). | ||
1405 | */ | ||
1406 | int dccp_feat_init(struct sock *sk) | ||
582 | { | 1407 | { |
1408 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | ||
1409 | u8 on = 1, off = 0; | ||
583 | int rc; | 1410 | int rc; |
1411 | struct { | ||
1412 | u8 *val; | ||
1413 | u8 len; | ||
1414 | } tx, rx; | ||
1415 | |||
1416 | /* Non-negotiable (NN) features */ | ||
1417 | rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, | ||
1418 | sysctl_dccp_sequence_window); | ||
1419 | if (rc) | ||
1420 | return rc; | ||
584 | 1421 | ||
585 | INIT_LIST_HEAD(&dmsk->dccpms_pending); | 1422 | /* Server-priority (SP) features */ |
586 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | 1423 | |
1424 | /* Advertise that short seqnos are not supported (7.6.1) */ | ||
1425 | rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); | ||
1426 | if (rc) | ||
1427 | return rc; | ||
587 | 1428 | ||
588 | /* CCID L */ | 1429 | /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ |
589 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, | 1430 | rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); |
590 | &dmsk->dccpms_tx_ccid, 1); | ||
591 | if (rc) | 1431 | if (rc) |
592 | goto out; | 1432 | return rc; |
1433 | |||
1434 | /* | ||
1435 | * We advertise the available list of CCIDs and reorder according to | ||
1436 | * preferences, to avoid failure resulting from negotiating different | ||
1437 | * singleton values (which always leads to failure). | ||
1438 | * These settings can still (later) be overridden via sockopts. | ||
1439 | */ | ||
1440 | if (ccid_get_builtin_ccids(&tx.val, &tx.len) || | ||
1441 | ccid_get_builtin_ccids(&rx.val, &rx.len)) | ||
1442 | return -ENOBUFS; | ||
1443 | |||
1444 | /* Pre-load all CCID modules that are going to be advertised */ | ||
1445 | rc = -EUNATCH; | ||
1446 | if (ccid_request_modules(tx.val, tx.len)) | ||
1447 | goto free_ccid_lists; | ||
1448 | |||
1449 | if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || | ||
1450 | !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) | ||
1451 | goto free_ccid_lists; | ||
593 | 1452 | ||
594 | /* CCID R */ | 1453 | rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); |
595 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, | ||
596 | &dmsk->dccpms_rx_ccid, 1); | ||
597 | if (rc) | 1454 | if (rc) |
598 | goto out; | 1455 | goto free_ccid_lists; |
599 | 1456 | ||
600 | /* Ack ratio */ | 1457 | rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); |
601 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, | 1458 | |
602 | &dmsk->dccpms_ack_ratio, 1); | 1459 | free_ccid_lists: |
603 | out: | 1460 | kfree(tx.val); |
1461 | kfree(rx.val); | ||
604 | return rc; | 1462 | return rc; |
605 | } | 1463 | } |
606 | 1464 | ||
607 | EXPORT_SYMBOL_GPL(dccp_feat_init); | 1465 | int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) |
608 | |||
609 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
610 | const char *dccp_feat_typename(const u8 type) | ||
611 | { | 1466 | { |
612 | switch(type) { | 1467 | struct dccp_sock *dp = dccp_sk(sk); |
613 | case DCCPO_CHANGE_L: return("ChangeL"); | 1468 | struct dccp_feat_entry *cur, *next; |
614 | case DCCPO_CONFIRM_L: return("ConfirmL"); | 1469 | int idx; |
615 | case DCCPO_CHANGE_R: return("ChangeR"); | 1470 | dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { |
616 | case DCCPO_CONFIRM_R: return("ConfirmR"); | 1471 | [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } |
617 | /* the following case must not appear in feature negotation */ | 1472 | }; |
618 | default: dccp_pr_debug("unknown type %d [BUG!]\n", type); | 1473 | |
1474 | list_for_each_entry(cur, fn_list, node) { | ||
1475 | /* | ||
1476 | * An empty Confirm means that either an unknown feature type | ||
1477 | * or an invalid value was present. In the first case there is | ||
1478 | * nothing to activate, in the other the default value is used. | ||
1479 | */ | ||
1480 | if (cur->empty_confirm) | ||
1481 | continue; | ||
1482 | |||
1483 | idx = dccp_feat_index(cur->feat_num); | ||
1484 | if (idx < 0) { | ||
1485 | DCCP_BUG("Unknown feature %u", cur->feat_num); | ||
1486 | goto activation_failed; | ||
1487 | } | ||
1488 | if (cur->state != FEAT_STABLE) { | ||
1489 | DCCP_CRIT("Negotiation of %s %s failed in state %s", | ||
1490 | cur->is_local ? "local" : "remote", | ||
1491 | dccp_feat_fname(cur->feat_num), | ||
1492 | dccp_feat_sname[cur->state]); | ||
1493 | goto activation_failed; | ||
1494 | } | ||
1495 | fvals[idx][cur->is_local] = &cur->val; | ||
619 | } | 1496 | } |
620 | return NULL; | ||
621 | } | ||
622 | 1497 | ||
623 | EXPORT_SYMBOL_GPL(dccp_feat_typename); | 1498 | /* |
1499 | * Activate in decreasing order of index, so that the CCIDs are always | ||
1500 | * activated as the last feature. This avoids the case where a CCID | ||
1501 | * relies on the initialisation of one or more features that it depends | ||
1502 | * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). | ||
1503 | */ | ||
1504 | for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) | ||
1505 | if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || | ||
1506 | __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { | ||
1507 | DCCP_CRIT("Could not activate %d", idx); | ||
1508 | goto activation_failed; | ||
1509 | } | ||
624 | 1510 | ||
625 | const char *dccp_feat_name(const u8 feat) | 1511 | /* Clean up Change options which have been confirmed already */ |
626 | { | 1512 | list_for_each_entry_safe(cur, next, fn_list, node) |
627 | static const char *feature_names[] = { | 1513 | if (!cur->needs_confirm) |
628 | [DCCPF_RESERVED] = "Reserved", | 1514 | dccp_feat_list_pop(cur); |
629 | [DCCPF_CCID] = "CCID", | ||
630 | [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", | ||
631 | [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", | ||
632 | [DCCPF_ECN_INCAPABLE] = "ECN Incapable", | ||
633 | [DCCPF_ACK_RATIO] = "Ack Ratio", | ||
634 | [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", | ||
635 | [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", | ||
636 | [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", | ||
637 | [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", | ||
638 | }; | ||
639 | if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) | ||
640 | return feature_names[DCCPF_RESERVED]; | ||
641 | 1515 | ||
642 | if (feat >= DCCPF_MIN_CCID_SPECIFIC) | 1516 | dccp_pr_debug("Activation OK\n"); |
643 | return "CCID-specific"; | 1517 | return 0; |
644 | 1518 | ||
645 | return feature_names[feat]; | 1519 | activation_failed: |
1520 | /* | ||
1521 | * We clean up everything that may have been allocated, since | ||
1522 | * it is difficult to track at which stage negotiation failed. | ||
1523 | * This is ok, since all deallocation functions below are robust | ||
1524 | * against NULL arguments. | ||
1525 | */ | ||
1526 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | ||
1527 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | ||
1528 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | ||
1529 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | ||
1530 | dp->dccps_hc_rx_ackvec = NULL; | ||
1531 | return -1; | ||
646 | } | 1532 | } |
647 | |||
648 | EXPORT_SYMBOL_GPL(dccp_feat_name); | ||
649 | #endif /* CONFIG_IP_DCCP_DEBUG */ | ||
diff --git a/net/dccp/feat.h b/net/dccp/feat.h index e272222c7ace..2217066e22d7 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h | |||
@@ -3,38 +3,134 @@ | |||
3 | /* | 3 | /* |
4 | * net/dccp/feat.h | 4 | * net/dccp/feat.h |
5 | * | 5 | * |
6 | * An implementation of the DCCP protocol | 6 | * Feature negotiation for the DCCP protocol (RFC 4340, section 6) |
7 | * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> | ||
7 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> | 8 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> |
8 | * | 9 | * |
9 | * This program is free software; you can redistribute it and/or modify it | 10 | * This program is free software; you can redistribute it and/or modify it |
10 | * under the terms of the GNU General Public License version 2 as | 11 | * under the terms of the GNU General Public License version 2 as |
11 | * published by the Free Software Foundation. | 12 | * published by the Free Software Foundation. |
12 | */ | 13 | */ |
13 | |||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include "dccp.h" | 15 | #include "dccp.h" |
16 | 16 | ||
17 | #ifdef CONFIG_IP_DCCP_DEBUG | 17 | /* |
18 | extern const char *dccp_feat_typename(const u8 type); | 18 | * Known limit values |
19 | extern const char *dccp_feat_name(const u8 feat); | 19 | */ |
20 | /* Ack Ratio takes 2-byte integer values (11.3) */ | ||
21 | #define DCCPF_ACK_RATIO_MAX 0xFFFF | ||
22 | /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ | ||
23 | #define DCCPF_SEQ_WMIN 32 | ||
24 | #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull | ||
25 | /* Maximum number of SP values that fit in a single (Confirm) option */ | ||
26 | #define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) | ||
27 | |||
28 | enum dccp_feat_type { | ||
29 | FEAT_AT_RX = 1, /* located at RX side of half-connection */ | ||
30 | FEAT_AT_TX = 2, /* located at TX side of half-connection */ | ||
31 | FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ | ||
32 | FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ | ||
33 | FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ | ||
34 | }; | ||
35 | |||
36 | enum dccp_feat_state { | ||
37 | FEAT_DEFAULT = 0, /* using default values from 6.4 */ | ||
38 | FEAT_INITIALISING, /* feature is being initialised */ | ||
39 | FEAT_CHANGING, /* Change sent but not confirmed yet */ | ||
40 | FEAT_UNSTABLE, /* local modification in state CHANGING */ | ||
41 | FEAT_STABLE /* both ends (think they) agree */ | ||
42 | }; | ||
20 | 43 | ||
21 | static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) | 44 | /** |
45 | * dccp_feat_val - Container for SP or NN feature values | ||
46 | * @nn: single NN value | ||
47 | * @sp.vec: single SP value plus optional preference list | ||
48 | * @sp.len: length of @sp.vec in bytes | ||
49 | */ | ||
50 | typedef union { | ||
51 | u64 nn; | ||
52 | struct { | ||
53 | u8 *vec; | ||
54 | u8 len; | ||
55 | } sp; | ||
56 | } dccp_feat_val; | ||
57 | |||
58 | /** | ||
59 | * struct dccp_feat_entry - Data structure to perform feature negotiation | ||
60 | * @feat_num: one of %dccp_feature_numbers | ||
61 | * @val: feature's current value (SP features may have preference list) | ||
62 | * @state: feature's current state | ||
63 | * @needs_mandatory: whether Mandatory options should be sent | ||
64 | * @needs_confirm: whether to send a Confirm instead of a Change | ||
65 | * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) | ||
66 | * @is_local: feature-local (1) or feature-remote (0) | ||
67 | * @node: list pointers, entries arranged in FIFO order | ||
68 | */ | ||
69 | struct dccp_feat_entry { | ||
70 | u8 feat_num; | ||
71 | dccp_feat_val val; | ||
72 | enum dccp_feat_state state:8; | ||
73 | bool needs_mandatory:1, | ||
74 | needs_confirm:1, | ||
75 | empty_confirm:1, | ||
76 | is_local:1; | ||
77 | |||
78 | struct list_head node; | ||
79 | }; | ||
80 | |||
81 | static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) | ||
22 | { | 82 | { |
23 | dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), | 83 | if (entry->needs_confirm) |
24 | dccp_feat_name(feat), feat, val); | 84 | return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; |
85 | return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; | ||
25 | } | 86 | } |
26 | #else | ||
27 | #define dccp_feat_debug(type, feat, val) | ||
28 | #endif /* CONFIG_IP_DCCP_DEBUG */ | ||
29 | |||
30 | extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, | ||
31 | u8 *val, u8 len, gfp_t gfp); | ||
32 | extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, | ||
33 | u8 *val, u8 len); | ||
34 | extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, | ||
35 | u8 *val, u8 len); | ||
36 | extern void dccp_feat_clean(struct dccp_minisock *dmsk); | ||
37 | extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); | ||
38 | extern int dccp_feat_init(struct dccp_minisock *dmsk); | ||
39 | 87 | ||
88 | /** | ||
89 | * struct ccid_dependency - Track changes resulting from choosing a CCID | ||
90 | * @dependent_feat: one of %dccp_feature_numbers | ||
91 | * @is_local: local (1) or remote (0) @dependent_feat | ||
92 | * @is_mandatory: whether presence of @dependent_feat is mission-critical or not | ||
93 | * @val: corresponding default value for @dependent_feat (u8 is sufficient here) | ||
94 | */ | ||
95 | struct ccid_dependency { | ||
96 | u8 dependent_feat; | ||
97 | bool is_local:1, | ||
98 | is_mandatory:1; | ||
99 | u8 val; | ||
100 | }; | ||
101 | |||
102 | /* | ||
103 | * Sysctls to seed defaults for feature negotiation | ||
104 | */ | ||
105 | extern unsigned long sysctl_dccp_sequence_window; | ||
106 | extern int sysctl_dccp_rx_ccid; | ||
107 | extern int sysctl_dccp_tx_ccid; | ||
108 | |||
109 | extern int dccp_feat_init(struct sock *sk); | ||
110 | extern void dccp_feat_initialise_sysctls(void); | ||
111 | extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, | ||
112 | u8 const *list, u8 len); | ||
113 | extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); | ||
114 | extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, | ||
115 | u8 mand, u8 opt, u8 feat, u8 *val, u8 len); | ||
116 | extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); | ||
117 | |||
118 | /* | ||
119 | * Encoding variable-length options and their maximum length. | ||
120 | * | ||
121 | * This affects NN options (SP options are all u8) and other variable-length | ||
122 | * options (see table 3 in RFC 4340). The limit is currently given by the Sequence | ||
123 | * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option; all other | ||
124 | * options consume less than 6 bytes (timestamps are 4 bytes). | ||
125 | * When updating this constant (e.g. due to new internet drafts / RFCs), make | ||
126 | * sure that you also update all code which refers to it. | ||
127 | */ | ||
128 | #define DCCP_OPTVAL_MAXLEN 6 | ||
129 | |||
130 | extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); | ||
131 | extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); | ||
132 | |||
133 | extern int dccp_insert_option_mandatory(struct sk_buff *skb); | ||
134 | extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, | ||
135 | u8 *val, u8 len, bool repeat_first); | ||
40 | #endif /* _DCCP_FEAT_H */ | 136 | #endif /* _DCCP_FEAT_H */ |
diff --git a/net/dccp/input.c b/net/dccp/input.c index 779d0ed9ae94..df0e6714aa11 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c | |||
@@ -159,13 +159,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) | |||
159 | dccp_time_wait(sk, DCCP_TIME_WAIT, 0); | 159 | dccp_time_wait(sk, DCCP_TIME_WAIT, 0); |
160 | } | 160 | } |
161 | 161 | ||
162 | static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) | 162 | static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) |
163 | { | 163 | { |
164 | struct dccp_sock *dp = dccp_sk(sk); | 164 | struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; |
165 | 165 | ||
166 | if (dccp_msk(sk)->dccpms_send_ack_vector) | 166 | if (av == NULL) |
167 | dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, | 167 | return; |
168 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | 168 | if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) |
169 | dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
170 | dccp_ackvec_input(av, skb); | ||
169 | } | 171 | } |
170 | 172 | ||
171 | static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) | 173 | static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) |
@@ -364,22 +366,13 @@ discard: | |||
364 | int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, | 366 | int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, |
365 | const struct dccp_hdr *dh, const unsigned len) | 367 | const struct dccp_hdr *dh, const unsigned len) |
366 | { | 368 | { |
367 | struct dccp_sock *dp = dccp_sk(sk); | ||
368 | |||
369 | if (dccp_check_seqno(sk, skb)) | 369 | if (dccp_check_seqno(sk, skb)) |
370 | goto discard; | 370 | goto discard; |
371 | 371 | ||
372 | if (dccp_parse_options(sk, NULL, skb)) | 372 | if (dccp_parse_options(sk, NULL, skb)) |
373 | return 1; | 373 | return 1; |
374 | 374 | ||
375 | if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) | 375 | dccp_handle_ackvec_processing(sk, skb); |
376 | dccp_event_ack_recv(sk, skb); | ||
377 | |||
378 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
379 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
380 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
381 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
382 | goto discard; | ||
383 | dccp_deliver_input_to_ccids(sk, skb); | 376 | dccp_deliver_input_to_ccids(sk, skb); |
384 | 377 | ||
385 | return __dccp_rcv_established(sk, skb, dh, len); | 378 | return __dccp_rcv_established(sk, skb, dh, len); |
@@ -421,40 +414,33 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, | |||
421 | goto out_invalid_packet; | 414 | goto out_invalid_packet; |
422 | } | 415 | } |
423 | 416 | ||
417 | /* | ||
418 | * If option processing (Step 8) failed, return 1 here so that | ||
419 | * dccp_v4_do_rcv() sends a Reset. The Reset code depends on | ||
420 | * the option type and is set in dccp_parse_options(). | ||
421 | */ | ||
424 | if (dccp_parse_options(sk, NULL, skb)) | 422 | if (dccp_parse_options(sk, NULL, skb)) |
425 | goto out_invalid_packet; | 423 | return 1; |
426 | 424 | ||
427 | /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ | 425 | /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ |
428 | if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) | 426 | if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) |
429 | dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - | 427 | dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - |
430 | dp->dccps_options_received.dccpor_timestamp_echo)); | 428 | dp->dccps_options_received.dccpor_timestamp_echo)); |
431 | 429 | ||
432 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
433 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
434 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
435 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
436 | goto out_invalid_packet; /* FIXME: change error code */ | ||
437 | |||
438 | /* Stop the REQUEST timer */ | 430 | /* Stop the REQUEST timer */ |
439 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 431 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
440 | WARN_ON(sk->sk_send_head == NULL); | 432 | WARN_ON(sk->sk_send_head == NULL); |
441 | kfree_skb(sk->sk_send_head); | 433 | kfree_skb(sk->sk_send_head); |
442 | sk->sk_send_head = NULL; | 434 | sk->sk_send_head = NULL; |
443 | 435 | ||
444 | dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; | ||
445 | dccp_update_gsr(sk, dp->dccps_isr); | ||
446 | /* | 436 | /* |
447 | * SWL and AWL are initially adjusted so that they are not less than | 437 | * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect |
448 | * the initial Sequence Numbers received and sent, respectively: | 438 | * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH |
449 | * SWL := max(GSR + 1 - floor(W/4), ISR), | 439 | * is done as part of activating the feature values below, since |
450 | * AWL := max(GSS - W' + 1, ISS). | 440 | * these settings depend on the local/remote Sequence Window |
451 | * These adjustments MUST be applied only at the beginning of the | 441 | * features, which were undefined or not confirmed until now. |
452 | * connection. | ||
453 | * | ||
454 | * AWL was adjusted in dccp_v4_connect -acme | ||
455 | */ | 442 | */ |
456 | dccp_set_seqno(&dp->dccps_swl, | 443 | dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; |
457 | max48(dp->dccps_swl, dp->dccps_isr)); | ||
458 | 444 | ||
459 | dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 445 | dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
460 | 446 | ||
@@ -475,6 +461,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, | |||
475 | */ | 461 | */ |
476 | dccp_set_state(sk, DCCP_PARTOPEN); | 462 | dccp_set_state(sk, DCCP_PARTOPEN); |
477 | 463 | ||
464 | /* | ||
465 | * If feature negotiation was successful, activate features now; | ||
466 | * an activation failure means that this host could not activate | ||
468 | * one or more features (e.g. insufficient memory), which would | ||
468 | * leave at least one feature in an undefined state. | ||
469 | */ | ||
470 | if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) | ||
471 | goto unable_to_proceed; | ||
472 | |||
478 | /* Make sure socket is routed, for correct metrics. */ | 473 | /* Make sure socket is routed, for correct metrics. */ |
479 | icsk->icsk_af_ops->rebuild_header(sk); | 474 | icsk->icsk_af_ops->rebuild_header(sk); |
480 | 475 | ||
@@ -509,6 +504,16 @@ out_invalid_packet: | |||
509 | /* dccp_v4_do_rcv will send a reset */ | 504 | /* dccp_v4_do_rcv will send a reset */ |
510 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; | 505 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; |
511 | return 1; | 506 | return 1; |
507 | |||
508 | unable_to_proceed: | ||
509 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; | ||
510 | /* | ||
511 | * We mark this socket as no longer usable, so that the loop in | ||
512 | * dccp_sendmsg() terminates and the application gets notified. | ||
513 | */ | ||
514 | dccp_set_state(sk, DCCP_CLOSED); | ||
515 | sk->sk_err = ECOMM; | ||
516 | return 1; | ||
512 | } | 517 | } |
513 | 518 | ||
514 | static int dccp_rcv_respond_partopen_state_process(struct sock *sk, | 519 | static int dccp_rcv_respond_partopen_state_process(struct sock *sk, |
@@ -590,8 +595,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
590 | if (inet_csk(sk)->icsk_af_ops->conn_request(sk, | 595 | if (inet_csk(sk)->icsk_af_ops->conn_request(sk, |
591 | skb) < 0) | 596 | skb) < 0) |
592 | return 1; | 597 | return 1; |
593 | |||
594 | /* FIXME: do congestion control initialization */ | ||
595 | goto discard; | 598 | goto discard; |
596 | } | 599 | } |
597 | if (dh->dccph_type == DCCP_PKT_RESET) | 600 | if (dh->dccph_type == DCCP_PKT_RESET) |
@@ -600,30 +603,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
600 | /* Caller (dccp_v4_do_rcv) will send Reset */ | 603 | /* Caller (dccp_v4_do_rcv) will send Reset */ |
601 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | 604 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; |
602 | return 1; | 605 | return 1; |
606 | } else if (sk->sk_state == DCCP_CLOSED) { | ||
607 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | ||
608 | return 1; | ||
603 | } | 609 | } |
604 | 610 | ||
605 | if (sk->sk_state != DCCP_REQUESTING) { | 611 | /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ |
606 | if (dccp_check_seqno(sk, skb)) | 612 | if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) |
607 | goto discard; | 613 | goto discard; |
608 | |||
609 | /* | ||
610 | * Step 8: Process options and mark acknowledgeable | ||
611 | */ | ||
612 | if (dccp_parse_options(sk, NULL, skb)) | ||
613 | return 1; | ||
614 | |||
615 | if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) | ||
616 | dccp_event_ack_recv(sk, skb); | ||
617 | |||
618 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
619 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
620 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
621 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
622 | goto discard; | ||
623 | 614 | ||
624 | dccp_deliver_input_to_ccids(sk, skb); | 615 | /* |
616 | * Step 7: Check for unexpected packet types | ||
617 | * If (S.is_server and P.type == Response) | ||
618 | * or (S.is_client and P.type == Request) | ||
619 | * or (S.state == RESPOND and P.type == Data), | ||
620 | * Send Sync packet acknowledging P.seqno | ||
621 | * Drop packet and return | ||
622 | */ | ||
623 | if ((dp->dccps_role != DCCP_ROLE_CLIENT && | ||
624 | dh->dccph_type == DCCP_PKT_RESPONSE) || | ||
625 | (dp->dccps_role == DCCP_ROLE_CLIENT && | ||
626 | dh->dccph_type == DCCP_PKT_REQUEST) || | ||
627 | (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { | ||
628 | dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); | ||
629 | goto discard; | ||
625 | } | 630 | } |
626 | 631 | ||
632 | /* Step 8: Process options */ | ||
633 | if (dccp_parse_options(sk, NULL, skb)) | ||
634 | return 1; | ||
635 | |||
627 | /* | 636 | /* |
628 | * Step 9: Process Reset | 637 | * Step 9: Process Reset |
629 | * If P.type == Reset, | 638 | * If P.type == Reset, |
@@ -631,44 +640,22 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
631 | * S.state := TIMEWAIT | 640 | * S.state := TIMEWAIT |
632 | * Set TIMEWAIT timer | 641 | * Set TIMEWAIT timer |
633 | * Drop packet and return | 642 | * Drop packet and return |
634 | */ | 643 | */ |
635 | if (dh->dccph_type == DCCP_PKT_RESET) { | 644 | if (dh->dccph_type == DCCP_PKT_RESET) { |
636 | dccp_rcv_reset(sk, skb); | 645 | dccp_rcv_reset(sk, skb); |
637 | return 0; | 646 | return 0; |
638 | /* | 647 | } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ |
639 | * Step 7: Check for unexpected packet types | ||
640 | * If (S.is_server and P.type == Response) | ||
641 | * or (S.is_client and P.type == Request) | ||
642 | * or (S.state == RESPOND and P.type == Data), | ||
643 | * Send Sync packet acknowledging P.seqno | ||
644 | * Drop packet and return | ||
645 | */ | ||
646 | } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && | ||
647 | dh->dccph_type == DCCP_PKT_RESPONSE) || | ||
648 | (dp->dccps_role == DCCP_ROLE_CLIENT && | ||
649 | dh->dccph_type == DCCP_PKT_REQUEST) || | ||
650 | (sk->sk_state == DCCP_RESPOND && | ||
651 | dh->dccph_type == DCCP_PKT_DATA)) { | ||
652 | dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); | ||
653 | goto discard; | ||
654 | } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { | ||
655 | if (dccp_rcv_closereq(sk, skb)) | 648 | if (dccp_rcv_closereq(sk, skb)) |
656 | return 0; | 649 | return 0; |
657 | goto discard; | 650 | goto discard; |
658 | } else if (dh->dccph_type == DCCP_PKT_CLOSE) { | 651 | } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ |
659 | if (dccp_rcv_close(sk, skb)) | 652 | if (dccp_rcv_close(sk, skb)) |
660 | return 0; | 653 | return 0; |
661 | goto discard; | 654 | goto discard; |
662 | } | 655 | } |
663 | 656 | ||
664 | switch (sk->sk_state) { | 657 | switch (sk->sk_state) { |
665 | case DCCP_CLOSED: | ||
666 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | ||
667 | return 1; | ||
668 | |||
669 | case DCCP_REQUESTING: | 658 | case DCCP_REQUESTING: |
670 | /* FIXME: do congestion control initialization */ | ||
671 | |||
672 | queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); | 659 | queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); |
673 | if (queued >= 0) | 660 | if (queued >= 0) |
674 | return queued; | 661 | return queued; |
@@ -676,8 +663,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
676 | __kfree_skb(skb); | 663 | __kfree_skb(skb); |
677 | return 0; | 664 | return 0; |
678 | 665 | ||
679 | case DCCP_RESPOND: | ||
680 | case DCCP_PARTOPEN: | 666 | case DCCP_PARTOPEN: |
667 | /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ | ||
668 | dccp_handle_ackvec_processing(sk, skb); | ||
669 | dccp_deliver_input_to_ccids(sk, skb); | ||
670 | /* fall through */ | ||
671 | case DCCP_RESPOND: | ||
681 | queued = dccp_rcv_respond_partopen_state_process(sk, skb, | 672 | queued = dccp_rcv_respond_partopen_state_process(sk, skb, |
682 | dh, len); | 673 | dh, len); |
683 | break; | 674 | break; |
@@ -716,16 +707,7 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) | |||
716 | /* dccpor_elapsed_time is either zeroed out or set and > 0 */ | 707 | /* dccpor_elapsed_time is either zeroed out or set and > 0 */ |
717 | delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; | 708 | delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; |
718 | 709 | ||
719 | if (unlikely(delta <= 0)) { | 710 | return dccp_sane_rtt(delta); |
720 | DCCP_WARN("unusable RTT sample %ld, using min\n", delta); | ||
721 | return DCCP_SANE_RTT_MIN; | ||
722 | } | ||
723 | if (unlikely(delta > DCCP_SANE_RTT_MAX)) { | ||
724 | DCCP_WARN("RTT sample %ld too large, using max\n", delta); | ||
725 | return DCCP_SANE_RTT_MAX; | ||
726 | } | ||
727 | |||
728 | return delta; | ||
729 | } | 711 | } |
730 | 712 | ||
731 | EXPORT_SYMBOL_GPL(dccp_sample_rtt); | 713 | EXPORT_SYMBOL_GPL(dccp_sample_rtt); |
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 882c5c4de69e..b623f6b25482 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -545,6 +545,7 @@ out: | |||
545 | 545 | ||
546 | static void dccp_v4_reqsk_destructor(struct request_sock *req) | 546 | static void dccp_v4_reqsk_destructor(struct request_sock *req) |
547 | { | 547 | { |
548 | dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); | ||
548 | kfree(inet_rsk(req)->opt); | 549 | kfree(inet_rsk(req)->opt); |
549 | } | 550 | } |
550 | 551 | ||
@@ -595,7 +596,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
595 | if (req == NULL) | 596 | if (req == NULL) |
596 | goto drop; | 597 | goto drop; |
597 | 598 | ||
598 | dccp_reqsk_init(req, skb); | 599 | if (dccp_reqsk_init(req, dccp_sk(sk), skb)) |
600 | goto drop_and_free; | ||
599 | 601 | ||
600 | dreq = dccp_rsk(req); | 602 | dreq = dccp_rsk(req); |
601 | if (dccp_parse_options(sk, dreq, skb)) | 603 | if (dccp_parse_options(sk, dreq, skb)) |
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0da2c40..ad6212e00435 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c | |||
@@ -302,6 +302,7 @@ done: | |||
302 | 302 | ||
303 | static void dccp_v6_reqsk_destructor(struct request_sock *req) | 303 | static void dccp_v6_reqsk_destructor(struct request_sock *req) |
304 | { | 304 | { |
305 | dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); | ||
305 | if (inet6_rsk(req)->pktopts != NULL) | 306 | if (inet6_rsk(req)->pktopts != NULL) |
306 | kfree_skb(inet6_rsk(req)->pktopts); | 307 | kfree_skb(inet6_rsk(req)->pktopts); |
307 | } | 308 | } |
@@ -424,7 +425,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) | |||
424 | if (req == NULL) | 425 | if (req == NULL) |
425 | goto drop; | 426 | goto drop; |
426 | 427 | ||
427 | dccp_reqsk_init(req, skb); | 428 | if (dccp_reqsk_init(req, dccp_sk(sk), skb)) |
429 | goto drop_and_free; | ||
428 | 430 | ||
429 | dreq = dccp_rsk(req); | 431 | dreq = dccp_rsk(req); |
430 | if (dccp_parse_options(sk, dreq, skb)) | 432 | if (dccp_parse_options(sk, dreq, skb)) |
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2d1b8c..f4d9c8f60ede 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c | |||
@@ -42,16 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { | |||
42 | 42 | ||
43 | EXPORT_SYMBOL_GPL(dccp_death_row); | 43 | EXPORT_SYMBOL_GPL(dccp_death_row); |
44 | 44 | ||
45 | void dccp_minisock_init(struct dccp_minisock *dmsk) | ||
46 | { | ||
47 | dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; | ||
48 | dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; | ||
49 | dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; | ||
50 | dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; | ||
51 | dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; | ||
52 | dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; | ||
53 | } | ||
54 | |||
55 | void dccp_time_wait(struct sock *sk, int state, int timeo) | 45 | void dccp_time_wait(struct sock *sk, int state, int timeo) |
56 | { | 46 | { |
57 | struct inet_timewait_sock *tw = NULL; | 47 | struct inet_timewait_sock *tw = NULL; |
@@ -112,10 +102,9 @@ struct sock *dccp_create_openreq_child(struct sock *sk, | |||
112 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); | 102 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
113 | 103 | ||
114 | if (newsk != NULL) { | 104 | if (newsk != NULL) { |
115 | const struct dccp_request_sock *dreq = dccp_rsk(req); | 105 | struct dccp_request_sock *dreq = dccp_rsk(req); |
116 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 106 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
117 | struct dccp_sock *newdp = dccp_sk(newsk); | 107 | struct dccp_sock *newdp = dccp_sk(newsk); |
118 | struct dccp_minisock *newdmsk = dccp_msk(newsk); | ||
119 | 108 | ||
120 | newdp->dccps_role = DCCP_ROLE_SERVER; | 109 | newdp->dccps_role = DCCP_ROLE_SERVER; |
121 | newdp->dccps_hc_rx_ackvec = NULL; | 110 | newdp->dccps_hc_rx_ackvec = NULL; |
@@ -125,65 +114,32 @@ struct sock *dccp_create_openreq_child(struct sock *sk, | |||
125 | newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; | 114 | newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; |
126 | newicsk->icsk_rto = DCCP_TIMEOUT_INIT; | 115 | newicsk->icsk_rto = DCCP_TIMEOUT_INIT; |
127 | 116 | ||
128 | if (dccp_feat_clone(sk, newsk)) | 117 | INIT_LIST_HEAD(&newdp->dccps_featneg); |
129 | goto out_free; | ||
130 | |||
131 | if (newdmsk->dccpms_send_ack_vector) { | ||
132 | newdp->dccps_hc_rx_ackvec = | ||
133 | dccp_ackvec_alloc(GFP_ATOMIC); | ||
134 | if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) | ||
135 | goto out_free; | ||
136 | } | ||
137 | |||
138 | newdp->dccps_hc_rx_ccid = | ||
139 | ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, | ||
140 | newsk, GFP_ATOMIC); | ||
141 | newdp->dccps_hc_tx_ccid = | ||
142 | ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, | ||
143 | newsk, GFP_ATOMIC); | ||
144 | if (unlikely(newdp->dccps_hc_rx_ccid == NULL || | ||
145 | newdp->dccps_hc_tx_ccid == NULL)) { | ||
146 | dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); | ||
147 | ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); | ||
148 | ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); | ||
149 | out_free: | ||
150 | /* It is still raw copy of parent, so invalidate | ||
151 | * destructor and make plain sk_free() */ | ||
152 | newsk->sk_destruct = NULL; | ||
153 | sk_free(newsk); | ||
154 | return NULL; | ||
155 | } | ||
156 | |||
157 | /* | 118 | /* |
158 | * Step 3: Process LISTEN state | 119 | * Step 3: Process LISTEN state |
159 | * | 120 | * |
160 | * Choose S.ISS (initial seqno) or set from Init Cookies | 121 | * Choose S.ISS (initial seqno) or set from Init Cookies |
161 | * Initialize S.GAR := S.ISS | 122 | * Initialize S.GAR := S.ISS |
162 | * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies | 123 | * Set S.ISR, S.GSR from packet (or Init Cookies) |
124 | * | ||
125 | * Setting AWL/AWH and SWL/SWH happens as part of the feature | ||
126 | * activation below, as these windows all depend on the local | ||
127 | * and remote Sequence Window feature values (7.5.2). | ||
163 | */ | 128 | */ |
164 | 129 | newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; | |
165 | /* See dccp_v4_conn_request */ | 130 | newdp->dccps_gar = newdp->dccps_iss; |
166 | newdmsk->dccpms_sequence_window = req->rcv_wnd; | 131 | newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; |
167 | |||
168 | newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; | ||
169 | dccp_update_gss(newsk, dreq->dreq_iss); | ||
170 | |||
171 | newdp->dccps_isr = dreq->dreq_isr; | ||
172 | dccp_update_gsr(newsk, dreq->dreq_isr); | ||
173 | 132 | ||
174 | /* | 133 | /* |
175 | * SWL and AWL are initially adjusted so that they are not less than | 134 | * Activate features: initialise CCIDs, sequence windows etc. |
176 | * the initial Sequence Numbers received and sent, respectively: | ||
177 | * SWL := max(GSR + 1 - floor(W/4), ISR), | ||
178 | * AWL := max(GSS - W' + 1, ISS). | ||
179 | * These adjustments MUST be applied only at the beginning of the | ||
180 | * connection. | ||
181 | */ | 135 | */ |
182 | dccp_set_seqno(&newdp->dccps_swl, | 136 | if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { |
183 | max48(newdp->dccps_swl, newdp->dccps_isr)); | 137 | /* It is still raw copy of parent, so invalidate |
184 | dccp_set_seqno(&newdp->dccps_awl, | 138 | * destructor and make plain sk_free() */ |
185 | max48(newdp->dccps_awl, newdp->dccps_iss)); | 139 | newsk->sk_destruct = NULL; |
186 | 140 | sk_free(newsk); | |
141 | return NULL; | ||
142 | } | ||
187 | dccp_init_xmit_timers(newsk); | 143 | dccp_init_xmit_timers(newsk); |
188 | 144 | ||
189 | DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); | 145 | DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); |
@@ -304,14 +260,17 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
304 | 260 | ||
305 | EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); | 261 | EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); |
306 | 262 | ||
307 | void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) | 263 | int dccp_reqsk_init(struct request_sock *req, |
264 | struct dccp_sock const *dp, struct sk_buff const *skb) | ||
308 | { | 265 | { |
309 | struct dccp_request_sock *dreq = dccp_rsk(req); | 266 | struct dccp_request_sock *dreq = dccp_rsk(req); |
310 | 267 | ||
311 | inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; | 268 | inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; |
312 | inet_rsk(req)->acked = 0; | 269 | inet_rsk(req)->acked = 0; |
313 | req->rcv_wnd = sysctl_dccp_feat_sequence_window; | ||
314 | dreq->dreq_timestamp_echo = 0; | 270 | dreq->dreq_timestamp_echo = 0; |
271 | |||
272 | /* inherit feature negotiation options from listening socket */ | ||
273 | return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); | ||
315 | } | 274 | } |
316 | 275 | ||
317 | EXPORT_SYMBOL_GPL(dccp_reqsk_init); | 276 | EXPORT_SYMBOL_GPL(dccp_reqsk_init); |
diff --git a/net/dccp/options.c b/net/dccp/options.c index 0809b63cb055..e5a32979d7d7 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c | |||
@@ -23,23 +23,20 @@ | |||
23 | #include "dccp.h" | 23 | #include "dccp.h" |
24 | #include "feat.h" | 24 | #include "feat.h" |
25 | 25 | ||
26 | int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; | 26 | u64 dccp_decode_value_var(const u8 *bf, const u8 len) |
27 | int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; | ||
28 | int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; | ||
29 | int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; | ||
30 | int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; | ||
31 | int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; | ||
32 | |||
33 | static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) | ||
34 | { | 27 | { |
35 | u32 value = 0; | 28 | u64 value = 0; |
36 | 29 | ||
30 | if (len >= DCCP_OPTVAL_MAXLEN) | ||
31 | value += ((u64)*bf++) << 40; | ||
32 | if (len > 4) | ||
33 | value += ((u64)*bf++) << 32; | ||
37 | if (len > 3) | 34 | if (len > 3) |
38 | value += *bf++ << 24; | 35 | value += ((u64)*bf++) << 24; |
39 | if (len > 2) | 36 | if (len > 2) |
40 | value += *bf++ << 16; | 37 | value += ((u64)*bf++) << 16; |
41 | if (len > 1) | 38 | if (len > 1) |
42 | value += *bf++ << 8; | 39 | value += ((u64)*bf++) << 8; |
43 | if (len > 0) | 40 | if (len > 0) |
44 | value += *bf; | 41 | value += *bf; |
45 | 42 | ||
@@ -57,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
57 | struct dccp_sock *dp = dccp_sk(sk); | 54 | struct dccp_sock *dp = dccp_sk(sk); |
58 | const struct dccp_hdr *dh = dccp_hdr(skb); | 55 | const struct dccp_hdr *dh = dccp_hdr(skb); |
59 | const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; | 56 | const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; |
60 | u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; | ||
61 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); | 57 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); |
62 | unsigned char *opt_ptr = options; | 58 | unsigned char *opt_ptr = options; |
63 | const unsigned char *opt_end = (unsigned char *)dh + | 59 | const unsigned char *opt_end = (unsigned char *)dh + |
@@ -99,18 +95,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
99 | } | 95 | } |
100 | 96 | ||
101 | /* | 97 | /* |
102 | * CCID-Specific Options (from RFC 4340, sec. 10.3): | ||
103 | * | ||
104 | * Option numbers 128 through 191 are for options sent from the | ||
105 | * HC-Sender to the HC-Receiver; option numbers 192 through 255 | ||
106 | * are for options sent from the HC-Receiver to the HC-Sender. | ||
107 | * | ||
108 | * CCID-specific options are ignored during connection setup, as | 98 | * CCID-specific options are ignored during connection setup, as |
109 | * negotiation may still be in progress (see RFC 4340, 10.3). | 99 | * negotiation may still be in progress (see RFC 4340, 10.3). |
110 | * The same applies to Ack Vectors, as these depend on the CCID. | 100 | * The same applies to Ack Vectors, as these depend on the CCID. |
111 | * | ||
112 | */ | 101 | */ |
113 | if (dreq != NULL && (opt >= 128 || | 102 | if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || |
114 | opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) | 103 | opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) |
115 | goto ignore_option; | 104 | goto ignore_option; |
116 | 105 | ||
@@ -131,43 +120,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
131 | dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), | 120 | dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), |
132 | (unsigned long long)opt_recv->dccpor_ndp); | 121 | (unsigned long long)opt_recv->dccpor_ndp); |
133 | break; | 122 | break; |
134 | case DCCPO_CHANGE_L: | 123 | case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: |
135 | /* fall through */ | 124 | if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ |
136 | case DCCPO_CHANGE_R: | ||
137 | if (pkt_type == DCCP_PKT_DATA) | ||
138 | break; | 125 | break; |
139 | if (len < 2) | 126 | rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, |
140 | goto out_invalid_option; | 127 | *value, value + 1, len - 1); |
141 | rc = dccp_feat_change_recv(sk, opt, *value, value + 1, | 128 | if (rc) |
142 | len - 1); | 129 | goto out_featneg_failed; |
143 | /* | ||
144 | * When there is a change error, change_recv is | ||
145 | * responsible for dealing with it. i.e. reply with an | ||
146 | * empty confirm. | ||
147 | * If the change was mandatory, then we need to die. | ||
148 | */ | ||
149 | if (rc && mandatory) | ||
150 | goto out_invalid_option; | ||
151 | break; | ||
152 | case DCCPO_CONFIRM_L: | ||
153 | /* fall through */ | ||
154 | case DCCPO_CONFIRM_R: | ||
155 | if (pkt_type == DCCP_PKT_DATA) | ||
156 | break; | ||
157 | if (len < 2) /* FIXME this disallows empty confirm */ | ||
158 | goto out_invalid_option; | ||
159 | if (dccp_feat_confirm_recv(sk, opt, *value, | ||
160 | value + 1, len - 1)) | ||
161 | goto out_invalid_option; | ||
162 | break; | ||
163 | case DCCPO_ACK_VECTOR_0: | ||
164 | case DCCPO_ACK_VECTOR_1: | ||
165 | if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ | ||
166 | break; | ||
167 | |||
168 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
169 | dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) | ||
170 | goto out_invalid_option; | ||
171 | break; | 130 | break; |
172 | case DCCPO_TIMESTAMP: | 131 | case DCCPO_TIMESTAMP: |
173 | if (len != 4) | 132 | if (len != 4) |
@@ -195,6 +154,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
195 | dccp_role(sk), ntohl(opt_val), | 154 | dccp_role(sk), ntohl(opt_val), |
196 | (unsigned long long) | 155 | (unsigned long long) |
197 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | 156 | DCCP_SKB_CB(skb)->dccpd_ack_seq); |
157 | /* schedule an Ack in case this sender is quiescent */ | ||
158 | inet_csk_schedule_ack(sk); | ||
198 | break; | 159 | break; |
199 | case DCCPO_TIMESTAMP_ECHO: | 160 | case DCCPO_TIMESTAMP_ECHO: |
200 | if (len != 4 && len != 6 && len != 8) | 161 | if (len != 4 && len != 6 && len != 8) |
@@ -251,23 +212,25 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
251 | dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", | 212 | dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", |
252 | dccp_role(sk), elapsed_time); | 213 | dccp_role(sk), elapsed_time); |
253 | break; | 214 | break; |
254 | case 128 ... 191: { | 215 | case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: |
255 | const u16 idx = value - options; | ||
256 | |||
257 | if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, | 216 | if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, |
258 | opt, len, idx, | 217 | pkt_type, opt, value, len)) |
259 | value) != 0) | ||
260 | goto out_invalid_option; | 218 | goto out_invalid_option; |
261 | } | ||
262 | break; | 219 | break; |
263 | case 192 ... 255: { | 220 | case DCCPO_ACK_VECTOR_0: |
264 | const u16 idx = value - options; | 221 | case DCCPO_ACK_VECTOR_1: |
265 | 222 | if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ | |
223 | break; | ||
224 | /* | ||
225 | * Ack vectors are processed by the TX CCID if it is | ||
226 | * interested. The RX CCID need not parse Ack Vectors, | ||
227 | * since it is only interested in clearing old state. | ||
228 | * Fall through. | ||
229 | */ | ||
230 | case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: | ||
266 | if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, | 231 | if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, |
267 | opt, len, idx, | 232 | pkt_type, opt, value, len)) |
268 | value) != 0) | ||
269 | goto out_invalid_option; | 233 | goto out_invalid_option; |
270 | } | ||
271 | break; | 234 | break; |
272 | default: | 235 | default: |
273 | DCCP_CRIT("DCCP(%p): option %d(len=%d) not " | 236 | DCCP_CRIT("DCCP(%p): option %d(len=%d) not " |
@@ -289,8 +252,10 @@ out_nonsensical_length: | |||
289 | 252 | ||
290 | out_invalid_option: | 253 | out_invalid_option: |
291 | DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); | 254 | DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); |
292 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; | 255 | rc = DCCP_RESET_CODE_OPTION_ERROR; |
293 | DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); | 256 | out_featneg_failed: |
257 | DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); | ||
258 | DCCP_SKB_CB(skb)->dccpd_reset_code = rc; | ||
294 | DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; | 259 | DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; |
295 | DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; | 260 | DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; |
296 | DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; | 261 | DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; |
@@ -299,9 +264,12 @@ out_invalid_option: | |||
299 | 264 | ||
300 | EXPORT_SYMBOL_GPL(dccp_parse_options); | 265 | EXPORT_SYMBOL_GPL(dccp_parse_options); |
301 | 266 | ||
302 | static void dccp_encode_value_var(const u32 value, unsigned char *to, | 267 | void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) |
303 | const unsigned int len) | ||
304 | { | 268 | { |
269 | if (len >= DCCP_OPTVAL_MAXLEN) | ||
270 | *to++ = (value & 0xFF0000000000ull) >> 40; | ||
271 | if (len > 4) | ||
272 | *to++ = (value & 0xFF00000000ull) >> 32; | ||
305 | if (len > 3) | 273 | if (len > 3) |
306 | *to++ = (value & 0xFF000000) >> 24; | 274 | *to++ = (value & 0xFF000000) >> 24; |
307 | if (len > 2) | 275 | if (len > 2) |
@@ -461,92 +429,140 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, | |||
461 | return 0; | 429 | return 0; |
462 | } | 430 | } |
463 | 431 | ||
464 | static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, | 432 | static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) |
465 | u8 *val, u8 len) | ||
466 | { | 433 | { |
467 | u8 *to; | 434 | struct dccp_sock *dp = dccp_sk(sk); |
435 | struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; | ||
436 | struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); | ||
437 | const u16 buflen = dccp_ackvec_buflen(av); | ||
438 | /* Figure out how many options do we need to represent the ackvec */ | ||
439 | const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); | ||
440 | u16 len = buflen + 2 * nr_opts; | ||
441 | u8 i, nonce = 0; | ||
442 | const unsigned char *tail, *from; | ||
443 | unsigned char *to; | ||
468 | 444 | ||
469 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { | 445 | if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { |
470 | DCCP_WARN("packet too small for feature %d option!\n", feat); | 446 | DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, |
447 | dccp_packet_name(dcb->dccpd_type)); | ||
471 | return -1; | 448 | return -1; |
472 | } | 449 | } |
450 | /* | ||
451 | * Since Ack Vectors are variable-length, we can not always predict | ||
452 | * their size. To catch exception cases where the space is running out | ||
453 | * on the skb, a separate Sync is scheduled to carry the Ack Vector. | ||
454 | */ | ||
455 | if (len > DCCPAV_MIN_OPTLEN && | ||
456 | len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { | ||
457 | DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " | ||
458 | "MPS=%u ==> reduce payload size?\n", len, skb->len, | ||
459 | dcb->dccpd_opt_len, dp->dccps_mss_cache); | ||
460 | dp->dccps_sync_scheduled = 1; | ||
461 | return 0; | ||
462 | } | ||
463 | dcb->dccpd_opt_len += len; | ||
473 | 464 | ||
474 | DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; | 465 | to = skb_push(skb, len); |
466 | len = buflen; | ||
467 | from = av->av_buf + av->av_buf_head; | ||
468 | tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; | ||
475 | 469 | ||
476 | to = skb_push(skb, len + 3); | 470 | for (i = 0; i < nr_opts; ++i) { |
477 | *to++ = type; | 471 | int copylen = len; |
478 | *to++ = len + 3; | ||
479 | *to++ = feat; | ||
480 | 472 | ||
481 | if (len) | 473 | if (len > DCCP_SINGLE_OPT_MAXLEN) |
482 | memcpy(to, val, len); | 474 | copylen = DCCP_SINGLE_OPT_MAXLEN; |
475 | |||
476 | /* | ||
477 | * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via | ||
478 | * its type; ack_nonce is the sum of all individual buf_nonce's. | ||
479 | */ | ||
480 | nonce ^= av->av_buf_nonce[i]; | ||
481 | |||
482 | *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; | ||
483 | *to++ = copylen + 2; | ||
483 | 484 | ||
484 | dccp_pr_debug("%s(%s (%d), ...), length %d\n", | 485 | /* Check if buf_head wraps */ |
485 | dccp_feat_typename(type), | 486 | if (from + copylen > tail) { |
486 | dccp_feat_name(feat), feat, len); | 487 | const u16 tailsize = tail - from; |
488 | |||
489 | memcpy(to, from, tailsize); | ||
490 | to += tailsize; | ||
491 | len -= tailsize; | ||
492 | copylen -= tailsize; | ||
493 | from = av->av_buf; | ||
494 | } | ||
495 | |||
496 | memcpy(to, from, copylen); | ||
497 | from += copylen; | ||
498 | to += copylen; | ||
499 | len -= copylen; | ||
500 | } | ||
501 | /* | ||
502 | * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. | ||
503 | */ | ||
504 | if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) | ||
505 | return -ENOBUFS; | ||
487 | return 0; | 506 | return 0; |
488 | } | 507 | } |
489 | 508 | ||
490 | static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) | 509 | /** |
510 | * dccp_insert_option_mandatory - Mandatory option (5.8.2) | ||
511 | * Note that since we are using skb_push, this function needs to be called | ||
512 | * _after_ inserting the option it is supposed to influence (stack order). | ||
513 | */ | ||
514 | int dccp_insert_option_mandatory(struct sk_buff *skb) | ||
491 | { | 515 | { |
492 | struct dccp_sock *dp = dccp_sk(sk); | 516 | if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) |
493 | struct dccp_minisock *dmsk = dccp_msk(sk); | 517 | return -1; |
494 | struct dccp_opt_pend *opt, *next; | ||
495 | int change = 0; | ||
496 | |||
497 | /* confirm any options [NN opts] */ | ||
498 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { | ||
499 | dccp_insert_feat_opt(skb, opt->dccpop_type, | ||
500 | opt->dccpop_feat, opt->dccpop_val, | ||
501 | opt->dccpop_len); | ||
502 | /* fear empty confirms */ | ||
503 | if (opt->dccpop_val) | ||
504 | kfree(opt->dccpop_val); | ||
505 | kfree(opt); | ||
506 | } | ||
507 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | ||
508 | |||
509 | /* see which features we need to send */ | ||
510 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | ||
511 | /* see if we need to send any confirm */ | ||
512 | if (opt->dccpop_sc) { | ||
513 | dccp_insert_feat_opt(skb, opt->dccpop_type + 1, | ||
514 | opt->dccpop_feat, | ||
515 | opt->dccpop_sc->dccpoc_val, | ||
516 | opt->dccpop_sc->dccpoc_len); | ||
517 | |||
518 | BUG_ON(!opt->dccpop_sc->dccpoc_val); | ||
519 | kfree(opt->dccpop_sc->dccpoc_val); | ||
520 | kfree(opt->dccpop_sc); | ||
521 | opt->dccpop_sc = NULL; | ||
522 | } | ||
523 | 518 | ||
524 | /* any option not confirmed, re-send it */ | 519 | DCCP_SKB_CB(skb)->dccpd_opt_len++; |
525 | if (!opt->dccpop_conf) { | 520 | *skb_push(skb, 1) = DCCPO_MANDATORY; |
526 | dccp_insert_feat_opt(skb, opt->dccpop_type, | 521 | return 0; |
527 | opt->dccpop_feat, opt->dccpop_val, | 522 | } |
528 | opt->dccpop_len); | 523 | |
529 | change++; | 524 | /** |
530 | } | 525 | * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb |
526 | * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R | ||
527 | * @feat: one out of %dccp_feature_numbers | ||
528 | * @val: NN value or SP array (preferred element first) to copy | ||
529 | * @len: true length of @val in bytes (excluding first element repetition) | ||
530 | * @repeat_first: whether to copy the first element of @val twice | ||
531 | * The last argument is used to construct Confirm options, where the preferred | ||
532 | * value and the preference list appear separately (RFC 4340, 6.3.1). Preference | ||
533 | * lists are kept such that the preferred entry is always first, so we only need | ||
534 | * to copy twice, and avoid the overhead of cloning into a bigger array. | ||
535 | */ | ||
536 | int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, | ||
537 | u8 *val, u8 len, bool repeat_first) | ||
538 | { | ||
539 | u8 tot_len, *to; | ||
540 | |||
541 | /* take the `Feature' field and possible repetition into account */ | ||
542 | if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { | ||
543 | DCCP_WARN("length %u for feature %u too large\n", len, feat); | ||
544 | return -1; | ||
531 | } | 545 | } |
532 | 546 | ||
533 | /* Retransmit timer. | 547 | if (unlikely(val == NULL || len == 0)) |
534 | * If this is the master listening sock, we don't set a timer on it. It | 548 | len = repeat_first = 0; |
535 | * should be fine because if the dude doesn't receive our RESPONSE | 549 | tot_len = 3 + repeat_first + len; |
536 | * [which will contain the CHANGE] he will send another REQUEST which | ||
537 | * will "retrnasmit" the change. | ||
538 | */ | ||
539 | if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { | ||
540 | dccp_pr_debug("reset feat negotiation timer %p\n", sk); | ||
541 | 550 | ||
542 | /* XXX don't reset the timer on re-transmissions. I.e. reset it | 551 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { |
543 | * only when sending new stuff i guess. Currently the timer | 552 | DCCP_WARN("packet too small for feature %d option!\n", feat); |
544 | * never backs off because on re-transmission it just resets it! | 553 | return -1; |
545 | */ | ||
546 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
547 | inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); | ||
548 | } | 554 | } |
555 | DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; | ||
556 | |||
557 | to = skb_push(skb, tot_len); | ||
558 | *to++ = type; | ||
559 | *to++ = tot_len; | ||
560 | *to++ = feat; | ||
549 | 561 | ||
562 | if (repeat_first) | ||
563 | *to++ = *val; | ||
564 | if (len) | ||
565 | memcpy(to, val, len); | ||
550 | return 0; | 566 | return 0; |
551 | } | 567 | } |
552 | 568 | ||
@@ -565,19 +581,30 @@ static void dccp_insert_option_padding(struct sk_buff *skb) | |||
565 | int dccp_insert_options(struct sock *sk, struct sk_buff *skb) | 581 | int dccp_insert_options(struct sock *sk, struct sk_buff *skb) |
566 | { | 582 | { |
567 | struct dccp_sock *dp = dccp_sk(sk); | 583 | struct dccp_sock *dp = dccp_sk(sk); |
568 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
569 | 584 | ||
570 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; | 585 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; |
571 | 586 | ||
572 | if (dmsk->dccpms_send_ndp_count && | 587 | if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) |
573 | dccp_insert_option_ndp(sk, skb)) | ||
574 | return -1; | 588 | return -1; |
575 | 589 | ||
576 | if (!dccp_packet_without_ack(skb)) { | 590 | if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { |
577 | if (dmsk->dccpms_send_ack_vector && | 591 | |
578 | dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && | 592 | /* Feature Negotiation */ |
579 | dccp_insert_option_ackvec(sk, skb)) | 593 | if (dccp_feat_insert_opts(dp, NULL, skb)) |
580 | return -1; | 594 | return -1; |
595 | |||
596 | if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { | ||
597 | /* | ||
598 | * Obtain RTT sample from Request/Response exchange. | ||
599 | * This is currently used in CCID 3 initialisation. | ||
600 | */ | ||
601 | if (dccp_insert_option_timestamp(sk, skb)) | ||
602 | return -1; | ||
603 | |||
604 | } else if (dccp_ackvec_pending(sk) && | ||
605 | dccp_insert_option_ackvec(sk, skb)) { | ||
606 | return -1; | ||
607 | } | ||
581 | } | 608 | } |
582 | 609 | ||
583 | if (dp->dccps_hc_rx_insert_options) { | 610 | if (dp->dccps_hc_rx_insert_options) { |
@@ -586,21 +613,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) | |||
586 | dp->dccps_hc_rx_insert_options = 0; | 613 | dp->dccps_hc_rx_insert_options = 0; |
587 | } | 614 | } |
588 | 615 | ||
589 | /* Feature negotiation */ | ||
590 | /* Data packets can't do feat negotiation */ | ||
591 | if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && | ||
592 | DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && | ||
593 | dccp_insert_options_feat(sk, skb)) | ||
594 | return -1; | ||
595 | |||
596 | /* | ||
597 | * Obtain RTT sample from Request/Response exchange. | ||
598 | * This is currently used in CCID 3 initialisation. | ||
599 | */ | ||
600 | if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && | ||
601 | dccp_insert_option_timestamp(sk, skb)) | ||
602 | return -1; | ||
603 | |||
604 | if (dp->dccps_timestamp_echo != 0 && | 616 | if (dp->dccps_timestamp_echo != 0 && |
605 | dccp_insert_option_timestamp_echo(dp, NULL, skb)) | 617 | dccp_insert_option_timestamp_echo(dp, NULL, skb)) |
606 | return -1; | 618 | return -1; |
@@ -613,6 +625,9 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) | |||
613 | { | 625 | { |
614 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; | 626 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; |
615 | 627 | ||
628 | if (dccp_feat_insert_opts(NULL, dreq, skb)) | ||
629 | return -1; | ||
630 | |||
616 | if (dreq->dreq_timestamp_echo != 0 && | 631 | if (dreq->dreq_timestamp_echo != 0 && |
617 | dccp_insert_option_timestamp_echo(NULL, dreq, skb)) | 632 | dccp_insert_option_timestamp_echo(NULL, dreq, skb)) |
618 | return -1; | 633 | return -1; |
diff --git a/net/dccp/output.c b/net/dccp/output.c index d06945c7d3df..2532797a8009 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c | |||
@@ -26,11 +26,13 @@ static inline void dccp_event_ack_sent(struct sock *sk) | |||
26 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); | 26 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); |
27 | } | 27 | } |
28 | 28 | ||
29 | static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) | 29 | /* enqueue @skb on sk_send_head for retransmission, return clone to send now */ |
30 | static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) | ||
30 | { | 31 | { |
31 | skb_set_owner_w(skb, sk); | 32 | skb_set_owner_w(skb, sk); |
32 | WARN_ON(sk->sk_send_head); | 33 | WARN_ON(sk->sk_send_head); |
33 | sk->sk_send_head = skb; | 34 | sk->sk_send_head = skb; |
35 | return skb_clone(sk->sk_send_head, gfp_any()); | ||
34 | } | 36 | } |
35 | 37 | ||
36 | /* | 38 | /* |
@@ -161,21 +163,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) | |||
161 | struct inet_connection_sock *icsk = inet_csk(sk); | 163 | struct inet_connection_sock *icsk = inet_csk(sk); |
162 | struct dccp_sock *dp = dccp_sk(sk); | 164 | struct dccp_sock *dp = dccp_sk(sk); |
163 | u32 ccmps = dccp_determine_ccmps(dp); | 165 | u32 ccmps = dccp_determine_ccmps(dp); |
164 | int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; | 166 | u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; |
165 | 167 | ||
166 | /* Account for header lengths and IPv4/v6 option overhead */ | 168 | /* Account for header lengths and IPv4/v6 option overhead */ |
167 | cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + | 169 | cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + |
168 | sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); | 170 | sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); |
169 | 171 | ||
170 | /* | 172 | /* |
171 | * FIXME: this should come from the CCID infrastructure, where, say, | 173 | * Leave enough headroom for common DCCP header options. |
172 | * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets | 174 | * This only considers options which may appear on DCCP-Data packets, as |
173 | * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED | 175 | * per table 3 in RFC 4340, 5.8. When running out of space for other |
174 | * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to | 176 | * options (eg. Ack Vector which can take up to 255 bytes), it is better |
175 | * make it a multiple of 4 | 177 | * to schedule a separate Ack. Thus we leave headroom for the following: |
178 | * - 1 byte for Slow Receiver (11.6) | ||
179 | * - 6 bytes for Timestamp (13.1) | ||
180 | * - 10 bytes for Timestamp Echo (13.3) | ||
181 | * - 8 bytes for NDP count (7.7, when activated) | ||
182 | * - 6 bytes for Data Checksum (9.3) | ||
183 | * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) | ||
176 | */ | 184 | */ |
177 | 185 | cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + | |
178 | cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; | 186 | (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); |
179 | 187 | ||
180 | /* And store cached results */ | 188 | /* And store cached results */ |
181 | icsk->icsk_pmtu_cookie = pmtu; | 189 | icsk->icsk_pmtu_cookie = pmtu; |
@@ -200,95 +208,158 @@ void dccp_write_space(struct sock *sk) | |||
200 | } | 208 | } |
201 | 209 | ||
202 | /** | 210 | /** |
203 | * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet | 211 | * dccp_wait_for_ccid - Await CCID send permission |
204 | * @sk: socket to wait for | 212 | * @sk: socket to wait for |
205 | * @skb: current skb to pass on for waiting | 213 | * @delay: timeout in jiffies |
206 | * @delay: sleep timeout in milliseconds (> 0) | 214 | * This is used by CCIDs which need to delay the send time in process context. |
207 | * This function is called by default when the socket is closed, and | ||
208 | * when a non-zero linger time is set on the socket. For consistency | ||
209 | */ | 215 | */ |
210 | static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) | 216 | static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) |
211 | { | 217 | { |
212 | struct dccp_sock *dp = dccp_sk(sk); | ||
213 | DEFINE_WAIT(wait); | 218 | DEFINE_WAIT(wait); |
214 | unsigned long jiffdelay; | 219 | long remaining; |
215 | int rc; | ||
216 | 220 | ||
217 | do { | 221 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); |
218 | dccp_pr_debug("delayed send by %d msec\n", delay); | 222 | sk->sk_write_pending++; |
219 | jiffdelay = msecs_to_jiffies(delay); | 223 | release_sock(sk); |
220 | 224 | ||
221 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); | 225 | remaining = schedule_timeout(delay); |
222 | 226 | ||
223 | sk->sk_write_pending++; | 227 | lock_sock(sk); |
224 | release_sock(sk); | 228 | sk->sk_write_pending--; |
225 | schedule_timeout(jiffdelay); | 229 | finish_wait(sk->sk_sleep, &wait); |
226 | lock_sock(sk); | ||
227 | sk->sk_write_pending--; | ||
228 | 230 | ||
229 | if (sk->sk_err) | 231 | if (signal_pending(current) || sk->sk_err) |
230 | goto do_error; | 232 | return -1; |
231 | if (signal_pending(current)) | 233 | return remaining; |
232 | goto do_interrupted; | 234 | } |
233 | 235 | ||
234 | rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); | 236 | /** |
235 | } while ((delay = rc) > 0); | 237 | * dccp_xmit_packet - Send data packet under control of CCID |
236 | out: | 238 | * Transmits next-queued payload and informs CCID to account for the packet. |
237 | finish_wait(sk->sk_sleep, &wait); | 239 | */ |
238 | return rc; | 240 | static void dccp_xmit_packet(struct sock *sk) |
239 | 241 | { | |
240 | do_error: | 242 | int err, len; |
241 | rc = -EPIPE; | 243 | struct dccp_sock *dp = dccp_sk(sk); |
242 | goto out; | 244 | struct sk_buff *skb = dccp_qpolicy_pop(sk); |
243 | do_interrupted: | 245 | |
244 | rc = -EINTR; | 246 | if (unlikely(skb == NULL)) |
245 | goto out; | 247 | return; |
248 | len = skb->len; | ||
249 | |||
250 | if (sk->sk_state == DCCP_PARTOPEN) { | ||
251 | const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; | ||
252 | /* | ||
253 | * See 8.1.5 - Handshake Completion. | ||
254 | * | ||
255 | * For robustness we resend Confirm options until the client has | ||
256 | * entered OPEN. During the initial feature negotiation, the MPS | ||
257 | * is smaller than usual, reduced by the Change/Confirm options. | ||
258 | */ | ||
259 | if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { | ||
260 | DCCP_WARN("Payload too large (%d) for featneg.\n", len); | ||
261 | dccp_send_ack(sk); | ||
262 | dccp_feat_list_purge(&dp->dccps_featneg); | ||
263 | } | ||
264 | |||
265 | inet_csk_schedule_ack(sk); | ||
266 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | ||
267 | inet_csk(sk)->icsk_rto, | ||
268 | DCCP_RTO_MAX); | ||
269 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; | ||
270 | } else if (dccp_ack_pending(sk)) { | ||
271 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; | ||
272 | } else { | ||
273 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; | ||
274 | } | ||
275 | |||
276 | err = dccp_transmit_skb(sk, skb); | ||
277 | if (err) | ||
278 | dccp_pr_debug("transmit_skb() returned err=%d\n", err); | ||
279 | /* | ||
280 | * Register this one as sent even if an error occurred. To the remote | ||
281 | * end a local packet drop is indistinguishable from network loss, i.e. | ||
282 | * any local drop will eventually be reported via receiver feedback. | ||
283 | */ | ||
284 | ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); | ||
285 | |||
286 | /* | ||
287 | * If the CCID needs to transfer additional header options out-of-band | ||
288 | * (e.g. Ack Vectors or feature-negotiation options), it activates this | ||
289 | * flag to schedule a Sync. The Sync will automatically incorporate all | ||
290 | * currently pending header options, thus clearing the backlog. | ||
291 | */ | ||
292 | if (dp->dccps_sync_scheduled) | ||
293 | dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); | ||
246 | } | 294 | } |
247 | 295 | ||
248 | void dccp_write_xmit(struct sock *sk, int block) | 296 | /** |
297 | * dccp_flush_write_queue - Drain queue at end of connection | ||
298 | * Since dccp_sendmsg queues packets without waiting for them to be sent, it may | ||
299 | * happen that the TX queue is not empty at the end of a connection. We give the | ||
300 | * HC-sender CCID a grace period of up to @time_budget jiffies. If this function | ||
301 | * returns with a non-empty write queue, it will be purged later. | ||
302 | */ | ||
303 | void dccp_flush_write_queue(struct sock *sk, long *time_budget) | ||
249 | { | 304 | { |
250 | struct dccp_sock *dp = dccp_sk(sk); | 305 | struct dccp_sock *dp = dccp_sk(sk); |
251 | struct sk_buff *skb; | 306 | struct sk_buff *skb; |
307 | long delay, rc; | ||
308 | |||
309 | while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { | ||
310 | rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); | ||
252 | 311 | ||
253 | while ((skb = skb_peek(&sk->sk_write_queue))) { | 312 | switch (ccid_packet_dequeue_eval(rc)) { |
254 | int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); | 313 | case CCID_PACKET_WILL_DEQUEUE_LATER: |
255 | 314 | /* | |
256 | if (err > 0) { | 315 | * If the CCID determines when to send, the next sending |
257 | if (!block) { | 316 | * time is unknown or the CCID may not even send again |
258 | sk_reset_timer(sk, &dp->dccps_xmit_timer, | 317 | * (e.g. remote host crashes or lost Ack packets). |
259 | msecs_to_jiffies(err)+jiffies); | 318 | */ |
260 | break; | 319 | DCCP_WARN("CCID did not manage to send all packets\n"); |
261 | } else | 320 | return; |
262 | err = dccp_wait_for_ccid(sk, skb, err); | 321 | case CCID_PACKET_DELAY: |
263 | if (err && err != -EINTR) | 322 | delay = msecs_to_jiffies(rc); |
264 | DCCP_BUG("err=%d after dccp_wait_for_ccid", err); | 323 | if (delay > *time_budget) |
324 | return; | ||
325 | rc = dccp_wait_for_ccid(sk, delay); | ||
326 | if (rc < 0) | ||
327 | return; | ||
328 | *time_budget -= (delay - rc); | ||
329 | /* check again if we can send now */ | ||
330 | break; | ||
331 | case CCID_PACKET_SEND_AT_ONCE: | ||
332 | dccp_xmit_packet(sk); | ||
333 | break; | ||
334 | case CCID_PACKET_ERR: | ||
335 | skb_dequeue(&sk->sk_write_queue); | ||
336 | kfree_skb(skb); | ||
337 | dccp_pr_debug("packet discarded due to err=%ld\n", rc); | ||
265 | } | 338 | } |
339 | } | ||
340 | } | ||
266 | 341 | ||
267 | skb_dequeue(&sk->sk_write_queue); | 342 | void dccp_write_xmit(struct sock *sk) |
268 | if (err == 0) { | 343 | { |
269 | struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); | 344 | struct dccp_sock *dp = dccp_sk(sk); |
270 | const int len = skb->len; | 345 | struct sk_buff *skb; |
271 | 346 | ||
272 | if (sk->sk_state == DCCP_PARTOPEN) { | 347 | while ((skb = dccp_qpolicy_top(sk))) { |
273 | /* See 8.1.5. Handshake Completion */ | 348 | int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); |
274 | inet_csk_schedule_ack(sk); | 349 | |
275 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | 350 | switch (ccid_packet_dequeue_eval(rc)) { |
276 | inet_csk(sk)->icsk_rto, | 351 | case CCID_PACKET_WILL_DEQUEUE_LATER: |
277 | DCCP_RTO_MAX); | 352 | return; |
278 | dcb->dccpd_type = DCCP_PKT_DATAACK; | 353 | case CCID_PACKET_DELAY: |
279 | } else if (dccp_ack_pending(sk)) | 354 | sk_reset_timer(sk, &dp->dccps_xmit_timer, |
280 | dcb->dccpd_type = DCCP_PKT_DATAACK; | 355 | jiffies + msecs_to_jiffies(rc)); |
281 | else | 356 | return; |
282 | dcb->dccpd_type = DCCP_PKT_DATA; | 357 | case CCID_PACKET_SEND_AT_ONCE: |
283 | 358 | dccp_xmit_packet(sk); | |
284 | err = dccp_transmit_skb(sk, skb); | 359 | break; |
285 | ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); | 360 | case CCID_PACKET_ERR: |
286 | if (err) | 361 | dccp_qpolicy_drop(sk, skb); |
287 | DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", | 362 | dccp_pr_debug("packet discarded due to err=%d\n", rc); |
288 | err); | ||
289 | } else { | ||
290 | dccp_pr_debug("packet discarded due to err=%d\n", err); | ||
291 | kfree_skb(skb); | ||
292 | } | 363 | } |
293 | } | 364 | } |
294 | } | 365 | } |
@@ -339,10 +410,12 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, | |||
339 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; | 410 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; |
340 | DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; | 411 | DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; |
341 | 412 | ||
342 | if (dccp_insert_options_rsk(dreq, skb)) { | 413 | /* Resolve feature dependencies resulting from choice of CCID */ |
343 | kfree_skb(skb); | 414 | if (dccp_feat_server_ccid_dependencies(dreq)) |
344 | return NULL; | 415 | goto response_failed; |
345 | } | 416 | |
417 | if (dccp_insert_options_rsk(dreq, skb)) | ||
418 | goto response_failed; | ||
346 | 419 | ||
347 | /* Build and checksum header */ | 420 | /* Build and checksum header */ |
348 | dh = dccp_zeroed_hdr(skb, dccp_header_size); | 421 | dh = dccp_zeroed_hdr(skb, dccp_header_size); |
@@ -363,6 +436,9 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, | |||
363 | inet_rsk(req)->acked = 1; | 436 | inet_rsk(req)->acked = 1; |
364 | DCCP_INC_STATS(DCCP_MIB_OUTSEGS); | 437 | DCCP_INC_STATS(DCCP_MIB_OUTSEGS); |
365 | return skb; | 438 | return skb; |
439 | response_failed: | ||
440 | kfree_skb(skb); | ||
441 | return NULL; | ||
366 | } | 442 | } |
367 | 443 | ||
368 | EXPORT_SYMBOL_GPL(dccp_make_response); | 444 | EXPORT_SYMBOL_GPL(dccp_make_response); |
@@ -447,8 +523,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) | |||
447 | /* | 523 | /* |
448 | * Do all connect socket setups that can be done AF independent. | 524 | * Do all connect socket setups that can be done AF independent. |
449 | */ | 525 | */ |
450 | static inline void dccp_connect_init(struct sock *sk) | 526 | int dccp_connect(struct sock *sk) |
451 | { | 527 | { |
528 | struct sk_buff *skb; | ||
452 | struct dccp_sock *dp = dccp_sk(sk); | 529 | struct dccp_sock *dp = dccp_sk(sk); |
453 | struct dst_entry *dst = __sk_dst_get(sk); | 530 | struct dst_entry *dst = __sk_dst_get(sk); |
454 | struct inet_connection_sock *icsk = inet_csk(sk); | 531 | struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -458,19 +535,13 @@ static inline void dccp_connect_init(struct sock *sk) | |||
458 | 535 | ||
459 | dccp_sync_mss(sk, dst_mtu(dst)); | 536 | dccp_sync_mss(sk, dst_mtu(dst)); |
460 | 537 | ||
538 | /* do not connect if feature negotiation setup fails */ | ||
539 | if (dccp_feat_finalise_settings(dccp_sk(sk))) | ||
540 | return -EPROTO; | ||
541 | |||
461 | /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ | 542 | /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ |
462 | dp->dccps_gar = dp->dccps_iss; | 543 | dp->dccps_gar = dp->dccps_iss; |
463 | 544 | ||
464 | icsk->icsk_retransmits = 0; | ||
465 | } | ||
466 | |||
467 | int dccp_connect(struct sock *sk) | ||
468 | { | ||
469 | struct sk_buff *skb; | ||
470 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
471 | |||
472 | dccp_connect_init(sk); | ||
473 | |||
474 | skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); | 545 | skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); |
475 | if (unlikely(skb == NULL)) | 546 | if (unlikely(skb == NULL)) |
476 | return -ENOBUFS; | 547 | return -ENOBUFS; |
@@ -480,11 +551,11 @@ int dccp_connect(struct sock *sk) | |||
480 | 551 | ||
481 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; | 552 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; |
482 | 553 | ||
483 | dccp_skb_entail(sk, skb); | 554 | dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); |
484 | dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); | ||
485 | DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); | 555 | DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); |
486 | 556 | ||
487 | /* Timer for repeating the REQUEST until an answer. */ | 557 | /* Timer for repeating the REQUEST until an answer. */ |
558 | icsk->icsk_retransmits = 0; | ||
488 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 559 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
489 | icsk->icsk_rto, DCCP_RTO_MAX); | 560 | icsk->icsk_rto, DCCP_RTO_MAX); |
490 | return 0; | 561 | return 0; |
@@ -571,6 +642,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, | |||
571 | DCCP_SKB_CB(skb)->dccpd_type = pkt_type; | 642 | DCCP_SKB_CB(skb)->dccpd_type = pkt_type; |
572 | DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; | 643 | DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; |
573 | 644 | ||
645 | /* | ||
646 | * Clear the flag in case the Sync was scheduled for out-of-band data, | ||
647 | * such as carrying a long Ack Vector. | ||
648 | */ | ||
649 | dccp_sk(sk)->dccps_sync_scheduled = 0; | ||
650 | |||
574 | dccp_transmit_skb(sk, skb); | 651 | dccp_transmit_skb(sk, skb); |
575 | } | 652 | } |
576 | 653 | ||
@@ -599,9 +676,7 @@ void dccp_send_close(struct sock *sk, const int active) | |||
599 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; | 676 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; |
600 | 677 | ||
601 | if (active) { | 678 | if (active) { |
602 | dccp_write_xmit(sk, 1); | 679 | skb = dccp_skb_entail(sk, skb); |
603 | dccp_skb_entail(sk, skb); | ||
604 | dccp_transmit_skb(sk, skb_clone(skb, prio)); | ||
605 | /* | 680 | /* |
606 | * Retransmission timer for active-close: RFC 4340, 8.3 requires | 681 | * Retransmission timer for active-close: RFC 4340, 8.3 requires |
607 | * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ | 682 | * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ |
@@ -614,6 +689,6 @@ void dccp_send_close(struct sock *sk, const int active) | |||
614 | */ | 689 | */ |
615 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 690 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
616 | DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); | 691 | DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); |
617 | } else | 692 | } |
618 | dccp_transmit_skb(sk, skb); | 693 | dccp_transmit_skb(sk, skb); |
619 | } | 694 | } |
diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 81368a7f5379..eaa59d82ab0f 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c | |||
@@ -46,75 +46,54 @@ static struct { | |||
46 | struct kfifo *fifo; | 46 | struct kfifo *fifo; |
47 | spinlock_t lock; | 47 | spinlock_t lock; |
48 | wait_queue_head_t wait; | 48 | wait_queue_head_t wait; |
49 | struct timespec tstart; | 49 | ktime_t start; |
50 | } dccpw; | 50 | } dccpw; |
51 | 51 | ||
52 | static void printl(const char *fmt, ...) | 52 | static void jdccp_write_xmit(struct sock *sk) |
53 | { | 53 | { |
54 | va_list args; | ||
55 | int len; | ||
56 | struct timespec now; | ||
57 | char tbuf[256]; | ||
58 | |||
59 | va_start(args, fmt); | ||
60 | getnstimeofday(&now); | ||
61 | |||
62 | now = timespec_sub(now, dccpw.tstart); | ||
63 | |||
64 | len = sprintf(tbuf, "%lu.%06lu ", | ||
65 | (unsigned long) now.tv_sec, | ||
66 | (unsigned long) now.tv_nsec / NSEC_PER_USEC); | ||
67 | len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); | ||
68 | va_end(args); | ||
69 | |||
70 | kfifo_put(dccpw.fifo, tbuf, len); | ||
71 | wake_up(&dccpw.wait); | ||
72 | } | ||
73 | |||
74 | static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, | ||
75 | struct msghdr *msg, size_t size) | ||
76 | { | ||
77 | const struct dccp_minisock *dmsk = dccp_msk(sk); | ||
78 | const struct inet_sock *inet = inet_sk(sk); | 54 | const struct inet_sock *inet = inet_sk(sk); |
79 | const struct ccid3_hc_tx_sock *hctx; | 55 | struct ccid3_hc_tx_sock *hctx = NULL; |
56 | struct timespec tv; | ||
57 | char buf[256]; | ||
58 | int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); | ||
80 | 59 | ||
81 | if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) | 60 | if (ccid == DCCPC_CCID3) |
82 | hctx = ccid3_hc_tx_sk(sk); | 61 | hctx = ccid3_hc_tx_sk(sk); |
83 | else | ||
84 | hctx = NULL; | ||
85 | 62 | ||
86 | if (port == 0 || ntohs(inet->dport) == port || | 63 | if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { |
87 | ntohs(inet->sport) == port) { | 64 | |
88 | if (hctx) | 65 | tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); |
89 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " | 66 | len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", |
90 | "%llu %llu %d\n", | 67 | (unsigned long)tv.tv_sec, |
91 | NIPQUAD(inet->saddr), ntohs(inet->sport), | 68 | (unsigned long)tv.tv_nsec, |
92 | NIPQUAD(inet->daddr), ntohs(inet->dport), size, | ||
93 | hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, | ||
94 | hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, | ||
95 | hctx->ccid3hctx_x_recv >> 6, | ||
96 | hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); | ||
97 | else | ||
98 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", | ||
99 | NIPQUAD(inet->saddr), ntohs(inet->sport), | 69 | NIPQUAD(inet->saddr), ntohs(inet->sport), |
100 | NIPQUAD(inet->daddr), ntohs(inet->dport), size); | 70 | NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); |
71 | |||
72 | if (hctx) | ||
73 | len += sprintf(buf + len, " %d %d %d %u %u %u %d", | ||
74 | hctx->s, hctx->rtt, hctx->p, hctx->x_calc, | ||
75 | (unsigned)(hctx->x_recv >> 6), | ||
76 | (unsigned)(hctx->x >> 6), hctx->t_ipi); | ||
77 | |||
78 | len += sprintf(buf + len, "\n"); | ||
79 | kfifo_put(dccpw.fifo, buf, len); | ||
80 | wake_up(&dccpw.wait); | ||
101 | } | 81 | } |
102 | 82 | ||
103 | jprobe_return(); | 83 | jprobe_return(); |
104 | return 0; | ||
105 | } | 84 | } |
106 | 85 | ||
107 | static struct jprobe dccp_send_probe = { | 86 | static struct jprobe dccp_send_probe = { |
108 | .kp = { | 87 | .kp = { |
109 | .symbol_name = "dccp_sendmsg", | 88 | .symbol_name = "dccp_write_xmit", |
110 | }, | 89 | }, |
111 | .entry = jdccp_sendmsg, | 90 | .entry = jdccp_write_xmit, |
112 | }; | 91 | }; |
113 | 92 | ||
114 | static int dccpprobe_open(struct inode *inode, struct file *file) | 93 | static int dccpprobe_open(struct inode *inode, struct file *file) |
115 | { | 94 | { |
116 | kfifo_reset(dccpw.fifo); | 95 | kfifo_reset(dccpw.fifo); |
117 | getnstimeofday(&dccpw.tstart); | 96 | dccpw.start = ktime_get(); |
118 | return 0; | 97 | return 0; |
119 | } | 98 | } |
120 | 99 | ||
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d0bd34819761..ecf3be961e11 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -67,6 +67,9 @@ void dccp_set_state(struct sock *sk, const int state) | |||
67 | case DCCP_OPEN: | 67 | case DCCP_OPEN: |
68 | if (oldstate != DCCP_OPEN) | 68 | if (oldstate != DCCP_OPEN) |
69 | DCCP_INC_STATS(DCCP_MIB_CURRESTAB); | 69 | DCCP_INC_STATS(DCCP_MIB_CURRESTAB); |
70 | /* Client retransmits all Confirm options until entering OPEN */ | ||
71 | if (oldstate == DCCP_PARTOPEN) | ||
72 | dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); | ||
70 | break; | 73 | break; |
71 | 74 | ||
72 | case DCCP_CLOSED: | 75 | case DCCP_CLOSED: |
@@ -175,63 +178,25 @@ EXPORT_SYMBOL_GPL(dccp_state_name); | |||
175 | int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) | 178 | int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) |
176 | { | 179 | { |
177 | struct dccp_sock *dp = dccp_sk(sk); | 180 | struct dccp_sock *dp = dccp_sk(sk); |
178 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
179 | struct inet_connection_sock *icsk = inet_csk(sk); | 181 | struct inet_connection_sock *icsk = inet_csk(sk); |
180 | 182 | ||
181 | dccp_minisock_init(&dp->dccps_minisock); | ||
182 | |||
183 | icsk->icsk_rto = DCCP_TIMEOUT_INIT; | 183 | icsk->icsk_rto = DCCP_TIMEOUT_INIT; |
184 | icsk->icsk_syn_retries = sysctl_dccp_request_retries; | 184 | icsk->icsk_syn_retries = sysctl_dccp_request_retries; |
185 | sk->sk_state = DCCP_CLOSED; | 185 | sk->sk_state = DCCP_CLOSED; |
186 | sk->sk_write_space = dccp_write_space; | 186 | sk->sk_write_space = dccp_write_space; |
187 | icsk->icsk_sync_mss = dccp_sync_mss; | 187 | icsk->icsk_sync_mss = dccp_sync_mss; |
188 | dp->dccps_mss_cache = 536; | 188 | dp->dccps_mss_cache = TCP_MIN_RCVMSS; |
189 | dp->dccps_rate_last = jiffies; | 189 | dp->dccps_rate_last = jiffies; |
190 | dp->dccps_role = DCCP_ROLE_UNDEFINED; | 190 | dp->dccps_role = DCCP_ROLE_UNDEFINED; |
191 | dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; | 191 | dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; |
192 | dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; | 192 | dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; |
193 | 193 | ||
194 | dccp_init_xmit_timers(sk); | 194 | dccp_init_xmit_timers(sk); |
195 | 195 | ||
196 | /* | 196 | INIT_LIST_HEAD(&dp->dccps_featneg); |
197 | * FIXME: We're hardcoding the CCID, and doing this at this point makes | 197 | /* control socket doesn't need feat nego */ |
198 | * the listening (master) sock get CCID control blocks, which is not | 198 | if (likely(ctl_sock_initialized)) |
199 | * necessary, but for now, to not mess with the test userspace apps, | 199 | return dccp_feat_init(sk); |
200 | * lets leave it here, later the real solution is to do this in a | ||
201 | * setsockopt(CCIDs-I-want/accept). -acme | ||
202 | */ | ||
203 | if (likely(ctl_sock_initialized)) { | ||
204 | int rc = dccp_feat_init(dmsk); | ||
205 | |||
206 | if (rc) | ||
207 | return rc; | ||
208 | |||
209 | if (dmsk->dccpms_send_ack_vector) { | ||
210 | dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); | ||
211 | if (dp->dccps_hc_rx_ackvec == NULL) | ||
212 | return -ENOMEM; | ||
213 | } | ||
214 | dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, | ||
215 | sk, GFP_KERNEL); | ||
216 | dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, | ||
217 | sk, GFP_KERNEL); | ||
218 | if (unlikely(dp->dccps_hc_rx_ccid == NULL || | ||
219 | dp->dccps_hc_tx_ccid == NULL)) { | ||
220 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | ||
221 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | ||
222 | if (dmsk->dccpms_send_ack_vector) { | ||
223 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | ||
224 | dp->dccps_hc_rx_ackvec = NULL; | ||
225 | } | ||
226 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | ||
227 | return -ENOMEM; | ||
228 | } | ||
229 | } else { | ||
230 | /* control socket doesn't need feat nego */ | ||
231 | INIT_LIST_HEAD(&dmsk->dccpms_pending); | ||
232 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | ||
233 | } | ||
234 | |||
235 | return 0; | 200 | return 0; |
236 | } | 201 | } |
237 | 202 | ||
@@ -240,7 +205,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); | |||
240 | void dccp_destroy_sock(struct sock *sk) | 205 | void dccp_destroy_sock(struct sock *sk) |
241 | { | 206 | { |
242 | struct dccp_sock *dp = dccp_sk(sk); | 207 | struct dccp_sock *dp = dccp_sk(sk); |
243 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
244 | 208 | ||
245 | /* | 209 | /* |
246 | * DCCP doesn't use sk_write_queue, just sk_send_head | 210 | * DCCP doesn't use sk_write_queue, just sk_send_head |
@@ -258,7 +222,7 @@ void dccp_destroy_sock(struct sock *sk) | |||
258 | kfree(dp->dccps_service_list); | 222 | kfree(dp->dccps_service_list); |
259 | dp->dccps_service_list = NULL; | 223 | dp->dccps_service_list = NULL; |
260 | 224 | ||
261 | if (dmsk->dccpms_send_ack_vector) { | 225 | if (dp->dccps_hc_rx_ackvec != NULL) { |
262 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | 226 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); |
263 | dp->dccps_hc_rx_ackvec = NULL; | 227 | dp->dccps_hc_rx_ackvec = NULL; |
264 | } | 228 | } |
@@ -267,7 +231,7 @@ void dccp_destroy_sock(struct sock *sk) | |||
267 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | 231 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; |
268 | 232 | ||
269 | /* clean up feature negotiation state */ | 233 | /* clean up feature negotiation state */ |
270 | dccp_feat_clean(dmsk); | 234 | dccp_feat_list_purge(&dp->dccps_featneg); |
271 | } | 235 | } |
272 | 236 | ||
273 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); | 237 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); |
@@ -277,6 +241,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) | |||
277 | struct dccp_sock *dp = dccp_sk(sk); | 241 | struct dccp_sock *dp = dccp_sk(sk); |
278 | 242 | ||
279 | dp->dccps_role = DCCP_ROLE_LISTEN; | 243 | dp->dccps_role = DCCP_ROLE_LISTEN; |
244 | /* do not start to listen if feature negotiation setup fails */ | ||
245 | if (dccp_feat_finalise_settings(dp)) | ||
246 | return -EPROTO; | ||
280 | return inet_csk_listen_start(sk, backlog); | 247 | return inet_csk_listen_start(sk, backlog); |
281 | } | 248 | } |
282 | 249 | ||
@@ -466,42 +433,70 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, | |||
466 | return 0; | 433 | return 0; |
467 | } | 434 | } |
468 | 435 | ||
469 | /* byte 1 is feature. the rest is the preference list */ | 436 | static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) |
470 | static int dccp_setsockopt_change(struct sock *sk, int type, | ||
471 | struct dccp_so_feat __user *optval) | ||
472 | { | 437 | { |
473 | struct dccp_so_feat opt; | 438 | u8 *list, len; |
474 | u8 *val; | 439 | int i, rc; |
475 | int rc; | ||
476 | 440 | ||
477 | if (copy_from_user(&opt, optval, sizeof(opt))) | 441 | if (cscov < 0 || cscov > 15) |
478 | return -EFAULT; | 442 | return -EINVAL; |
479 | /* | 443 | /* |
480 | * rfc4340: 6.1. Change Options | 444 | * Populate a list of permissible values, in the range cscov...15. This |
445 | * is necessary since feature negotiation of single values only works if | ||
446 | * both sides incidentally choose the same value. Since the list starts | ||
447 | * lowest-value first, negotiation will pick the smallest shared value. | ||
481 | */ | 448 | */ |
482 | if (opt.dccpsf_len < 1) | 449 | if (cscov == 0) |
450 | return 0; | ||
451 | len = 16 - cscov; | ||
452 | |||
453 | list = kmalloc(len, GFP_KERNEL); | ||
454 | if (list == NULL) | ||
455 | return -ENOBUFS; | ||
456 | |||
457 | for (i = 0; i < len; i++) | ||
458 | list[i] = cscov++; | ||
459 | |||
460 | rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); | ||
461 | |||
462 | if (rc == 0) { | ||
463 | if (rx) | ||
464 | dccp_sk(sk)->dccps_pcrlen = cscov; | ||
465 | else | ||
466 | dccp_sk(sk)->dccps_pcslen = cscov; | ||
467 | } | ||
468 | kfree(list); | ||
469 | return rc; | ||
470 | } | ||
471 | |||
472 | static int dccp_setsockopt_ccid(struct sock *sk, int type, | ||
473 | char __user *optval, int optlen) | ||
474 | { | ||
475 | u8 *val; | ||
476 | int rc = 0; | ||
477 | |||
478 | if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) | ||
483 | return -EINVAL; | 479 | return -EINVAL; |
484 | 480 | ||
485 | val = kmalloc(opt.dccpsf_len, GFP_KERNEL); | 481 | val = kmalloc(optlen, GFP_KERNEL); |
486 | if (!val) | 482 | if (val == NULL) |
487 | return -ENOMEM; | 483 | return -ENOMEM; |
488 | 484 | ||
489 | if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { | 485 | if (copy_from_user(val, optval, optlen)) { |
490 | rc = -EFAULT; | 486 | kfree(val); |
491 | goto out_free_val; | 487 | return -EFAULT; |
492 | } | 488 | } |
493 | 489 | ||
494 | rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, | 490 | lock_sock(sk); |
495 | val, opt.dccpsf_len, GFP_KERNEL); | 491 | if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) |
496 | if (rc) | 492 | rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); |
497 | goto out_free_val; | ||
498 | 493 | ||
499 | out: | 494 | if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) |
500 | return rc; | 495 | rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); |
496 | release_sock(sk); | ||
501 | 497 | ||
502 | out_free_val: | ||
503 | kfree(val); | 498 | kfree(val); |
504 | goto out; | 499 | return rc; |
505 | } | 500 | } |
506 | 501 | ||
507 | static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | 502 | static int do_dccp_setsockopt(struct sock *sk, int level, int optname, |
@@ -510,7 +505,21 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | |||
510 | struct dccp_sock *dp = dccp_sk(sk); | 505 | struct dccp_sock *dp = dccp_sk(sk); |
511 | int val, err = 0; | 506 | int val, err = 0; |
512 | 507 | ||
513 | if (optlen < sizeof(int)) | 508 | switch (optname) { |
509 | case DCCP_SOCKOPT_PACKET_SIZE: | ||
510 | DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); | ||
511 | return 0; | ||
512 | case DCCP_SOCKOPT_CHANGE_L: | ||
513 | case DCCP_SOCKOPT_CHANGE_R: | ||
514 | DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); | ||
515 | return 0; | ||
516 | case DCCP_SOCKOPT_CCID: | ||
517 | case DCCP_SOCKOPT_RX_CCID: | ||
518 | case DCCP_SOCKOPT_TX_CCID: | ||
519 | return dccp_setsockopt_ccid(sk, optname, optval, optlen); | ||
520 | } | ||
521 | |||
522 | if (optlen < (int)sizeof(int)) | ||
514 | return -EINVAL; | 523 | return -EINVAL; |
515 | 524 | ||
516 | if (get_user(val, (int __user *)optval)) | 525 | if (get_user(val, (int __user *)optval)) |
@@ -521,53 +530,38 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | |||
521 | 530 | ||
522 | lock_sock(sk); | 531 | lock_sock(sk); |
523 | switch (optname) { | 532 | switch (optname) { |
524 | case DCCP_SOCKOPT_PACKET_SIZE: | ||
525 | DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); | ||
526 | err = 0; | ||
527 | break; | ||
528 | case DCCP_SOCKOPT_CHANGE_L: | ||
529 | if (optlen != sizeof(struct dccp_so_feat)) | ||
530 | err = -EINVAL; | ||
531 | else | ||
532 | err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, | ||
533 | (struct dccp_so_feat __user *) | ||
534 | optval); | ||
535 | break; | ||
536 | case DCCP_SOCKOPT_CHANGE_R: | ||
537 | if (optlen != sizeof(struct dccp_so_feat)) | ||
538 | err = -EINVAL; | ||
539 | else | ||
540 | err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, | ||
541 | (struct dccp_so_feat __user *) | ||
542 | optval); | ||
543 | break; | ||
544 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: | 533 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: |
545 | if (dp->dccps_role != DCCP_ROLE_SERVER) | 534 | if (dp->dccps_role != DCCP_ROLE_SERVER) |
546 | err = -EOPNOTSUPP; | 535 | err = -EOPNOTSUPP; |
547 | else | 536 | else |
548 | dp->dccps_server_timewait = (val != 0); | 537 | dp->dccps_server_timewait = (val != 0); |
549 | break; | 538 | break; |
550 | case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ | 539 | case DCCP_SOCKOPT_SEND_CSCOV: |
551 | if (val < 0 || val > 15) | 540 | err = dccp_setsockopt_cscov(sk, val, false); |
541 | break; | ||
542 | case DCCP_SOCKOPT_RECV_CSCOV: | ||
543 | err = dccp_setsockopt_cscov(sk, val, true); | ||
544 | break; | ||
545 | case DCCP_SOCKOPT_QPOLICY_ID: | ||
546 | if (sk->sk_state != DCCP_CLOSED) | ||
547 | err = -EISCONN; | ||
548 | else if (val < 0 || val >= DCCPQ_POLICY_MAX) | ||
552 | err = -EINVAL; | 549 | err = -EINVAL; |
553 | else | 550 | else |
554 | dp->dccps_pcslen = val; | 551 | dp->dccps_qpolicy = val; |
555 | break; | 552 | break; |
556 | case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ | 553 | case DCCP_SOCKOPT_QPOLICY_TXQLEN: |
557 | if (val < 0 || val > 15) | 554 | if (val < 0) |
558 | err = -EINVAL; | 555 | err = -EINVAL; |
559 | else { | 556 | else |
560 | dp->dccps_pcrlen = val; | 557 | dp->dccps_tx_qlen = val; |
561 | /* FIXME: add feature negotiation, | ||
562 | * ChangeL(MinimumChecksumCoverage, val) */ | ||
563 | } | ||
564 | break; | 558 | break; |
565 | default: | 559 | default: |
566 | err = -ENOPROTOOPT; | 560 | err = -ENOPROTOOPT; |
567 | break; | 561 | break; |
568 | } | 562 | } |
569 | |||
570 | release_sock(sk); | 563 | release_sock(sk); |
564 | |||
571 | return err; | 565 | return err; |
572 | } | 566 | } |
573 | 567 | ||
@@ -648,6 +642,18 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
648 | case DCCP_SOCKOPT_GET_CUR_MPS: | 642 | case DCCP_SOCKOPT_GET_CUR_MPS: |
649 | val = dp->dccps_mss_cache; | 643 | val = dp->dccps_mss_cache; |
650 | break; | 644 | break; |
645 | case DCCP_SOCKOPT_AVAILABLE_CCIDS: | ||
646 | return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); | ||
647 | case DCCP_SOCKOPT_TX_CCID: | ||
648 | val = ccid_get_current_tx_ccid(dp); | ||
649 | if (val < 0) | ||
650 | return -ENOPROTOOPT; | ||
651 | break; | ||
652 | case DCCP_SOCKOPT_RX_CCID: | ||
653 | val = ccid_get_current_rx_ccid(dp); | ||
654 | if (val < 0) | ||
655 | return -ENOPROTOOPT; | ||
656 | break; | ||
651 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: | 657 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: |
652 | val = dp->dccps_server_timewait; | 658 | val = dp->dccps_server_timewait; |
653 | break; | 659 | break; |
@@ -657,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
657 | case DCCP_SOCKOPT_RECV_CSCOV: | 663 | case DCCP_SOCKOPT_RECV_CSCOV: |
658 | val = dp->dccps_pcrlen; | 664 | val = dp->dccps_pcrlen; |
659 | break; | 665 | break; |
666 | case DCCP_SOCKOPT_QPOLICY_ID: | ||
667 | val = dp->dccps_qpolicy; | ||
668 | break; | ||
669 | case DCCP_SOCKOPT_QPOLICY_TXQLEN: | ||
670 | val = dp->dccps_tx_qlen; | ||
671 | break; | ||
660 | case 128 ... 191: | 672 | case 128 ... 191: |
661 | return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, | 673 | return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, |
662 | len, (u32 __user *)optval, optlen); | 674 | len, (u32 __user *)optval, optlen); |
@@ -699,6 +711,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
699 | EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); | 711 | EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); |
700 | #endif | 712 | #endif |
701 | 713 | ||
714 | static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) | ||
715 | { | ||
716 | struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); | ||
717 | |||
718 | /* | ||
719 | * Assign an (opaque) qpolicy priority value to skb->priority. | ||
720 | * | ||
 721 | * We are overloading this skb field for use with the qpolicy subsystem. | ||
722 | * The skb->priority is normally used for the SO_PRIORITY option, which | ||
723 | * is initialised from sk_priority. Since the assignment of sk_priority | ||
724 | * to skb->priority happens later (on layer 3), we overload this field | ||
725 | * for use with queueing priorities as long as the skb is on layer 4. | ||
726 | * The default priority value (if nothing is set) is 0. | ||
727 | */ | ||
728 | skb->priority = 0; | ||
729 | |||
730 | for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { | ||
731 | |||
732 | if (!CMSG_OK(msg, cmsg)) | ||
733 | return -EINVAL; | ||
734 | |||
735 | if (cmsg->cmsg_level != SOL_DCCP) | ||
736 | continue; | ||
737 | |||
738 | if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && | ||
739 | !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) | ||
740 | return -EINVAL; | ||
741 | |||
742 | switch (cmsg->cmsg_type) { | ||
743 | case DCCP_SCM_PRIORITY: | ||
744 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) | ||
745 | return -EINVAL; | ||
746 | skb->priority = *(__u32 *)CMSG_DATA(cmsg); | ||
747 | break; | ||
748 | default: | ||
749 | return -EINVAL; | ||
750 | } | ||
751 | } | ||
752 | return 0; | ||
753 | } | ||
754 | |||
702 | int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 755 | int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
703 | size_t len) | 756 | size_t len) |
704 | { | 757 | { |
@@ -714,8 +767,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
714 | 767 | ||
715 | lock_sock(sk); | 768 | lock_sock(sk); |
716 | 769 | ||
717 | if (sysctl_dccp_tx_qlen && | 770 | if (dccp_qpolicy_full(sk)) { |
718 | (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { | ||
719 | rc = -EAGAIN; | 771 | rc = -EAGAIN; |
720 | goto out_release; | 772 | goto out_release; |
721 | } | 773 | } |
@@ -743,8 +795,12 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
743 | if (rc != 0) | 795 | if (rc != 0) |
744 | goto out_discard; | 796 | goto out_discard; |
745 | 797 | ||
746 | skb_queue_tail(&sk->sk_write_queue, skb); | 798 | rc = dccp_msghdr_parse(msg, skb); |
747 | dccp_write_xmit(sk,0); | 799 | if (rc != 0) |
800 | goto out_discard; | ||
801 | |||
802 | dccp_qpolicy_push(sk, skb); | ||
803 | dccp_write_xmit(sk); | ||
748 | out_release: | 804 | out_release: |
749 | release_sock(sk); | 805 | release_sock(sk); |
750 | return rc ? : len; | 806 | return rc ? : len; |
@@ -967,9 +1023,22 @@ void dccp_close(struct sock *sk, long timeout) | |||
967 | /* Check zero linger _after_ checking for unread data. */ | 1023 | /* Check zero linger _after_ checking for unread data. */ |
968 | sk->sk_prot->disconnect(sk, 0); | 1024 | sk->sk_prot->disconnect(sk, 0); |
969 | } else if (sk->sk_state != DCCP_CLOSED) { | 1025 | } else if (sk->sk_state != DCCP_CLOSED) { |
1026 | /* | ||
1027 | * Normal connection termination. May need to wait if there are | ||
1028 | * still packets in the TX queue that are delayed by the CCID. | ||
1029 | */ | ||
1030 | dccp_flush_write_queue(sk, &timeout); | ||
970 | dccp_terminate_connection(sk); | 1031 | dccp_terminate_connection(sk); |
971 | } | 1032 | } |
972 | 1033 | ||
1034 | /* | ||
1035 | * Flush write queue. This may be necessary in several cases: | ||
1036 | * - we have been closed by the peer but still have application data; | ||
1037 | * - abortive termination (unread data or zero linger time), | ||
1038 | * - normal termination but queue could not be flushed within time limit | ||
1039 | */ | ||
1040 | __skb_queue_purge(&sk->sk_write_queue); | ||
1041 | |||
973 | sk_stream_wait_close(sk, timeout); | 1042 | sk_stream_wait_close(sk, timeout); |
974 | 1043 | ||
975 | adjudge_to_death: | 1044 | adjudge_to_death: |
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 000000000000..27383f88c75f --- /dev/null +++ b/net/dccp/qpolicy.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * net/dccp/qpolicy.c | ||
3 | * | ||
4 | * Policy-based packet dequeueing interface for DCCP. | ||
5 | * | ||
6 | * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License v2 | ||
10 | * as published by the Free Software Foundation. | ||
11 | */ | ||
12 | #include "dccp.h" | ||
13 | |||
14 | /* | ||
15 | * Simple Dequeueing Policy: | ||
16 | * If tx_qlen is different from 0, enqueue up to tx_qlen elements. | ||
17 | */ | ||
18 | static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) | ||
19 | { | ||
20 | skb_queue_tail(&sk->sk_write_queue, skb); | ||
21 | } | ||
22 | |||
23 | static bool qpolicy_simple_full(struct sock *sk) | ||
24 | { | ||
25 | return dccp_sk(sk)->dccps_tx_qlen && | ||
26 | sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; | ||
27 | } | ||
28 | |||
29 | static struct sk_buff *qpolicy_simple_top(struct sock *sk) | ||
30 | { | ||
31 | return skb_peek(&sk->sk_write_queue); | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * Priority-based Dequeueing Policy: | ||
36 | * If tx_qlen is different from 0 and the queue has reached its upper bound | ||
37 | * of tx_qlen elements, replace older packets lowest-priority-first. | ||
38 | */ | ||
39 | static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) | ||
40 | { | ||
41 | struct sk_buff *skb, *best = NULL; | ||
42 | |||
43 | skb_queue_walk(&sk->sk_write_queue, skb) | ||
44 | if (best == NULL || skb->priority > best->priority) | ||
45 | best = skb; | ||
46 | return best; | ||
47 | } | ||
48 | |||
49 | static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) | ||
50 | { | ||
51 | struct sk_buff *skb, *worst = NULL; | ||
52 | |||
53 | skb_queue_walk(&sk->sk_write_queue, skb) | ||
54 | if (worst == NULL || skb->priority < worst->priority) | ||
55 | worst = skb; | ||
56 | return worst; | ||
57 | } | ||
58 | |||
59 | static bool qpolicy_prio_full(struct sock *sk) | ||
60 | { | ||
61 | if (qpolicy_simple_full(sk)) | ||
62 | dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); | ||
63 | return false; | ||
64 | } | ||
65 | |||
66 | /** | ||
67 | * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface | ||
68 | * @push: add a new @skb to the write queue | ||
69 | * @full: indicates that no more packets will be admitted | ||
70 | * @top: peeks at whatever the queueing policy defines as its `top' | ||
71 | */ | ||
72 | static struct dccp_qpolicy_operations { | ||
73 | void (*push) (struct sock *sk, struct sk_buff *skb); | ||
74 | bool (*full) (struct sock *sk); | ||
75 | struct sk_buff* (*top) (struct sock *sk); | ||
76 | __be32 params; | ||
77 | |||
78 | } qpol_table[DCCPQ_POLICY_MAX] = { | ||
79 | [DCCPQ_POLICY_SIMPLE] = { | ||
80 | .push = qpolicy_simple_push, | ||
81 | .full = qpolicy_simple_full, | ||
82 | .top = qpolicy_simple_top, | ||
83 | .params = 0, | ||
84 | }, | ||
85 | [DCCPQ_POLICY_PRIO] = { | ||
86 | .push = qpolicy_simple_push, | ||
87 | .full = qpolicy_prio_full, | ||
88 | .top = qpolicy_prio_best_skb, | ||
89 | .params = DCCP_SCM_PRIORITY, | ||
90 | }, | ||
91 | }; | ||
92 | |||
93 | /* | ||
94 | * Externally visible interface | ||
95 | */ | ||
96 | void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) | ||
97 | { | ||
98 | qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); | ||
99 | } | ||
100 | |||
101 | bool dccp_qpolicy_full(struct sock *sk) | ||
102 | { | ||
103 | return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); | ||
104 | } | ||
105 | |||
106 | void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) | ||
107 | { | ||
108 | if (skb != NULL) { | ||
109 | skb_unlink(skb, &sk->sk_write_queue); | ||
110 | kfree_skb(skb); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | struct sk_buff *dccp_qpolicy_top(struct sock *sk) | ||
115 | { | ||
116 | return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); | ||
117 | } | ||
118 | |||
119 | struct sk_buff *dccp_qpolicy_pop(struct sock *sk) | ||
120 | { | ||
121 | struct sk_buff *skb = dccp_qpolicy_top(sk); | ||
122 | |||
123 | /* Clear any skb fields that we used internally */ | ||
124 | skb->priority = 0; | ||
125 | |||
126 | if (skb) | ||
127 | skb_unlink(skb, &sk->sk_write_queue); | ||
128 | return skb; | ||
129 | } | ||
130 | |||
131 | bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) | ||
132 | { | ||
133 | /* check if exactly one bit is set */ | ||
134 | if (!param || (param & (param - 1))) | ||
135 | return false; | ||
136 | return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; | ||
137 | } | ||
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb8..a5a1856234e7 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c | |||
@@ -18,76 +18,72 @@ | |||
18 | #error This file should not be compiled without CONFIG_SYSCTL defined | 18 | #error This file should not be compiled without CONFIG_SYSCTL defined |
19 | #endif | 19 | #endif |
20 | 20 | ||
21 | /* Boundary values */ | ||
22 | static int zero = 0, | ||
23 | u8_max = 0xFF; | ||
24 | static unsigned long seqw_min = 32; | ||
25 | |||
21 | static struct ctl_table dccp_default_table[] = { | 26 | static struct ctl_table dccp_default_table[] = { |
22 | { | 27 | { |
23 | .procname = "seq_window", | 28 | .procname = "seq_window", |
24 | .data = &sysctl_dccp_feat_sequence_window, | 29 | .data = &sysctl_dccp_sequence_window, |
25 | .maxlen = sizeof(sysctl_dccp_feat_sequence_window), | 30 | .maxlen = sizeof(sysctl_dccp_sequence_window), |
26 | .mode = 0644, | 31 | .mode = 0644, |
27 | .proc_handler = proc_dointvec, | 32 | .proc_handler = proc_doulongvec_minmax, |
33 | .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ | ||
28 | }, | 34 | }, |
29 | { | 35 | { |
30 | .procname = "rx_ccid", | 36 | .procname = "rx_ccid", |
31 | .data = &sysctl_dccp_feat_rx_ccid, | 37 | .data = &sysctl_dccp_rx_ccid, |
32 | .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), | 38 | .maxlen = sizeof(sysctl_dccp_rx_ccid), |
33 | .mode = 0644, | 39 | .mode = 0644, |
34 | .proc_handler = proc_dointvec, | 40 | .proc_handler = proc_dointvec_minmax, |
41 | .extra1 = &zero, | ||
42 | .extra2 = &u8_max, /* RFC 4340, 10. */ | ||
35 | }, | 43 | }, |
36 | { | 44 | { |
37 | .procname = "tx_ccid", | 45 | .procname = "tx_ccid", |
38 | .data = &sysctl_dccp_feat_tx_ccid, | 46 | .data = &sysctl_dccp_tx_ccid, |
39 | .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), | 47 | .maxlen = sizeof(sysctl_dccp_tx_ccid), |
40 | .mode = 0644, | ||
41 | .proc_handler = proc_dointvec, | ||
42 | }, | ||
43 | { | ||
44 | .procname = "ack_ratio", | ||
45 | .data = &sysctl_dccp_feat_ack_ratio, | ||
46 | .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), | ||
47 | .mode = 0644, | ||
48 | .proc_handler = proc_dointvec, | ||
49 | }, | ||
50 | { | ||
51 | .procname = "send_ackvec", | ||
52 | .data = &sysctl_dccp_feat_send_ack_vector, | ||
53 | .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), | ||
54 | .mode = 0644, | ||
55 | .proc_handler = proc_dointvec, | ||
56 | }, | ||
57 | { | ||
58 | .procname = "send_ndp", | ||
59 | .data = &sysctl_dccp_feat_send_ndp_count, | ||
60 | .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), | ||
61 | .mode = 0644, | 48 | .mode = 0644, |
62 | .proc_handler = proc_dointvec, | 49 | .proc_handler = proc_dointvec_minmax, |
50 | .extra1 = &zero, | ||
51 | .extra2 = &u8_max, /* RFC 4340, 10. */ | ||
63 | }, | 52 | }, |
64 | { | 53 | { |
65 | .procname = "request_retries", | 54 | .procname = "request_retries", |
66 | .data = &sysctl_dccp_request_retries, | 55 | .data = &sysctl_dccp_request_retries, |
67 | .maxlen = sizeof(sysctl_dccp_request_retries), | 56 | .maxlen = sizeof(sysctl_dccp_request_retries), |
68 | .mode = 0644, | 57 | .mode = 0644, |
69 | .proc_handler = proc_dointvec, | 58 | .proc_handler = proc_dointvec_minmax, |
59 | .extra1 = &zero, | ||
60 | .extra2 = &u8_max, | ||
70 | }, | 61 | }, |
71 | { | 62 | { |
72 | .procname = "retries1", | 63 | .procname = "retries1", |
73 | .data = &sysctl_dccp_retries1, | 64 | .data = &sysctl_dccp_retries1, |
74 | .maxlen = sizeof(sysctl_dccp_retries1), | 65 | .maxlen = sizeof(sysctl_dccp_retries1), |
75 | .mode = 0644, | 66 | .mode = 0644, |
76 | .proc_handler = proc_dointvec, | 67 | .proc_handler = proc_dointvec_minmax, |
68 | .extra1 = &zero, | ||
69 | .extra2 = &u8_max, | ||
77 | }, | 70 | }, |
78 | { | 71 | { |
79 | .procname = "retries2", | 72 | .procname = "retries2", |
80 | .data = &sysctl_dccp_retries2, | 73 | .data = &sysctl_dccp_retries2, |
81 | .maxlen = sizeof(sysctl_dccp_retries2), | 74 | .maxlen = sizeof(sysctl_dccp_retries2), |
82 | .mode = 0644, | 75 | .mode = 0644, |
83 | .proc_handler = proc_dointvec, | 76 | .proc_handler = proc_dointvec_minmax, |
77 | .extra1 = &zero, | ||
78 | .extra2 = &u8_max, | ||
84 | }, | 79 | }, |
85 | { | 80 | { |
86 | .procname = "tx_qlen", | 81 | .procname = "tx_qlen", |
87 | .data = &sysctl_dccp_tx_qlen, | 82 | .data = &sysctl_dccp_tx_qlen, |
88 | .maxlen = sizeof(sysctl_dccp_tx_qlen), | 83 | .maxlen = sizeof(sysctl_dccp_tx_qlen), |
89 | .mode = 0644, | 84 | .mode = 0644, |
90 | .proc_handler = proc_dointvec, | 85 | .proc_handler = proc_dointvec_minmax, |
86 | .extra1 = &zero, | ||
91 | }, | 87 | }, |
92 | { | 88 | { |
93 | .procname = "sync_ratelimit", | 89 | .procname = "sync_ratelimit", |
diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 54b3c7e9e016..16359e29e7f5 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c | |||
@@ -87,17 +87,6 @@ static void dccp_retransmit_timer(struct sock *sk) | |||
87 | { | 87 | { |
88 | struct inet_connection_sock *icsk = inet_csk(sk); | 88 | struct inet_connection_sock *icsk = inet_csk(sk); |
89 | 89 | ||
90 | /* retransmit timer is used for feature negotiation throughout | ||
91 | * connection. In this case, no packet is re-transmitted, but rather an | ||
92 | * ack is generated and pending changes are placed into its options. | ||
93 | */ | ||
94 | if (sk->sk_send_head == NULL) { | ||
95 | dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); | ||
96 | if (sk->sk_state == DCCP_OPEN) | ||
97 | dccp_send_ack(sk); | ||
98 | goto backoff; | ||
99 | } | ||
100 | |||
101 | /* | 90 | /* |
102 | * More than 4MSL (8 minutes) has passed, a RESET(aborted) was | 91 | * More than 4MSL (8 minutes) has passed, a RESET(aborted) was |
103 | * sent, no need to retransmit, this sock is dead. | 92 | * sent, no need to retransmit, this sock is dead. |
@@ -126,7 +115,6 @@ static void dccp_retransmit_timer(struct sock *sk) | |||
126 | return; | 115 | return; |
127 | } | 116 | } |
128 | 117 | ||
129 | backoff: | ||
130 | icsk->icsk_backoff++; | 118 | icsk->icsk_backoff++; |
131 | 119 | ||
132 | icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); | 120 | icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); |
@@ -249,32 +237,35 @@ out: | |||
249 | sock_put(sk); | 237 | sock_put(sk); |
250 | } | 238 | } |
251 | 239 | ||
252 | /* Transmit-delay timer: used by the CCIDs to delay actual send time */ | 240 | /** |
253 | static void dccp_write_xmit_timer(unsigned long data) | 241 | * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface |
242 | * See the comments above %ccid_dequeueing_decision for supported modes. | ||
243 | */ | ||
244 | static void dccp_write_xmitlet(unsigned long data) | ||
254 | { | 245 | { |
255 | struct sock *sk = (struct sock *)data; | 246 | struct sock *sk = (struct sock *)data; |
256 | struct dccp_sock *dp = dccp_sk(sk); | ||
257 | 247 | ||
258 | bh_lock_sock(sk); | 248 | bh_lock_sock(sk); |
259 | if (sock_owned_by_user(sk)) | 249 | if (sock_owned_by_user(sk)) |
260 | sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); | 250 | sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); |
261 | else | 251 | else |
262 | dccp_write_xmit(sk, 0); | 252 | dccp_write_xmit(sk); |
263 | bh_unlock_sock(sk); | 253 | bh_unlock_sock(sk); |
264 | sock_put(sk); | ||
265 | } | 254 | } |
266 | 255 | ||
267 | static void dccp_init_write_xmit_timer(struct sock *sk) | 256 | static void dccp_write_xmit_timer(unsigned long data) |
268 | { | 257 | { |
269 | struct dccp_sock *dp = dccp_sk(sk); | 258 | dccp_write_xmitlet(data); |
270 | 259 | sock_put((struct sock *)data); | |
271 | setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, | ||
272 | (unsigned long)sk); | ||
273 | } | 260 | } |
274 | 261 | ||
275 | void dccp_init_xmit_timers(struct sock *sk) | 262 | void dccp_init_xmit_timers(struct sock *sk) |
276 | { | 263 | { |
277 | dccp_init_write_xmit_timer(sk); | 264 | struct dccp_sock *dp = dccp_sk(sk); |
265 | |||
266 | tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); | ||
267 | setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, | ||
268 | (unsigned long)sk); | ||
278 | inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, | 269 | inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, |
279 | &dccp_keepalive_timer); | 270 | &dccp_keepalive_timer); |
280 | } | 271 | } |
@@ -290,8 +281,7 @@ u32 dccp_timestamp(void) | |||
290 | { | 281 | { |
291 | s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); | 282 | s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); |
292 | 283 | ||
293 | do_div(delta, 10); | 284 | return div_u64(delta, DCCP_TIME_RESOLUTION); |
294 | return delta; | ||
295 | } | 285 | } |
296 | EXPORT_SYMBOL_GPL(dccp_timestamp); | 286 | EXPORT_SYMBOL_GPL(dccp_timestamp); |
297 | 287 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f79a51607292..9da9f19ece8a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk) | |||
811 | } | 811 | } |
812 | } | 812 | } |
813 | 813 | ||
814 | /* Numbers are taken from RFC3390. | ||
815 | * | ||
816 | * John Heffner states: | ||
817 | * | ||
818 | * The RFC specifies a window of no more than 4380 bytes | ||
819 | * unless 2*MSS > 4380. Reading the pseudocode in the RFC | ||
820 | * is a bit misleading because they use a clamp at 4380 bytes | ||
821 | * rather than use a multiplier in the relevant range. | ||
822 | */ | ||
823 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | 814 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) |
824 | { | 815 | { |
825 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 816 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
826 | 817 | ||
827 | if (!cwnd) { | 818 | if (!cwnd) |
828 | if (tp->mss_cache > 1460) | 819 | cwnd = rfc3390_bytes_to_packets(tp->mss_cache); |
829 | cwnd = 2; | ||
830 | else | ||
831 | cwnd = (tp->mss_cache > 1095) ? 3 : 4; | ||
832 | } | ||
833 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 820 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
834 | } | 821 | } |
835 | 822 | ||