diff options
36 files changed, 2884 insertions, 3971 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index fcfc12534428..39131a3c78f8 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt | |||
@@ -45,25 +45,6 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree | |||
45 | 45 | ||
46 | Socket options | 46 | Socket options |
47 | ============== | 47 | ============== |
48 | DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes | ||
49 | a policy ID as argument and can only be set before the connection (i.e. changes | ||
50 | during an established connection are not supported). Currently, two policies are | ||
51 | defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, | ||
52 | and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows passing a | ||
53 | u32 priority value as ancillary data to sendmsg(), where higher numbers indicate | ||
54 | a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to | ||
55 | be formatted using a cmsg(3) message header filled in as follows: | ||
56 | cmsg->cmsg_level = SOL_DCCP; | ||
57 | cmsg->cmsg_type = DCCP_SCM_PRIORITY; | ||
58 | cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ | ||
59 | |||
60 | DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero | ||
61 | value is always interpreted as unbounded queue length. If different from zero, | ||
62 | the interpretation of this parameter depends on the current dequeuing policy | ||
63 | (see above): the "simple" policy will enforce a fixed queue size by returning | ||
64 | EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the | ||
65 | lowest-priority packet first. The default value for this parameter is | ||
66 | initialised from /proc/sys/net/dccp/default/tx_qlen. | ||
67 | 48 | ||
68 | DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of | 49 | DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of |
69 | service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, | 50 | service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, |
@@ -76,24 +57,6 @@ can be set before calling bind(). | |||
76 | DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet | 57 | DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet |
77 | size (application payload size) in bytes, see RFC 4340, section 14. | 58 | size (application payload size) in bytes, see RFC 4340, section 14. |
78 | 59 | ||
79 | DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs | ||
80 | supported by the endpoint (see include/linux/dccp.h for symbolic constants). | ||
81 | The caller needs to provide a sufficiently large (> 2) array of type uint8_t. | ||
82 | |||
83 | DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same | ||
84 | time, combining the operation of the next two socket options. This option is | ||
85 | preferable over the latter two, since often applications will use the same | ||
86 | type of CCID for both directions; and mixed use of CCIDs is not currently well | ||
87 | understood. This socket option takes as argument at least one uint8_t value, or | ||
88 | an array of uint8_t values, which must match available CCIDS (see above). CCIDs | ||
89 | must be registered on the socket before calling connect() or listen(). | ||
90 | |||
91 | DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets | ||
92 | the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. | ||
93 | Please note that the getsockopt argument type here is `int', not uint8_t. | ||
94 | |||
95 | DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. | ||
96 | |||
97 | DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold | 60 | DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold |
98 | timewait state when closing the connection (RFC 4340, 8.3). The usual case is | 61 | timewait state when closing the connection (RFC 4340, 8.3). The usual case is |
99 | that the closing server sends a CloseReq, whereupon the client holds timewait | 62 | that the closing server sends a CloseReq, whereupon the client holds timewait |
@@ -152,16 +115,23 @@ retries2 | |||
152 | importance for retransmitted acknowledgments and feature negotiation, | 115 | importance for retransmitted acknowledgments and feature negotiation, |
153 | data packets are never retransmitted. Analogue of tcp_retries2. | 116 | data packets are never retransmitted. Analogue of tcp_retries2. |
154 | 117 | ||
118 | send_ndp = 1 | ||
119 | Whether or not to send NDP count options (sec. 7.7.2). | ||
120 | |||
121 | send_ackvec = 1 | ||
122 | Whether or not to send Ack Vector options (sec. 11.5). | ||
123 | |||
124 | ack_ratio = 2 | ||
125 | The default Ack Ratio (sec. 11.3) to use. | ||
126 | |||
155 | tx_ccid = 2 | 127 | tx_ccid = 2 |
156 | Default CCID for the sender-receiver half-connection. Depending on the | 128 | Default CCID for the sender-receiver half-connection. |
157 | choice of CCID, the Send Ack Vector feature is enabled automatically. | ||
158 | 129 | ||
159 | rx_ccid = 2 | 130 | rx_ccid = 2 |
160 | Default CCID for the receiver-sender half-connection; see tx_ccid. | 131 | Default CCID for the receiver-sender half-connection. |
161 | 132 | ||
162 | seq_window = 100 | 133 | seq_window = 100 |
163 | The initial sequence window (sec. 7.5.2) of the sender. This influences | 134 | The initial sequence window (sec. 7.5.2). |
164 | the local ackno validity and the remote seqno validity windows (7.5.1). | ||
165 | 135 | ||
166 | tx_qlen = 5 | 136 | tx_qlen = 5 |
167 | The size of the transmit buffer in packets. A value of 0 corresponds | 137 | The size of the transmit buffer in packets. A value of 0 corresponds |
diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 010e2d87ed75..6080449fbec9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h | |||
@@ -165,13 +165,9 @@ enum { | |||
165 | DCCPO_TIMESTAMP_ECHO = 42, | 165 | DCCPO_TIMESTAMP_ECHO = 42, |
166 | DCCPO_ELAPSED_TIME = 43, | 166 | DCCPO_ELAPSED_TIME = 43, |
167 | DCCPO_MAX = 45, | 167 | DCCPO_MAX = 45, |
168 | DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ | 168 | DCCPO_MIN_CCID_SPECIFIC = 128, |
169 | DCCPO_MAX_RX_CCID_SPECIFIC = 191, | 169 | DCCPO_MAX_CCID_SPECIFIC = 255, |
170 | DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ | ||
171 | DCCPO_MAX_TX_CCID_SPECIFIC = 255, | ||
172 | }; | 170 | }; |
173 | /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ | ||
174 | #define DCCP_SINGLE_OPT_MAXLEN 253 | ||
175 | 171 | ||
176 | /* DCCP CCIDS */ | 172 | /* DCCP CCIDS */ |
177 | enum { | 173 | enum { |
@@ -180,36 +176,27 @@ enum { | |||
180 | }; | 176 | }; |
181 | 177 | ||
182 | /* DCCP features (RFC 4340 section 6.4) */ | 178 | /* DCCP features (RFC 4340 section 6.4) */ |
183 | enum dccp_feature_numbers { | 179 | enum { |
184 | DCCPF_RESERVED = 0, | 180 | DCCPF_RESERVED = 0, |
185 | DCCPF_CCID = 1, | 181 | DCCPF_CCID = 1, |
186 | DCCPF_SHORT_SEQNOS = 2, | 182 | DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ |
187 | DCCPF_SEQUENCE_WINDOW = 3, | 183 | DCCPF_SEQUENCE_WINDOW = 3, |
188 | DCCPF_ECN_INCAPABLE = 4, | 184 | DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ |
189 | DCCPF_ACK_RATIO = 5, | 185 | DCCPF_ACK_RATIO = 5, |
190 | DCCPF_SEND_ACK_VECTOR = 6, | 186 | DCCPF_SEND_ACK_VECTOR = 6, |
191 | DCCPF_SEND_NDP_COUNT = 7, | 187 | DCCPF_SEND_NDP_COUNT = 7, |
192 | DCCPF_MIN_CSUM_COVER = 8, | 188 | DCCPF_MIN_CSUM_COVER = 8, |
193 | DCCPF_DATA_CHECKSUM = 9, | 189 | DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ |
194 | /* 10-127 reserved */ | 190 | /* 10-127 reserved */ |
195 | DCCPF_MIN_CCID_SPECIFIC = 128, | 191 | DCCPF_MIN_CCID_SPECIFIC = 128, |
196 | DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ | ||
197 | DCCPF_MAX_CCID_SPECIFIC = 255, | 192 | DCCPF_MAX_CCID_SPECIFIC = 255, |
198 | }; | 193 | }; |
199 | 194 | ||
200 | /* DCCP socket control message types for cmsg */ | 195 | /* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ |
201 | enum dccp_cmsg_type { | 196 | struct dccp_so_feat { |
202 | DCCP_SCM_PRIORITY = 1, | 197 | __u8 dccpsf_feat; |
203 | DCCP_SCM_QPOLICY_MAX = 0xFFFF, | 198 | __u8 __user *dccpsf_val; |
204 | /* ^-- Up to here reserved exclusively for qpolicy parameters */ | 199 | __u8 dccpsf_len; |
205 | DCCP_SCM_MAX | ||
206 | }; | ||
207 | |||
208 | /* DCCP priorities for outgoing/queued packets */ | ||
209 | enum dccp_packet_dequeueing_policy { | ||
210 | DCCPQ_POLICY_SIMPLE, | ||
211 | DCCPQ_POLICY_PRIO, | ||
212 | DCCPQ_POLICY_MAX | ||
213 | }; | 200 | }; |
214 | 201 | ||
215 | /* DCCP socket options */ | 202 | /* DCCP socket options */ |
@@ -221,12 +208,6 @@ enum dccp_packet_dequeueing_policy { | |||
221 | #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 | 208 | #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 |
222 | #define DCCP_SOCKOPT_SEND_CSCOV 10 | 209 | #define DCCP_SOCKOPT_SEND_CSCOV 10 |
223 | #define DCCP_SOCKOPT_RECV_CSCOV 11 | 210 | #define DCCP_SOCKOPT_RECV_CSCOV 11 |
224 | #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 | ||
225 | #define DCCP_SOCKOPT_CCID 13 | ||
226 | #define DCCP_SOCKOPT_TX_CCID 14 | ||
227 | #define DCCP_SOCKOPT_RX_CCID 15 | ||
228 | #define DCCP_SOCKOPT_QPOLICY_ID 16 | ||
229 | #define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 | ||
230 | #define DCCP_SOCKOPT_CCID_RX_INFO 128 | 211 | #define DCCP_SOCKOPT_CCID_RX_INFO 128 |
231 | #define DCCP_SOCKOPT_CCID_TX_INFO 192 | 212 | #define DCCP_SOCKOPT_CCID_TX_INFO 192 |
232 | 213 | ||
@@ -374,13 +355,62 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) | |||
374 | return __dccp_hdr_len(dccp_hdr(skb)); | 355 | return __dccp_hdr_len(dccp_hdr(skb)); |
375 | } | 356 | } |
376 | 357 | ||
358 | |||
359 | /* initial values for each feature */ | ||
360 | #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 | ||
361 | #define DCCPF_INITIAL_ACK_RATIO 2 | ||
362 | #define DCCPF_INITIAL_CCID DCCPC_CCID2 | ||
363 | #define DCCPF_INITIAL_SEND_ACK_VECTOR 1 | ||
364 | /* FIXME: for now we default to 1, but it should really be 0 */ | ||
365 | #define DCCPF_INITIAL_SEND_NDP_COUNT 1 | ||
366 | |||
367 | /** | ||
368 | * struct dccp_minisock - Minimal DCCP connection representation | ||
369 | * | ||
370 | * Will be used to pass the state from dccp_request_sock to dccp_sock. | ||
371 | * | ||
372 | * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) | ||
373 | * @dccpms_ccid - Congestion Control Id (CCID) (section 10) | ||
374 | * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) | ||
375 | * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) | ||
376 | * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) | ||
377 | * @dccpms_pending - List of features being negotiated | ||
378 | * @dccpms_conf - | ||
379 | */ | ||
380 | struct dccp_minisock { | ||
381 | __u64 dccpms_sequence_window; | ||
382 | __u8 dccpms_rx_ccid; | ||
383 | __u8 dccpms_tx_ccid; | ||
384 | __u8 dccpms_send_ack_vector; | ||
385 | __u8 dccpms_send_ndp_count; | ||
386 | __u8 dccpms_ack_ratio; | ||
387 | struct list_head dccpms_pending; | ||
388 | struct list_head dccpms_conf; | ||
389 | }; | ||
390 | |||
391 | struct dccp_opt_conf { | ||
392 | __u8 *dccpoc_val; | ||
393 | __u8 dccpoc_len; | ||
394 | }; | ||
395 | |||
396 | struct dccp_opt_pend { | ||
397 | struct list_head dccpop_node; | ||
398 | __u8 dccpop_type; | ||
399 | __u8 dccpop_feat; | ||
400 | __u8 *dccpop_val; | ||
401 | __u8 dccpop_len; | ||
402 | int dccpop_conf; | ||
403 | struct dccp_opt_conf *dccpop_sc; | ||
404 | }; | ||
405 | |||
406 | extern void dccp_minisock_init(struct dccp_minisock *dmsk); | ||
407 | |||
377 | /** | 408 | /** |
378 | * struct dccp_request_sock - represent DCCP-specific connection request | 409 | * struct dccp_request_sock - represent DCCP-specific connection request |
379 | * @dreq_inet_rsk: structure inherited from | 410 | * @dreq_inet_rsk: structure inherited from |
380 | * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) | 411 | * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) |
381 | * @dreq_isr: initial sequence number received on the Request | 412 | * @dreq_isr: initial sequence number received on the Request |
382 | * @dreq_service: service code present on the Request (there is just one) | 413 | * @dreq_service: service code present on the Request (there is just one) |
383 | * @dreq_featneg: feature negotiation options for this connection | ||
384 | * The following two fields are analogous to the ones in dccp_sock: | 414 | * The following two fields are analogous to the ones in dccp_sock: |
385 | * @dreq_timestamp_echo: last received timestamp to echo (13.1) | 415 | * @dreq_timestamp_echo: last received timestamp to echo (13.1) |
386 | * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo | 416 | * @dreq_timestamp_time: the time of receiving the last @dreq_timestamp_echo |
@@ -390,7 +420,6 @@ struct dccp_request_sock { | |||
390 | __u64 dreq_iss; | 420 | __u64 dreq_iss; |
391 | __u64 dreq_isr; | 421 | __u64 dreq_isr; |
392 | __be32 dreq_service; | 422 | __be32 dreq_service; |
393 | struct list_head dreq_featneg; | ||
394 | __u32 dreq_timestamp_echo; | 423 | __u32 dreq_timestamp_echo; |
395 | __u32 dreq_timestamp_time; | 424 | __u32 dreq_timestamp_time; |
396 | }; | 425 | }; |
@@ -462,28 +491,21 @@ struct dccp_ackvec; | |||
462 | * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo | 491 | * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo |
463 | * @dccps_l_ack_ratio - feature-local Ack Ratio | 492 | * @dccps_l_ack_ratio - feature-local Ack Ratio |
464 | * @dccps_r_ack_ratio - feature-remote Ack Ratio | 493 | * @dccps_r_ack_ratio - feature-remote Ack Ratio |
465 | * @dccps_l_seq_win - local Sequence Window (influences ack number validity) | ||
466 | * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) | ||
467 | * @dccps_pcslen - sender partial checksum coverage (via sockopt) | 494 | * @dccps_pcslen - sender partial checksum coverage (via sockopt) |
468 | * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) | 495 | * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) |
469 | * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) | ||
470 | * @dccps_ndp_count - number of Non Data Packets since last data packet | 496 | * @dccps_ndp_count - number of Non Data Packets since last data packet |
471 | * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) | 497 | * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) |
472 | * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) | 498 | * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) |
473 | * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) | 499 | * @dccps_minisock - associated minisock (accessed via dccp_msk) |
474 | * @dccps_hc_rx_ackvec - rx half connection ack vector | 500 | * @dccps_hc_rx_ackvec - rx half connection ack vector |
475 | * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) | 501 | * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) |
476 | * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) | 502 | * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) |
477 | * @dccps_options_received - parsed set of retrieved options | 503 | * @dccps_options_received - parsed set of retrieved options |
478 | * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy | ||
479 | * @dccps_tx_qlen - maximum length of the TX queue | ||
480 | * @dccps_role - role of this sock, one of %dccp_role | 504 | * @dccps_role - role of this sock, one of %dccp_role |
481 | * @dccps_hc_rx_insert_options - receiver wants to add options when acking | 505 | * @dccps_hc_rx_insert_options - receiver wants to add options when acking |
482 | * @dccps_hc_tx_insert_options - sender wants to add options when sending | 506 | * @dccps_hc_tx_insert_options - sender wants to add options when sending |
483 | * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) | 507 | * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) |
484 | * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" | 508 | * @dccps_xmit_timer - timer for when CCID is not ready to send |
485 | * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets | ||
486 | * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) | ||
487 | * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) | 509 | * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) |
488 | */ | 510 | */ |
489 | struct dccp_sock { | 511 | struct dccp_sock { |
@@ -507,26 +529,19 @@ struct dccp_sock { | |||
507 | __u32 dccps_timestamp_time; | 529 | __u32 dccps_timestamp_time; |
508 | __u16 dccps_l_ack_ratio; | 530 | __u16 dccps_l_ack_ratio; |
509 | __u16 dccps_r_ack_ratio; | 531 | __u16 dccps_r_ack_ratio; |
510 | __u64 dccps_l_seq_win:48; | 532 | __u16 dccps_pcslen; |
511 | __u64 dccps_r_seq_win:48; | 533 | __u16 dccps_pcrlen; |
512 | __u8 dccps_pcslen:4; | ||
513 | __u8 dccps_pcrlen:4; | ||
514 | __u8 dccps_send_ndp_count:1; | ||
515 | __u64 dccps_ndp_count:48; | 534 | __u64 dccps_ndp_count:48; |
516 | unsigned long dccps_rate_last; | 535 | unsigned long dccps_rate_last; |
517 | struct list_head dccps_featneg; | 536 | struct dccp_minisock dccps_minisock; |
518 | struct dccp_ackvec *dccps_hc_rx_ackvec; | 537 | struct dccp_ackvec *dccps_hc_rx_ackvec; |
519 | struct ccid *dccps_hc_rx_ccid; | 538 | struct ccid *dccps_hc_rx_ccid; |
520 | struct ccid *dccps_hc_tx_ccid; | 539 | struct ccid *dccps_hc_tx_ccid; |
521 | struct dccp_options_received dccps_options_received; | 540 | struct dccp_options_received dccps_options_received; |
522 | __u8 dccps_qpolicy; | ||
523 | __u32 dccps_tx_qlen; | ||
524 | enum dccp_role dccps_role:2; | 541 | enum dccp_role dccps_role:2; |
525 | __u8 dccps_hc_rx_insert_options:1; | 542 | __u8 dccps_hc_rx_insert_options:1; |
526 | __u8 dccps_hc_tx_insert_options:1; | 543 | __u8 dccps_hc_tx_insert_options:1; |
527 | __u8 dccps_server_timewait:1; | 544 | __u8 dccps_server_timewait:1; |
528 | __u8 dccps_sync_scheduled:1; | ||
529 | struct tasklet_struct dccps_xmitlet; | ||
530 | struct timer_list dccps_xmit_timer; | 545 | struct timer_list dccps_xmit_timer; |
531 | }; | 546 | }; |
532 | 547 | ||
@@ -535,6 +550,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) | |||
535 | return (struct dccp_sock *)sk; | 550 | return (struct dccp_sock *)sk; |
536 | } | 551 | } |
537 | 552 | ||
553 | static inline struct dccp_minisock *dccp_msk(const struct sock *sk) | ||
554 | { | ||
555 | return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; | ||
556 | } | ||
557 | |||
538 | static inline const char *dccp_role(const struct sock *sk) | 558 | static inline const char *dccp_role(const struct sock *sk) |
539 | { | 559 | { |
540 | switch (dccp_sk(sk)->dccps_role) { | 560 | switch (dccp_sk(sk)->dccps_role) { |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 6bc4b8148ca0..8983386356a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -782,21 +782,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) | |||
782 | /* Use define here intentionally to get WARN_ON location shown at the caller */ | 782 | /* Use define here intentionally to get WARN_ON location shown at the caller */ |
783 | #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) | 783 | #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) |
784 | 784 | ||
785 | /* | ||
786 | * Convert RFC3390 larger initial windows into an equivalent number of packets. | ||
787 | * | ||
788 | * John Heffner states: | ||
789 | * | ||
790 | * The RFC specifies a window of no more than 4380 bytes | ||
791 | * unless 2*MSS > 4380. Reading the pseudocode in the RFC | ||
792 | * is a bit misleading because they use a clamp at 4380 bytes | ||
793 | * rather than a multiplier in the relevant range. | ||
794 | */ | ||
795 | static inline u32 rfc3390_bytes_to_packets(const u32 bytes) | ||
796 | { | ||
797 | return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); | ||
798 | } | ||
799 | |||
800 | extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); | 785 | extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); |
801 | extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); | 786 | extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); |
802 | 787 | ||
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 206c16ad9c3c..7aa2a7acc7ec 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig | |||
@@ -25,6 +25,9 @@ config INET_DCCP_DIAG | |||
25 | def_tristate y if (IP_DCCP = y && INET_DIAG = y) | 25 | def_tristate y if (IP_DCCP = y && INET_DIAG = y) |
26 | def_tristate m | 26 | def_tristate m |
27 | 27 | ||
28 | config IP_DCCP_ACKVEC | ||
29 | bool | ||
30 | |||
28 | source "net/dccp/ccids/Kconfig" | 31 | source "net/dccp/ccids/Kconfig" |
29 | 32 | ||
30 | menu "DCCP Kernel Hacking" | 33 | menu "DCCP Kernel Hacking" |
diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 0c1c9af2bf7e..f4f8793aafff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile | |||
@@ -1,7 +1,6 @@ | |||
1 | obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o | 1 | obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o |
2 | 2 | ||
3 | dccp-y := ccid.o feat.o input.o minisocks.o options.o \ | 3 | dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o |
4 | qpolicy.o output.o proto.o timer.o ackvec.o | ||
5 | 4 | ||
6 | dccp_ipv4-y := ipv4.o | 5 | dccp_ipv4-y := ipv4.o |
7 | 6 | ||
@@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o | |||
9 | obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o | 8 | obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o |
10 | dccp_ipv6-y := ipv6.o | 9 | dccp_ipv6-y := ipv6.o |
11 | 10 | ||
11 | dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o | ||
12 | |||
12 | obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o | 13 | obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o |
13 | obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o | 14 | obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o |
14 | 15 | ||
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 41819848bdda..1e8be246ad15 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c | |||
@@ -1,375 +1,445 @@ | |||
1 | /* | 1 | /* |
2 | * net/dccp/ackvec.c | 2 | * net/dccp/ackvec.c |
3 | * | 3 | * |
4 | * An implementation of Ack Vectors for the DCCP protocol | 4 | * An implementation of the DCCP protocol |
5 | * Copyright (c) 2007 University of Aberdeen, Scotland, UK | ||
6 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> | 5 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> |
7 | * | 6 | * |
8 | * This program is free software; you can redistribute it and/or modify it | 7 | * This program is free software; you can redistribute it and/or modify it |
9 | * under the terms of the GNU General Public License as published by the | 8 | * under the terms of the GNU General Public License as published by the |
10 | * Free Software Foundation; version 2 of the License; | 9 | * Free Software Foundation; version 2 of the License; |
11 | */ | 10 | */ |
11 | |||
12 | #include "ackvec.h" | ||
12 | #include "dccp.h" | 13 | #include "dccp.h" |
14 | |||
15 | #include <linux/dccp.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/errno.h> | ||
13 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
19 | #include <linux/skbuff.h> | ||
14 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
15 | 21 | ||
22 | #include <net/sock.h> | ||
23 | |||
16 | static struct kmem_cache *dccp_ackvec_slab; | 24 | static struct kmem_cache *dccp_ackvec_slab; |
17 | static struct kmem_cache *dccp_ackvec_record_slab; | 25 | static struct kmem_cache *dccp_ackvec_record_slab; |
18 | 26 | ||
19 | struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) | 27 | static struct dccp_ackvec_record *dccp_ackvec_record_new(void) |
20 | { | 28 | { |
21 | struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); | 29 | struct dccp_ackvec_record *avr = |
30 | kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); | ||
22 | 31 | ||
23 | if (av != NULL) { | 32 | if (avr != NULL) |
24 | av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; | 33 | INIT_LIST_HEAD(&avr->avr_node); |
25 | INIT_LIST_HEAD(&av->av_records); | 34 | |
26 | } | 35 | return avr; |
27 | return av; | ||
28 | } | 36 | } |
29 | 37 | ||
30 | static void dccp_ackvec_purge_records(struct dccp_ackvec *av) | 38 | static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) |
31 | { | 39 | { |
32 | struct dccp_ackvec_record *cur, *next; | 40 | if (unlikely(avr == NULL)) |
33 | 41 | return; | |
34 | list_for_each_entry_safe(cur, next, &av->av_records, avr_node) | 42 | /* Check if deleting a linked record */ |
35 | kmem_cache_free(dccp_ackvec_record_slab, cur); | 43 | WARN_ON(!list_empty(&avr->avr_node)); |
36 | INIT_LIST_HEAD(&av->av_records); | 44 | kmem_cache_free(dccp_ackvec_record_slab, avr); |
37 | } | 45 | } |
38 | 46 | ||
39 | void dccp_ackvec_free(struct dccp_ackvec *av) | 47 | static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, |
48 | struct dccp_ackvec_record *avr) | ||
40 | { | 49 | { |
41 | if (likely(av != NULL)) { | 50 | /* |
42 | dccp_ackvec_purge_records(av); | 51 | * AVRs are sorted by seqno. Since we are sending them in order, we |
43 | kmem_cache_free(dccp_ackvec_slab, av); | 52 | * just add the AVR at the head of the list. |
53 | * -sorbo. | ||
54 | */ | ||
55 | if (!list_empty(&av->av_records)) { | ||
56 | const struct dccp_ackvec_record *head = | ||
57 | list_entry(av->av_records.next, | ||
58 | struct dccp_ackvec_record, | ||
59 | avr_node); | ||
60 | BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); | ||
44 | } | 61 | } |
62 | |||
63 | list_add(&avr->avr_node, &av->av_records); | ||
45 | } | 64 | } |
46 | 65 | ||
47 | /** | 66 | int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) |
48 | * dccp_ackvec_update_records - Record information about sent Ack Vectors | ||
49 | * @av: Ack Vector records to update | ||
50 | * @seqno: Sequence number of the packet carrying the Ack Vector just sent | ||
51 | * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector | ||
52 | */ | ||
53 | int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) | ||
54 | { | 67 | { |
68 | struct dccp_sock *dp = dccp_sk(sk); | ||
69 | struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; | ||
70 | /* Figure out how many options do we need to represent the ackvec */ | ||
71 | const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); | ||
72 | u16 len = av->av_vec_len + 2 * nr_opts, i; | ||
73 | u32 elapsed_time; | ||
74 | const unsigned char *tail, *from; | ||
75 | unsigned char *to; | ||
55 | struct dccp_ackvec_record *avr; | 76 | struct dccp_ackvec_record *avr; |
77 | suseconds_t delta; | ||
78 | |||
79 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) | ||
80 | return -1; | ||
81 | |||
82 | delta = ktime_us_delta(ktime_get_real(), av->av_time); | ||
83 | elapsed_time = delta / 10; | ||
56 | 84 | ||
57 | avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); | 85 | if (elapsed_time != 0 && |
86 | dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) | ||
87 | return -1; | ||
88 | |||
89 | avr = dccp_ackvec_record_new(); | ||
58 | if (avr == NULL) | 90 | if (avr == NULL) |
59 | return -ENOBUFS; | 91 | return -1; |
92 | |||
93 | DCCP_SKB_CB(skb)->dccpd_opt_len += len; | ||
94 | |||
95 | to = skb_push(skb, len); | ||
96 | len = av->av_vec_len; | ||
97 | from = av->av_buf + av->av_buf_head; | ||
98 | tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; | ||
99 | |||
100 | for (i = 0; i < nr_opts; ++i) { | ||
101 | int copylen = len; | ||
102 | |||
103 | if (len > DCCP_MAX_ACKVEC_OPT_LEN) | ||
104 | copylen = DCCP_MAX_ACKVEC_OPT_LEN; | ||
105 | |||
106 | *to++ = DCCPO_ACK_VECTOR_0; | ||
107 | *to++ = copylen + 2; | ||
108 | |||
109 | /* Check if buf_head wraps */ | ||
110 | if (from + copylen > tail) { | ||
111 | const u16 tailsize = tail - from; | ||
112 | |||
113 | memcpy(to, from, tailsize); | ||
114 | to += tailsize; | ||
115 | len -= tailsize; | ||
116 | copylen -= tailsize; | ||
117 | from = av->av_buf; | ||
118 | } | ||
119 | |||
120 | memcpy(to, from, copylen); | ||
121 | from += copylen; | ||
122 | to += copylen; | ||
123 | len -= copylen; | ||
124 | } | ||
60 | 125 | ||
61 | avr->avr_ack_seqno = seqno; | ||
62 | avr->avr_ack_ptr = av->av_buf_head; | ||
63 | avr->avr_ack_ackno = av->av_buf_ackno; | ||
64 | avr->avr_ack_nonce = nonce_sum; | ||
65 | avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); | ||
66 | /* | ||
67 | * When the buffer overflows, we keep no more than one record. This is | ||
68 | * the simplest way of disambiguating sender-Acks dating from before the | ||
69 | * overflow from sender-Acks which refer to after the overflow; a simple | ||
70 | * solution is preferable here since we are handling an exception. | ||
71 | */ | ||
72 | if (av->av_overflow) | ||
73 | dccp_ackvec_purge_records(av); | ||
74 | /* | 126 | /* |
75 | * Since GSS is incremented for each packet, the list is automatically | 127 | * From RFC 4340, A.2: |
76 | * arranged in descending order of @ack_seqno. | 128 | * |
129 | * For each acknowledgement it sends, the HC-Receiver will add an | ||
130 | * acknowledgement record. ack_seqno will equal the HC-Receiver | ||
131 | * sequence number it used for the ack packet; ack_ptr will equal | ||
132 | * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will | ||
133 | * equal buf_nonce. | ||
77 | */ | 134 | */ |
78 | list_add(&avr->avr_node, &av->av_records); | 135 | avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; |
136 | avr->avr_ack_ptr = av->av_buf_head; | ||
137 | avr->avr_ack_ackno = av->av_buf_ackno; | ||
138 | avr->avr_ack_nonce = av->av_buf_nonce; | ||
139 | avr->avr_sent_len = av->av_vec_len; | ||
79 | 140 | ||
80 | dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", | 141 | dccp_ackvec_insert_avr(av, avr); |
142 | |||
143 | dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " | ||
144 | "ack_ackno=%llu\n", | ||
145 | dccp_role(sk), avr->avr_sent_len, | ||
81 | (unsigned long long)avr->avr_ack_seqno, | 146 | (unsigned long long)avr->avr_ack_seqno, |
82 | (unsigned long long)avr->avr_ack_ackno, | 147 | (unsigned long long)avr->avr_ack_ackno); |
83 | avr->avr_ack_runlen); | ||
84 | return 0; | 148 | return 0; |
85 | } | 149 | } |
86 | 150 | ||
87 | static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, | 151 | struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) |
88 | const u64 ackno) | ||
89 | { | 152 | { |
90 | struct dccp_ackvec_record *avr; | 153 | struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); |
91 | /* | 154 | |
92 | * Exploit that records are inserted in descending order of sequence | 155 | if (av != NULL) { |
93 | * number, start with the oldest record first. If @ackno is `before' | 156 | av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; |
94 | * the earliest ack_ackno, the packet is too old to be considered. | 157 | av->av_buf_ackno = UINT48_MAX + 1; |
95 | */ | 158 | av->av_buf_nonce = 0; |
96 | list_for_each_entry_reverse(avr, av_list, avr_node) { | 159 | av->av_time = ktime_set(0, 0); |
97 | if (avr->avr_ack_seqno == ackno) | 160 | av->av_vec_len = 0; |
98 | return avr; | 161 | INIT_LIST_HEAD(&av->av_records); |
99 | if (before48(ackno, avr->avr_ack_seqno)) | ||
100 | break; | ||
101 | } | 162 | } |
102 | return NULL; | 163 | |
164 | return av; | ||
103 | } | 165 | } |
104 | 166 | ||
105 | /* | 167 | void dccp_ackvec_free(struct dccp_ackvec *av) |
106 | * Buffer index and length computation using modulo-buffersize arithmetic. | ||
107 | * Note that, as pointers move from right to left, head is `before' tail. | ||
108 | */ | ||
109 | static inline u16 __ackvec_idx_add(const u16 a, const u16 b) | ||
110 | { | 168 | { |
111 | return (a + b) % DCCPAV_MAX_ACKVEC_LEN; | 169 | if (unlikely(av == NULL)) |
170 | return; | ||
171 | |||
172 | if (!list_empty(&av->av_records)) { | ||
173 | struct dccp_ackvec_record *avr, *next; | ||
174 | |||
175 | list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { | ||
176 | list_del_init(&avr->avr_node); | ||
177 | dccp_ackvec_record_delete(avr); | ||
178 | } | ||
179 | } | ||
180 | |||
181 | kmem_cache_free(dccp_ackvec_slab, av); | ||
112 | } | 182 | } |
113 | 183 | ||
114 | static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) | 184 | static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, |
185 | const u32 index) | ||
115 | { | 186 | { |
116 | return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); | 187 | return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; |
117 | } | 188 | } |
118 | 189 | ||
119 | u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) | 190 | static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, |
191 | const u32 index) | ||
120 | { | 192 | { |
121 | if (unlikely(av->av_overflow)) | 193 | return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; |
122 | return DCCPAV_MAX_ACKVEC_LEN; | ||
123 | return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); | ||
124 | } | 194 | } |
125 | 195 | ||
126 | /** | 196 | /* |
127 | * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 | 197 | * If several packets are missing, the HC-Receiver may prefer to enter multiple |
128 | * @av: non-empty buffer to update | 198 | * bytes with run length 0, rather than a single byte with a larger run length; |
129 | * @distance: negative or zero distance of @seqno from buf_ackno downward | 199 | * this simplifies table updates if one of the missing packets arrives. |
130 | * @seqno: the (old) sequence number whose record is to be updated | ||
131 | * @state: state in which packet carrying @seqno was received | ||
132 | */ | 200 | */ |
133 | static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, | 201 | static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, |
134 | u64 seqno, enum dccp_ackvec_states state) | 202 | const unsigned int packets, |
203 | const unsigned char state) | ||
135 | { | 204 | { |
136 | u16 ptr = av->av_buf_head; | 205 | unsigned int gap; |
206 | long new_head; | ||
137 | 207 | ||
138 | BUG_ON(distance > 0); | 208 | if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) |
139 | if (unlikely(dccp_ackvec_is_empty(av))) | 209 | return -ENOBUFS; |
140 | return; | ||
141 | 210 | ||
142 | do { | 211 | gap = packets - 1; |
143 | u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); | 212 | new_head = av->av_buf_head - packets; |
144 | 213 | ||
145 | if (distance + runlen >= 0) { | 214 | if (new_head < 0) { |
146 | /* | 215 | if (gap > 0) { |
147 | * Only update the state if packet has not been received | 216 | memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, |
148 | * yet. This is OK as per the second table in RFC 4340, | 217 | gap + new_head + 1); |
149 | * 11.4.1; i.e. here we are using the following table: | 218 | gap = -new_head; |
150 | * RECEIVED | ||
151 | * 0 1 3 | ||
152 | * S +---+---+---+ | ||
153 | * T 0 | 0 | 0 | 0 | | ||
154 | * O +---+---+---+ | ||
155 | * R 1 | 1 | 1 | 1 | | ||
156 | * E +---+---+---+ | ||
157 | * D 3 | 0 | 1 | 3 | | ||
158 | * +---+---+---+ | ||
159 | * The "Not Received" state was set by reserve_seats(). | ||
160 | */ | ||
161 | if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) | ||
162 | av->av_buf[ptr] = state; | ||
163 | else | ||
164 | dccp_pr_debug("Not changing %llu state to %u\n", | ||
165 | (unsigned long long)seqno, state); | ||
166 | break; | ||
167 | } | 219 | } |
220 | new_head += DCCP_MAX_ACKVEC_LEN; | ||
221 | } | ||
168 | 222 | ||
169 | distance += runlen + 1; | 223 | av->av_buf_head = new_head; |
170 | ptr = __ackvec_idx_add(ptr, 1); | ||
171 | 224 | ||
172 | } while (ptr != av->av_buf_tail); | 225 | if (gap > 0) |
173 | } | 226 | memset(av->av_buf + av->av_buf_head + 1, |
227 | DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); | ||
174 | 228 | ||
175 | /* Mark @num entries after buf_head as "Not yet received". */ | 229 | av->av_buf[av->av_buf_head] = state; |
176 | static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) | 230 | av->av_vec_len += packets; |
177 | { | 231 | return 0; |
178 | u16 start = __ackvec_idx_add(av->av_buf_head, 1), | ||
179 | len = DCCPAV_MAX_ACKVEC_LEN - start; | ||
180 | |||
181 | /* check for buffer wrap-around */ | ||
182 | if (num > len) { | ||
183 | memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); | ||
184 | start = 0; | ||
185 | num -= len; | ||
186 | } | ||
187 | if (num) | ||
188 | memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); | ||
189 | } | 232 | } |
190 | 233 | ||
191 | /** | 234 | /* |
192 | * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer | 235 | * Implements the RFC 4340, Appendix A |
193 | * @av: container of buffer to update (can be empty or non-empty) | ||
194 | * @num_packets: number of packets to register (must be >= 1) | ||
195 | * @seqno: sequence number of the first packet in @num_packets | ||
196 | * @state: state in which packet carrying @seqno was received | ||
197 | */ | 236 | */ |
198 | static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, | 237 | int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, |
199 | u64 seqno, enum dccp_ackvec_states state) | 238 | const u64 ackno, const u8 state) |
200 | { | 239 | { |
201 | u32 num_cells = num_packets; | 240 | /* |
241 | * Check at the right places if the buffer is full, if it is, tell the | ||
242 | * caller to start dropping packets till the HC-Sender acks our ACK | ||
243 | * vectors, when we will free up space in av_buf. | ||
244 | * | ||
245 | * We may well decide to do buffer compression, etc, but for now let's | ||
246 | * just drop. | ||
247 | * | ||
248 | * From Appendix A.1.1 (`New Packets'): | ||
249 | * | ||
250 | * Of course, the circular buffer may overflow, either when the | ||
251 | * HC-Sender is sending data at a very high rate, when the | ||
252 | * HC-Receiver's acknowledgements are not reaching the HC-Sender, | ||
253 | * or when the HC-Sender is forgetting to acknowledge those acks | ||
254 | * (so the HC-Receiver is unable to clean up old state). In this | ||
255 | * case, the HC-Receiver should either compress the buffer (by | ||
256 | * increasing run lengths when possible), transfer its state to | ||
257 | * a larger buffer, or, as a last resort, drop all received | ||
258 | * packets, without processing them whatsoever, until its buffer | ||
259 | * shrinks again. | ||
260 | */ | ||
202 | 261 | ||
203 | if (num_packets > DCCPAV_BURST_THRESH) { | 262 | /* See if this is the first ackno being inserted */ |
204 | u32 lost_packets = num_packets - 1; | 263 | if (av->av_vec_len == 0) { |
264 | av->av_buf[av->av_buf_head] = state; | ||
265 | av->av_vec_len = 1; | ||
266 | } else if (after48(ackno, av->av_buf_ackno)) { | ||
267 | const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); | ||
205 | 268 | ||
206 | DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); | ||
207 | /* | 269 | /* |
208 | * We received 1 packet and have a loss of size "num_packets-1" | 270 | * Look if the state of this packet is the same as the |
209 | * which we squeeze into num_cells-1 rather than reserving an | 271 | * previous ackno and if so if we can bump the head len. |
210 | * entire byte for each lost packet. | ||
211 | * The reason is that the vector grows in O(burst_length); when | ||
212 | * it grows too large there will no room left for the payload. | ||
213 | * This is a trade-off: if a few packets out of the burst show | ||
214 | * up later, their state will not be changed; it is simply too | ||
215 | * costly to reshuffle/reallocate/copy the buffer each time. | ||
216 | * Should such problems persist, we will need to switch to a | ||
217 | * different underlying data structure. | ||
218 | */ | 272 | */ |
219 | for (num_packets = num_cells = 1; lost_packets; ++num_cells) { | 273 | if (delta == 1 && |
220 | u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); | 274 | dccp_ackvec_state(av, av->av_buf_head) == state && |
221 | 275 | dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) | |
222 | av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); | 276 | av->av_buf[av->av_buf_head]++; |
223 | av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; | 277 | else if (dccp_ackvec_set_buf_head_state(av, delta, state)) |
278 | return -ENOBUFS; | ||
279 | } else { | ||
280 | /* | ||
281 | * A.1.2. Old Packets | ||
282 | * | ||
283 | * When a packet with Sequence Number S <= buf_ackno | ||
284 | * arrives, the HC-Receiver will scan the table for | ||
285 | * the byte corresponding to S. (Indexing structures | ||
286 | * could reduce the complexity of this scan.) | ||
287 | */ | ||
288 | u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); | ||
289 | u32 index = av->av_buf_head; | ||
224 | 290 | ||
225 | lost_packets -= len; | 291 | while (1) { |
292 | const u8 len = dccp_ackvec_len(av, index); | ||
293 | const u8 av_state = dccp_ackvec_state(av, index); | ||
294 | /* | ||
295 | * valid packets not yet in av_buf have a reserved | ||
296 | * entry, with a len equal to 0. | ||
297 | */ | ||
298 | if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && | ||
299 | len == 0 && delta == 0) { /* Found our | ||
300 | reserved seat! */ | ||
301 | dccp_pr_debug("Found %llu reserved seat!\n", | ||
302 | (unsigned long long)ackno); | ||
303 | av->av_buf[index] = state; | ||
304 | goto out; | ||
305 | } | ||
306 | /* len == 0 means one packet */ | ||
307 | if (delta < len + 1) | ||
308 | goto out_duplicate; | ||
309 | |||
310 | delta -= len + 1; | ||
311 | if (++index == DCCP_MAX_ACKVEC_LEN) | ||
312 | index = 0; | ||
226 | } | 313 | } |
227 | } | 314 | } |
228 | 315 | ||
229 | if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { | 316 | av->av_buf_ackno = ackno; |
230 | DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); | 317 | av->av_time = ktime_get_real(); |
231 | av->av_overflow = true; | 318 | out: |
232 | } | 319 | return 0; |
233 | |||
234 | av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); | ||
235 | if (av->av_overflow) | ||
236 | av->av_buf_tail = av->av_buf_head; | ||
237 | |||
238 | av->av_buf[av->av_buf_head] = state; | ||
239 | av->av_buf_ackno = seqno; | ||
240 | 320 | ||
241 | if (num_packets > 1) | 321 | out_duplicate: |
242 | dccp_ackvec_reserve_seats(av, num_packets - 1); | 322 | /* Duplicate packet */ |
323 | dccp_pr_debug("Received a dup or already considered lost " | ||
324 | "packet: %llu\n", (unsigned long long)ackno); | ||
325 | return -EILSEQ; | ||
243 | } | 326 | } |
244 | 327 | ||
245 | /** | 328 | static void dccp_ackvec_throw_record(struct dccp_ackvec *av, |
246 | * dccp_ackvec_input - Register incoming packet in the buffer | 329 | struct dccp_ackvec_record *avr) |
247 | */ | ||
248 | void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) | ||
249 | { | 330 | { |
250 | u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; | 331 | struct dccp_ackvec_record *next; |
251 | enum dccp_ackvec_states state = DCCPAV_RECEIVED; | ||
252 | 332 | ||
253 | if (dccp_ackvec_is_empty(av)) { | 333 | /* sort out vector length */ |
254 | dccp_ackvec_add_new(av, 1, seqno, state); | 334 | if (av->av_buf_head <= avr->avr_ack_ptr) |
255 | av->av_tail_ackno = seqno; | 335 | av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; |
336 | else | ||
337 | av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - | ||
338 | av->av_buf_head + avr->avr_ack_ptr; | ||
256 | 339 | ||
257 | } else { | 340 | /* free records */ |
258 | s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); | 341 | list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { |
259 | u8 *current_head = av->av_buf + av->av_buf_head; | 342 | list_del_init(&avr->avr_node); |
260 | 343 | dccp_ackvec_record_delete(avr); | |
261 | if (num_packets == 1 && | 344 | } |
262 | dccp_ackvec_state(current_head) == state && | 345 | } |
263 | dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { | ||
264 | 346 | ||
265 | *current_head += 1; | 347 | void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, |
266 | av->av_buf_ackno = seqno; | 348 | const u64 ackno) |
349 | { | ||
350 | struct dccp_ackvec_record *avr; | ||
267 | 351 | ||
268 | } else if (num_packets > 0) { | 352 | /* |
269 | dccp_ackvec_add_new(av, num_packets, seqno, state); | 353 | * If we traverse backwards, it should be faster when we have large |
270 | } else { | 354 | * windows. We will be receiving ACKs for stuff we sent a while back |
271 | dccp_ackvec_update_old(av, num_packets, seqno, state); | 355 | * -sorbo. |
272 | } | 356 | */ |
357 | list_for_each_entry_reverse(avr, &av->av_records, avr_node) { | ||
358 | if (ackno == avr->avr_ack_seqno) { | ||
359 | dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " | ||
360 | "ack_ackno=%llu, ACKED!\n", | ||
361 | dccp_role(sk), 1, | ||
362 | (unsigned long long)avr->avr_ack_seqno, | ||
363 | (unsigned long long)avr->avr_ack_ackno); | ||
364 | dccp_ackvec_throw_record(av, avr); | ||
365 | break; | ||
366 | } else if (avr->avr_ack_seqno > ackno) | ||
367 | break; /* old news */ | ||
273 | } | 368 | } |
274 | } | 369 | } |
275 | 370 | ||
276 | /** | 371 | static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, |
277 | * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection | 372 | struct sock *sk, u64 *ackno, |
278 | * This routine is called when the peer acknowledges the receipt of Ack Vectors | 373 | const unsigned char len, |
279 | * up to and including @ackno. While based on section A.3 of RFC 4340, here | 374 | const unsigned char *vector) |
280 | * are additional precautions to prevent corrupted buffer state. In particular, | 375 | { |
281 | * we use tail_ackno to identify outdated records; it always marks the earliest | 376 | unsigned char i; |
282 | * packet of group (2) in 11.4.2. | 377 | struct dccp_ackvec_record *avr; |
283 | */ | ||
284 | void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) | ||
285 | { | ||
286 | struct dccp_ackvec_record *avr, *next; | ||
287 | u8 runlen_now, eff_runlen; | ||
288 | s64 delta; | ||
289 | 378 | ||
290 | avr = dccp_ackvec_lookup(&av->av_records, ackno); | 379 | /* Check if we actually sent an ACK vector */ |
291 | if (avr == NULL) | 380 | if (list_empty(&av->av_records)) |
292 | return; | 381 | return; |
293 | /* | ||
294 | * Deal with outdated acknowledgments: this arises when e.g. there are | ||
295 | * several old records and the acks from the peer come in slowly. In | ||
296 | * that case we may still have records that pre-date tail_ackno. | ||
297 | */ | ||
298 | delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); | ||
299 | if (delta < 0) | ||
300 | goto free_records; | ||
301 | /* | ||
302 | * Deal with overlapping Ack Vectors: don't subtract more than the | ||
303 | * number of packets between tail_ackno and ack_ackno. | ||
304 | */ | ||
305 | eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; | ||
306 | 382 | ||
307 | runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); | 383 | i = len; |
308 | /* | 384 | /* |
309 | * The run length of Ack Vector cells does not decrease over time. If | 385 | * XXX |
310 | * the run length is the same as at the time the Ack Vector was sent, we | 386 | * I think it might be more efficient to work backwards. See comment on |
311 | * free the ack_ptr cell. That cell can however not be freed if the run | 387 | * rcv_ackno. -sorbo. |
312 | * length has increased: in this case we need to move the tail pointer | ||
313 | * backwards (towards higher indices), to its next-oldest neighbour. | ||
314 | */ | 388 | */ |
315 | if (runlen_now > eff_runlen) { | 389 | avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); |
390 | while (i--) { | ||
391 | const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; | ||
392 | u64 ackno_end_rl; | ||
316 | 393 | ||
317 | av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; | 394 | dccp_set_seqno(&ackno_end_rl, *ackno - rl); |
318 | av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); | ||
319 | 395 | ||
320 | /* This move may not have cleared the overflow flag. */ | ||
321 | if (av->av_overflow) | ||
322 | av->av_overflow = (av->av_buf_head == av->av_buf_tail); | ||
323 | } else { | ||
324 | av->av_buf_tail = avr->avr_ack_ptr; | ||
325 | /* | 396 | /* |
326 | * We have made sure that avr points to a valid cell within the | 397 | * If our AVR sequence number is greater than the ack, go |
327 | * buffer. This cell is either older than head, or equals head | 398 | * forward in the AVR list until it is not so. |
328 | * (empty buffer): in both cases we no longer have any overflow. | ||
329 | */ | 399 | */ |
330 | av->av_overflow = 0; | 400 | list_for_each_entry_from(avr, &av->av_records, avr_node) { |
331 | } | 401 | if (!after48(avr->avr_ack_seqno, *ackno)) |
332 | 402 | goto found; | |
333 | /* | 403 | } |
334 | * The peer has acknowledged up to and including ack_ackno. Hence the | 404 | /* End of the av_records list, not found, exit */ |
335 | * first packet in group (2) of 11.4.2 is the successor of ack_ackno. | 405 | break; |
336 | */ | 406 | found: |
337 | av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); | 407 | if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { |
408 | const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; | ||
409 | if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { | ||
410 | dccp_pr_debug("%s ACK vector 0, len=%d, " | ||
411 | "ack_seqno=%llu, ack_ackno=%llu, " | ||
412 | "ACKED!\n", | ||
413 | dccp_role(sk), len, | ||
414 | (unsigned long long) | ||
415 | avr->avr_ack_seqno, | ||
416 | (unsigned long long) | ||
417 | avr->avr_ack_ackno); | ||
418 | dccp_ackvec_throw_record(av, avr); | ||
419 | break; | ||
420 | } | ||
421 | /* | ||
422 | * If it wasn't received, continue scanning... we might | ||
423 | * find another one. | ||
424 | */ | ||
425 | } | ||
338 | 426 | ||
339 | free_records: | 427 | dccp_set_seqno(ackno, ackno_end_rl - 1); |
340 | list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { | 428 | ++vector; |
341 | list_del(&avr->avr_node); | ||
342 | kmem_cache_free(dccp_ackvec_record_slab, avr); | ||
343 | } | 429 | } |
344 | } | 430 | } |
345 | 431 | ||
346 | /* | 432 | int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, |
347 | * Routines to keep track of Ack Vectors received in an skb | 433 | u64 *ackno, const u8 opt, const u8 *value, const u8 len) |
348 | */ | ||
349 | int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) | ||
350 | { | 434 | { |
351 | struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); | 435 | if (len > DCCP_MAX_ACKVEC_OPT_LEN) |
352 | 436 | return -1; | |
353 | if (new == NULL) | ||
354 | return -ENOBUFS; | ||
355 | new->vec = vec; | ||
356 | new->len = len; | ||
357 | new->nonce = nonce; | ||
358 | 437 | ||
359 | list_add_tail(&new->node, head); | 438 | /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ |
439 | dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, | ||
440 | ackno, len, value); | ||
360 | return 0; | 441 | return 0; |
361 | } | 442 | } |
362 | EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); | ||
363 | |||
364 | void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) | ||
365 | { | ||
366 | struct dccp_ackvec_parsed *cur, *next; | ||
367 | |||
368 | list_for_each_entry_safe(cur, next, parsed_chunks, node) | ||
369 | kfree(cur); | ||
370 | INIT_LIST_HEAD(parsed_chunks); | ||
371 | } | ||
372 | EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); | ||
373 | 443 | ||
374 | int __init dccp_ackvec_init(void) | 444 | int __init dccp_ackvec_init(void) |
375 | { | 445 | { |
@@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void) | |||
379 | if (dccp_ackvec_slab == NULL) | 449 | if (dccp_ackvec_slab == NULL) |
380 | goto out_err; | 450 | goto out_err; |
381 | 451 | ||
382 | dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", | 452 | dccp_ackvec_record_slab = |
383 | sizeof(struct dccp_ackvec_record), | 453 | kmem_cache_create("dccp_ackvec_record", |
384 | 0, SLAB_HWCACHE_ALIGN, NULL); | 454 | sizeof(struct dccp_ackvec_record), |
455 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
385 | if (dccp_ackvec_record_slab == NULL) | 456 | if (dccp_ackvec_record_slab == NULL) |
386 | goto out_destroy_slab; | 457 | goto out_destroy_slab; |
387 | 458 | ||
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 6cdca79a99f7..bcb64fb4acef 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h | |||
@@ -3,134 +3,156 @@ | |||
3 | /* | 3 | /* |
4 | * net/dccp/ackvec.h | 4 | * net/dccp/ackvec.h |
5 | * | 5 | * |
6 | * An implementation of Ack Vectors for the DCCP protocol | 6 | * An implementation of the DCCP protocol |
7 | * Copyright (c) 2007 University of Aberdeen, Scotland, UK | ||
8 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> | 7 | * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> |
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | 9 | * This program is free software; you can redistribute it and/or modify it |
10 | * under the terms of the GNU General Public License version 2 as | 10 | * under the terms of the GNU General Public License version 2 as |
11 | * published by the Free Software Foundation. | 11 | * published by the Free Software Foundation. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/dccp.h> | ||
15 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> |
15 | #include <linux/ktime.h> | ||
16 | #include <linux/list.h> | 16 | #include <linux/list.h> |
17 | #include <linux/types.h> | 17 | #include <linux/types.h> |
18 | 18 | ||
19 | /* | 19 | /* Read about the ECN nonce to see why it is 253 */ |
20 | * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, | 20 | #define DCCP_MAX_ACKVEC_OPT_LEN 253 |
21 | * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 | 21 | /* We can spread an ack vector across multiple options */ |
22 | * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives | 22 | #define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) |
23 | * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. | ||
24 | * The maximum value is bounded by the u16 types for indices and functions. | ||
25 | */ | ||
26 | #define DCCPAV_NUM_ACKVECS 2 | ||
27 | #define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) | ||
28 | |||
29 | /* Estimated minimum average Ack Vector length - used for updating MPS */ | ||
30 | #define DCCPAV_MIN_OPTLEN 16 | ||
31 | |||
32 | /* Threshold for coping with large bursts of losses */ | ||
33 | #define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) | ||
34 | |||
35 | enum dccp_ackvec_states { | ||
36 | DCCPAV_RECEIVED = 0x00, | ||
37 | DCCPAV_ECN_MARKED = 0x40, | ||
38 | DCCPAV_RESERVED = 0x80, | ||
39 | DCCPAV_NOT_RECEIVED = 0xC0 | ||
40 | }; | ||
41 | #define DCCPAV_MAX_RUNLEN 0x3F | ||
42 | 23 | ||
43 | static inline u8 dccp_ackvec_runlen(const u8 *cell) | 24 | #define DCCP_ACKVEC_STATE_RECEIVED 0 |
44 | { | 25 | #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) |
45 | return *cell & DCCPAV_MAX_RUNLEN; | 26 | #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) |
46 | } | ||
47 | 27 | ||
48 | static inline u8 dccp_ackvec_state(const u8 *cell) | 28 | #define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ |
49 | { | 29 | #define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ |
50 | return *cell & ~DCCPAV_MAX_RUNLEN; | ||
51 | } | ||
52 | 30 | ||
53 | /** struct dccp_ackvec - Ack Vector main data structure | 31 | /** struct dccp_ackvec - ack vector |
32 | * | ||
33 | * This data structure is the one defined in RFC 4340, Appendix A. | ||
54 | * | 34 | * |
55 | * This implements a fixed-size circular buffer within an array and is largely | 35 | * @av_buf_head - circular buffer head |
56 | * based on Appendix A of RFC 4340. | 36 | * @av_buf_tail - circular buffer tail |
37 | * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the | ||
38 | * buffer (i.e. %av_buf_head) | ||
39 | * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked | ||
40 | * by the buffer with State 0 | ||
57 | * | 41 | * |
58 | * @av_buf: circular buffer storage area | 42 | * Additionally, the HC-Receiver must keep some information about the |
59 | * @av_buf_head: head index; begin of live portion in @av_buf | 43 | * Ack Vectors it has recently sent. For each packet sent carrying an |
60 | * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf | 44 | * Ack Vector, it remembers four variables: |
61 | * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf | 45 | * |
62 | * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf | 46 | * @av_records - list of dccp_ackvec_record |
63 | * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to | 47 | * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. |
64 | * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf | 48 | * |
65 | * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound | 49 | * @av_time - the time in usecs |
66 | * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) | 50 | * @av_buf - circular buffer of acknowledgeable packets |
67 | */ | 51 | */ |
68 | struct dccp_ackvec { | 52 | struct dccp_ackvec { |
69 | u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; | 53 | u64 av_buf_ackno; |
70 | u16 av_buf_head; | ||
71 | u16 av_buf_tail; | ||
72 | u64 av_buf_ackno:48; | ||
73 | u64 av_tail_ackno:48; | ||
74 | bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; | ||
75 | u8 av_overflow:1; | ||
76 | struct list_head av_records; | 54 | struct list_head av_records; |
55 | ktime_t av_time; | ||
56 | u16 av_buf_head; | ||
57 | u16 av_vec_len; | ||
58 | u8 av_buf_nonce; | ||
59 | u8 av_ack_nonce; | ||
60 | u8 av_buf[DCCP_MAX_ACKVEC_LEN]; | ||
77 | }; | 61 | }; |
78 | 62 | ||
79 | /** struct dccp_ackvec_record - Records information about sent Ack Vectors | 63 | /** struct dccp_ackvec_record - ack vector record |
80 | * | 64 | * |
81 | * These list entries define the additional information which the HC-Receiver | 65 | * ACK vector record as defined in Appendix A of spec. |
82 | * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. | ||
83 | * | 66 | * |
84 | * @avr_node: the list node in @av_records | 67 | * The list is sorted by avr_ack_seqno |
85 | * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on | ||
86 | * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to | ||
87 | * @avr_ack_ptr: pointer into @av_buf where this record starts | ||
88 | * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending | ||
89 | * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent | ||
90 | * | 68 | * |
91 | * The list as a whole is sorted in descending order by @avr_ack_seqno. | 69 | * @avr_node - node in av_records |
70 | * @avr_ack_seqno - sequence number of the packet this record was sent on | ||
71 | * @avr_ack_ackno - sequence number being acknowledged | ||
72 | * @avr_ack_ptr - pointer into av_buf where this record starts | ||
73 | * @avr_ack_nonce - av_ack_nonce at the time this record was sent | ||
74 | * @avr_sent_len - length of the record in av_buf | ||
92 | */ | 75 | */ |
93 | struct dccp_ackvec_record { | 76 | struct dccp_ackvec_record { |
94 | struct list_head avr_node; | 77 | struct list_head avr_node; |
95 | u64 avr_ack_seqno:48; | 78 | u64 avr_ack_seqno; |
96 | u64 avr_ack_ackno:48; | 79 | u64 avr_ack_ackno; |
97 | u16 avr_ack_ptr; | 80 | u16 avr_ack_ptr; |
98 | u8 avr_ack_runlen; | 81 | u16 avr_sent_len; |
99 | u8 avr_ack_nonce:1; | 82 | u8 avr_ack_nonce; |
100 | }; | 83 | }; |
101 | 84 | ||
102 | extern int dccp_ackvec_init(void); | 85 | struct sock; |
86 | struct sk_buff; | ||
87 | |||
88 | #ifdef CONFIG_IP_DCCP_ACKVEC | ||
89 | extern int dccp_ackvec_init(void); | ||
103 | extern void dccp_ackvec_exit(void); | 90 | extern void dccp_ackvec_exit(void); |
104 | 91 | ||
105 | extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); | 92 | extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); |
106 | extern void dccp_ackvec_free(struct dccp_ackvec *av); | 93 | extern void dccp_ackvec_free(struct dccp_ackvec *av); |
107 | 94 | ||
108 | extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); | 95 | extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, |
109 | extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); | 96 | const u64 ackno, const u8 state); |
110 | extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); | 97 | |
111 | extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); | 98 | extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, |
99 | struct sock *sk, const u64 ackno); | ||
100 | extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, | ||
101 | u64 *ackno, const u8 opt, | ||
102 | const u8 *value, const u8 len); | ||
112 | 103 | ||
113 | static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) | 104 | extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); |
105 | |||
106 | static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) | ||
107 | { | ||
108 | return av->av_vec_len; | ||
109 | } | ||
110 | #else /* CONFIG_IP_DCCP_ACKVEC */ | ||
111 | static inline int dccp_ackvec_init(void) | ||
114 | { | 112 | { |
115 | return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; | 113 | return 0; |
116 | } | 114 | } |
117 | 115 | ||
118 | /** | 116 | static inline void dccp_ackvec_exit(void) |
119 | * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb | 117 | { |
120 | * @vec: start of vector (offset into skb) | 118 | } |
121 | * @len: length of @vec | 119 | |
122 | * @nonce: whether @vec had an ECN nonce of 0 or 1 | 120 | static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) |
123 | * @node: FIFO - arranged in descending order of ack_ackno | 121 | { |
124 | * This structure is used by CCIDs to access Ack Vectors in a received skb. | 122 | return NULL; |
125 | */ | 123 | } |
126 | struct dccp_ackvec_parsed { | 124 | |
127 | u8 *vec, | 125 | static inline void dccp_ackvec_free(struct dccp_ackvec *av) |
128 | len, | 126 | { |
129 | nonce:1; | 127 | } |
130 | struct list_head node; | 128 | |
131 | }; | 129 | static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, |
130 | const u64 ackno, const u8 state) | ||
131 | { | ||
132 | return -1; | ||
133 | } | ||
132 | 134 | ||
133 | extern int dccp_ackvec_parsed_add(struct list_head *head, | 135 | static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, |
134 | u8 *vec, u8 len, u8 nonce); | 136 | struct sock *sk, const u64 ackno) |
135 | extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); | 137 | { |
138 | } | ||
139 | |||
140 | static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, | ||
141 | const u64 *ackno, const u8 opt, | ||
142 | const u8 *value, const u8 len) | ||
143 | { | ||
144 | return -1; | ||
145 | } | ||
146 | |||
147 | static inline int dccp_insert_option_ackvec(const struct sock *sk, | ||
148 | const struct sk_buff *skb) | ||
149 | { | ||
150 | return -1; | ||
151 | } | ||
152 | |||
153 | static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) | ||
154 | { | ||
155 | return 0; | ||
156 | } | ||
157 | #endif /* CONFIG_IP_DCCP_ACKVEC */ | ||
136 | #endif /* _ACKVEC_H */ | 158 | #endif /* _ACKVEC_H */ |
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index e3fb52b4f5c6..4809753d12ae 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c | |||
@@ -13,13 +13,6 @@ | |||
13 | 13 | ||
14 | #include "ccid.h" | 14 | #include "ccid.h" |
15 | 15 | ||
16 | static u8 builtin_ccids[] = { | ||
17 | DCCPC_CCID2, /* CCID2 is supported by default */ | ||
18 | #if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) | ||
19 | DCCPC_CCID3, | ||
20 | #endif | ||
21 | }; | ||
22 | |||
23 | static struct ccid_operations *ccids[CCID_MAX]; | 16 | static struct ccid_operations *ccids[CCID_MAX]; |
24 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 17 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
25 | static atomic_t ccids_lockct = ATOMIC_INIT(0); | 18 | static atomic_t ccids_lockct = ATOMIC_INIT(0); |
@@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) | |||
93 | } | 86 | } |
94 | } | 87 | } |
95 | 88 | ||
96 | /* check that up to @array_len members in @ccid_array are supported */ | ||
97 | bool ccid_support_check(u8 const *ccid_array, u8 array_len) | ||
98 | { | ||
99 | u8 i, j, found; | ||
100 | |||
101 | for (i = 0, found = 0; i < array_len; i++, found = 0) { | ||
102 | for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) | ||
103 | found = (ccid_array[i] == builtin_ccids[j]); | ||
104 | if (!found) | ||
105 | return false; | ||
106 | } | ||
107 | return true; | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array | ||
112 | * @ccid_array: pointer to copy into | ||
113 | * @array_len: value to return length into | ||
114 | * This function allocates memory - caller must see that it is freed after use. | ||
115 | */ | ||
116 | int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) | ||
117 | { | ||
118 | *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); | ||
119 | if (*ccid_array == NULL) | ||
120 | return -ENOBUFS; | ||
121 | *array_len = ARRAY_SIZE(builtin_ccids); | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, | ||
126 | char __user *optval, int __user *optlen) | ||
127 | { | ||
128 | if (len < sizeof(builtin_ccids)) | ||
129 | return -EINVAL; | ||
130 | |||
131 | if (put_user(sizeof(builtin_ccids), optlen) || | ||
132 | copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) | ||
133 | return -EFAULT; | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | int ccid_register(struct ccid_operations *ccid_ops) | 89 | int ccid_register(struct ccid_operations *ccid_ops) |
138 | { | 90 | { |
139 | int err = -ENOBUFS; | 91 | int err = -ENOBUFS; |
@@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops) | |||
196 | 148 | ||
197 | EXPORT_SYMBOL_GPL(ccid_unregister); | 149 | EXPORT_SYMBOL_GPL(ccid_unregister); |
198 | 150 | ||
199 | /** | ||
200 | * ccid_request_module - Pre-load CCID module for later use | ||
201 | * This should be called only from process context (e.g. during connection | ||
202 | * setup) and is necessary for later calls to ccid_new (typically in software | ||
203 | * interrupt), so that it has the modules available when they are needed. | ||
204 | */ | ||
205 | static int ccid_request_module(u8 id) | ||
206 | { | ||
207 | if (!in_atomic()) { | ||
208 | ccids_read_lock(); | ||
209 | if (ccids[id] == NULL) { | ||
210 | ccids_read_unlock(); | ||
211 | return request_module("net-dccp-ccid-%d", id); | ||
212 | } | ||
213 | ccids_read_unlock(); | ||
214 | } | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | int ccid_request_modules(u8 const *ccid_array, u8 array_len) | ||
219 | { | ||
220 | #ifdef CONFIG_KMOD | ||
221 | while (array_len--) | ||
222 | if (ccid_request_module(ccid_array[array_len])) | ||
223 | return -1; | ||
224 | #endif | ||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) | 151 | struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) |
229 | { | 152 | { |
230 | struct ccid_operations *ccid_ops; | 153 | struct ccid_operations *ccid_ops; |
231 | struct ccid *ccid = NULL; | 154 | struct ccid *ccid = NULL; |
232 | 155 | ||
233 | ccids_read_lock(); | 156 | ccids_read_lock(); |
157 | #ifdef CONFIG_KMOD | ||
158 | if (ccids[id] == NULL) { | ||
159 | /* We only try to load if in process context */ | ||
160 | ccids_read_unlock(); | ||
161 | if (gfp & GFP_ATOMIC) | ||
162 | goto out; | ||
163 | request_module("net-dccp-ccid-%d", id); | ||
164 | ccids_read_lock(); | ||
165 | } | ||
166 | #endif | ||
234 | ccid_ops = ccids[id]; | 167 | ccid_ops = ccids[id]; |
235 | if (ccid_ops == NULL) | 168 | if (ccid_ops == NULL) |
236 | goto out_unlock; | 169 | goto out_unlock; |
@@ -272,6 +205,20 @@ out_module_put: | |||
272 | 205 | ||
273 | EXPORT_SYMBOL_GPL(ccid_new); | 206 | EXPORT_SYMBOL_GPL(ccid_new); |
274 | 207 | ||
208 | struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) | ||
209 | { | ||
210 | return ccid_new(id, sk, 1, gfp); | ||
211 | } | ||
212 | |||
213 | EXPORT_SYMBOL_GPL(ccid_hc_rx_new); | ||
214 | |||
215 | struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) | ||
216 | { | ||
217 | return ccid_new(id, sk, 0, gfp); | ||
218 | } | ||
219 | |||
220 | EXPORT_SYMBOL_GPL(ccid_hc_tx_new); | ||
221 | |||
275 | static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) | 222 | static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) |
276 | { | 223 | { |
277 | struct ccid_operations *ccid_ops; | 224 | struct ccid_operations *ccid_ops; |
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index d27054ba2159..fdeae7b57319 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h | |||
@@ -60,18 +60,22 @@ struct ccid_operations { | |||
60 | void (*ccid_hc_tx_exit)(struct sock *sk); | 60 | void (*ccid_hc_tx_exit)(struct sock *sk); |
61 | void (*ccid_hc_rx_packet_recv)(struct sock *sk, | 61 | void (*ccid_hc_rx_packet_recv)(struct sock *sk, |
62 | struct sk_buff *skb); | 62 | struct sk_buff *skb); |
63 | int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, | 63 | int (*ccid_hc_rx_parse_options)(struct sock *sk, |
64 | u8 opt, u8 *val, u8 len); | 64 | unsigned char option, |
65 | unsigned char len, u16 idx, | ||
66 | unsigned char* value); | ||
65 | int (*ccid_hc_rx_insert_options)(struct sock *sk, | 67 | int (*ccid_hc_rx_insert_options)(struct sock *sk, |
66 | struct sk_buff *skb); | 68 | struct sk_buff *skb); |
67 | void (*ccid_hc_tx_packet_recv)(struct sock *sk, | 69 | void (*ccid_hc_tx_packet_recv)(struct sock *sk, |
68 | struct sk_buff *skb); | 70 | struct sk_buff *skb); |
69 | int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, | 71 | int (*ccid_hc_tx_parse_options)(struct sock *sk, |
70 | u8 opt, u8 *val, u8 len); | 72 | unsigned char option, |
73 | unsigned char len, u16 idx, | ||
74 | unsigned char* value); | ||
71 | int (*ccid_hc_tx_send_packet)(struct sock *sk, | 75 | int (*ccid_hc_tx_send_packet)(struct sock *sk, |
72 | struct sk_buff *skb); | 76 | struct sk_buff *skb); |
73 | void (*ccid_hc_tx_packet_sent)(struct sock *sk, | 77 | void (*ccid_hc_tx_packet_sent)(struct sock *sk, |
74 | unsigned int len); | 78 | int more, unsigned int len); |
75 | void (*ccid_hc_rx_get_info)(struct sock *sk, | 79 | void (*ccid_hc_rx_get_info)(struct sock *sk, |
76 | struct tcp_info *info); | 80 | struct tcp_info *info); |
77 | void (*ccid_hc_tx_get_info)(struct sock *sk, | 81 | void (*ccid_hc_tx_get_info)(struct sock *sk, |
@@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid) | |||
99 | return (void *)ccid->ccid_priv; | 103 | return (void *)ccid->ccid_priv; |
100 | } | 104 | } |
101 | 105 | ||
102 | extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); | ||
103 | extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); | ||
104 | extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, | ||
105 | char __user *, int __user *); | ||
106 | |||
107 | extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); | ||
108 | extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, | 106 | extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, |
109 | gfp_t gfp); | 107 | gfp_t gfp); |
110 | 108 | ||
111 | static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) | 109 | extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, |
112 | { | 110 | gfp_t gfp); |
113 | struct ccid *ccid = dp->dccps_hc_rx_ccid; | 111 | extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, |
114 | 112 | gfp_t gfp); | |
115 | if (ccid == NULL || ccid->ccid_ops == NULL) | ||
116 | return -1; | ||
117 | return ccid->ccid_ops->ccid_id; | ||
118 | } | ||
119 | |||
120 | static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) | ||
121 | { | ||
122 | struct ccid *ccid = dp->dccps_hc_tx_ccid; | ||
123 | |||
124 | if (ccid == NULL || ccid->ccid_ops == NULL) | ||
125 | return -1; | ||
126 | return ccid->ccid_ops->ccid_id; | ||
127 | } | ||
128 | 113 | ||
129 | extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); | 114 | extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); |
130 | extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); | 115 | extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); |
131 | 116 | ||
132 | /* | ||
133 | * Congestion control of queued data packets via CCID decision. | ||
134 | * | ||
135 | * The TX CCID performs its congestion-control by indicating whether and when a | ||
136 | * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). | ||
137 | * The following modes are supported via the symbolic constants below: | ||
138 | * - timer-based pacing (CCID returns a delay value in milliseconds); | ||
139 | * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). | ||
140 | */ | ||
141 | |||
142 | enum ccid_dequeueing_decision { | ||
143 | CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ | ||
144 | CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ | ||
145 | CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ | ||
146 | CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ | ||
147 | CCID_PACKET_ERR = 0xF0000, /* error condition */ | ||
148 | }; | ||
149 | |||
150 | static inline int ccid_packet_dequeue_eval(const int return_code) | ||
151 | { | ||
152 | if (return_code < 0) | ||
153 | return CCID_PACKET_ERR; | ||
154 | if (return_code == 0) | ||
155 | return CCID_PACKET_SEND_AT_ONCE; | ||
156 | if (return_code <= CCID_PACKET_DELAY_MAX) | ||
157 | return CCID_PACKET_DELAY; | ||
158 | return return_code; | ||
159 | } | ||
160 | |||
161 | static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, | 117 | static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, |
162 | struct sk_buff *skb) | 118 | struct sk_buff *skb) |
163 | { | 119 | { |
120 | int rc = 0; | ||
164 | if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) | 121 | if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) |
165 | return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); | 122 | rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); |
166 | return CCID_PACKET_SEND_AT_ONCE; | 123 | return rc; |
167 | } | 124 | } |
168 | 125 | ||
169 | static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, | 126 | static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, |
170 | unsigned int len) | 127 | int more, unsigned int len) |
171 | { | 128 | { |
172 | if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) | 129 | if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) |
173 | ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); | 130 | ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); |
174 | } | 131 | } |
175 | 132 | ||
176 | static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, | 133 | static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, |
@@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, | |||
187 | ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); | 144 | ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); |
188 | } | 145 | } |
189 | 146 | ||
190 | /** | ||
191 | * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver | ||
192 | * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) | ||
193 | * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) | ||
194 | * @val: value of @opt | ||
195 | * @len: length of @val in bytes | ||
196 | */ | ||
197 | static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, | 147 | static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, |
198 | u8 pkt, u8 opt, u8 *val, u8 len) | 148 | unsigned char option, |
149 | unsigned char len, u16 idx, | ||
150 | unsigned char* value) | ||
199 | { | 151 | { |
200 | if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) | 152 | int rc = 0; |
201 | return 0; | 153 | if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) |
202 | return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); | 154 | rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, |
155 | value); | ||
156 | return rc; | ||
203 | } | 157 | } |
204 | 158 | ||
205 | /** | ||
206 | * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender | ||
207 | * Arguments are analogous to ccid_hc_tx_parse_options() | ||
208 | */ | ||
209 | static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, | 159 | static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, |
210 | u8 pkt, u8 opt, u8 *val, u8 len) | 160 | unsigned char option, |
161 | unsigned char len, u16 idx, | ||
162 | unsigned char* value) | ||
211 | { | 163 | { |
212 | if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) | 164 | int rc = 0; |
213 | return 0; | 165 | if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) |
214 | return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); | 166 | rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); |
167 | return rc; | ||
215 | } | 168 | } |
216 | 169 | ||
217 | static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, | 170 | static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, |
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index fb168be2cb43..12275943eab8 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig | |||
@@ -1,8 +1,10 @@ | |||
1 | menu "DCCP CCIDs Configuration (EXPERIMENTAL)" | 1 | menu "DCCP CCIDs Configuration (EXPERIMENTAL)" |
2 | depends on EXPERIMENTAL | ||
2 | 3 | ||
3 | config IP_DCCP_CCID2 | 4 | config IP_DCCP_CCID2 |
4 | tristate "CCID2 (TCP-Like)" | 5 | tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" |
5 | def_tristate IP_DCCP | 6 | def_tristate IP_DCCP |
7 | select IP_DCCP_ACKVEC | ||
6 | ---help--- | 8 | ---help--- |
7 | CCID 2, TCP-like Congestion Control, denotes Additive Increase, | 9 | CCID 2, TCP-like Congestion Control, denotes Additive Increase, |
8 | Multiplicative Decrease (AIMD) congestion control with behavior | 10 | Multiplicative Decrease (AIMD) congestion control with behavior |
@@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG | |||
34 | If in doubt, say N. | 36 | If in doubt, say N. |
35 | 37 | ||
36 | config IP_DCCP_CCID3 | 38 | config IP_DCCP_CCID3 |
37 | tristate "CCID3 (TCP-Friendly)" | 39 | tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" |
38 | def_tristate IP_DCCP | 40 | def_tristate IP_DCCP |
39 | select IP_DCCP_TFRC_LIB | 41 | select IP_DCCP_TFRC_LIB |
40 | ---help--- | 42 | ---help--- |
@@ -62,9 +64,9 @@ config IP_DCCP_CCID3 | |||
62 | 64 | ||
63 | If in doubt, say M. | 65 | If in doubt, say M. |
64 | 66 | ||
65 | if IP_DCCP_CCID3 | ||
66 | config IP_DCCP_CCID3_DEBUG | 67 | config IP_DCCP_CCID3_DEBUG |
67 | bool "CCID3 debugging messages" | 68 | bool "CCID3 debugging messages" |
69 | depends on IP_DCCP_CCID3 | ||
68 | ---help--- | 70 | ---help--- |
69 | Enable CCID3-specific debugging messages. | 71 | Enable CCID3-specific debugging messages. |
70 | 72 | ||
@@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG | |||
74 | 76 | ||
75 | If in doubt, say N. | 77 | If in doubt, say N. |
76 | 78 | ||
77 | choice | ||
78 | prompt "Select method for measuring the packet size s" | ||
79 | default IP_DCCP_CCID3_MEASURE_S_AS_MPS | ||
80 | |||
81 | config IP_DCCP_CCID3_MEASURE_S_AS_MPS | ||
82 | bool "Always use MPS in place of s" | ||
83 | ---help--- | ||
84 | This use is recommended as it is consistent with the initialisation | ||
85 | of X and suggested when s varies (rfc3448bis, (1) in section 4.1). | ||
86 | config IP_DCCP_CCID3_MEASURE_S_AS_AVG | ||
87 | bool "Use moving average" | ||
88 | ---help--- | ||
89 | An alternative way of tracking s, also supported by rfc3448bis. | ||
90 | This used to be the default for CCID-3 in previous kernels. | ||
91 | config IP_DCCP_CCID3_MEASURE_S_AS_MAX | ||
92 | bool "Track the maximum payload length" | ||
93 | ---help--- | ||
94 | An experimental method based on tracking the maximum packet size. | ||
95 | endchoice | ||
96 | |||
97 | config IP_DCCP_CCID3_RTO | 79 | config IP_DCCP_CCID3_RTO |
98 | int "Use higher bound for nofeedback timer" | 80 | int "Use higher bound for nofeedback timer" |
99 | default 100 | 81 | default 100 |
82 | depends on IP_DCCP_CCID3 && EXPERIMENTAL | ||
100 | ---help--- | 83 | ---help--- |
101 | Use higher lower bound for nofeedback timer expiration. | 84 | Use higher lower bound for nofeedback timer expiration. |
102 | 85 | ||
@@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO | |||
123 | The purpose of the nofeedback timer is to slow DCCP down when there | 106 | The purpose of the nofeedback timer is to slow DCCP down when there |
124 | is serious network congestion: experimenting with larger values should | 107 | is serious network congestion: experimenting with larger values should |
125 | therefore not be performed on WANs. | 108 | therefore not be performed on WANs. |
126 | endif # IP_DCCP_CCID3 | ||
127 | 109 | ||
128 | config IP_DCCP_TFRC_LIB | 110 | config IP_DCCP_TFRC_LIB |
129 | tristate | 111 | tristate |
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa713227c66f..9a430734530c 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c | |||
@@ -25,7 +25,7 @@ | |||
25 | /* | 25 | /* |
26 | * This implementation should follow RFC 4341 | 26 | * This implementation should follow RFC 4341 |
27 | */ | 27 | */ |
28 | #include "../feat.h" | 28 | |
29 | #include "../ccid.h" | 29 | #include "../ccid.h" |
30 | #include "../dccp.h" | 30 | #include "../dccp.h" |
31 | #include "ccid2.h" | 31 | #include "ccid2.h" |
@@ -34,8 +34,51 @@ | |||
34 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 34 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
35 | static int ccid2_debug; | 35 | static int ccid2_debug; |
36 | #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) | 36 | #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) |
37 | |||
38 | static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) | ||
39 | { | ||
40 | int len = 0; | ||
41 | int pipe = 0; | ||
42 | struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; | ||
43 | |||
44 | /* there is data in the chain */ | ||
45 | if (seqp != hctx->ccid2hctx_seqt) { | ||
46 | seqp = seqp->ccid2s_prev; | ||
47 | len++; | ||
48 | if (!seqp->ccid2s_acked) | ||
49 | pipe++; | ||
50 | |||
51 | while (seqp != hctx->ccid2hctx_seqt) { | ||
52 | struct ccid2_seq *prev = seqp->ccid2s_prev; | ||
53 | |||
54 | len++; | ||
55 | if (!prev->ccid2s_acked) | ||
56 | pipe++; | ||
57 | |||
58 | /* packets are sent sequentially */ | ||
59 | BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, | ||
60 | prev->ccid2s_seq ) >= 0); | ||
61 | BUG_ON(time_before(seqp->ccid2s_sent, | ||
62 | prev->ccid2s_sent)); | ||
63 | |||
64 | seqp = prev; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | BUG_ON(pipe != hctx->ccid2hctx_pipe); | ||
69 | ccid2_pr_debug("len of chain=%d\n", len); | ||
70 | |||
71 | do { | ||
72 | seqp = seqp->ccid2s_prev; | ||
73 | len++; | ||
74 | } while (seqp != hctx->ccid2hctx_seqh); | ||
75 | |||
76 | ccid2_pr_debug("total len=%d\n", len); | ||
77 | BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); | ||
78 | } | ||
37 | #else | 79 | #else |
38 | #define ccid2_pr_debug(format, a...) | 80 | #define ccid2_pr_debug(format, a...) |
81 | #define ccid2_hc_tx_check_sanity(hctx) | ||
39 | #endif | 82 | #endif |
40 | 83 | ||
41 | static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | 84 | static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) |
@@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | |||
44 | int i; | 87 | int i; |
45 | 88 | ||
46 | /* check if we have space to preserve the pointer to the buffer */ | 89 | /* check if we have space to preserve the pointer to the buffer */ |
47 | if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) | 90 | if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / |
91 | sizeof(struct ccid2_seq*))) | ||
48 | return -ENOMEM; | 92 | return -ENOMEM; |
49 | 93 | ||
50 | /* allocate buffer and initialize linked list */ | 94 | /* allocate buffer and initialize linked list */ |
@@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) | |||
60 | seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; | 104 | seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; |
61 | 105 | ||
62 | /* This is the first allocation. Initiate the head and tail. */ | 106 | /* This is the first allocation. Initiate the head and tail. */ |
63 | if (hctx->seqbufc == 0) | 107 | if (hctx->ccid2hctx_seqbufc == 0) |
64 | hctx->seqh = hctx->seqt = seqp; | 108 | hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; |
65 | else { | 109 | else { |
66 | /* link the existing list with the one we just created */ | 110 | /* link the existing list with the one we just created */ |
67 | hctx->seqh->ccid2s_next = seqp; | 111 | hctx->ccid2hctx_seqh->ccid2s_next = seqp; |
68 | seqp->ccid2s_prev = hctx->seqh; | 112 | seqp->ccid2s_prev = hctx->ccid2hctx_seqh; |
69 | 113 | ||
70 | hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; | 114 | hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; |
71 | seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; | 115 | seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; |
72 | } | 116 | } |
73 | 117 | ||
74 | /* store the original pointer to the buffer so we can free it */ | 118 | /* store the original pointer to the buffer so we can free it */ |
75 | hctx->seqbuf[hctx->seqbufc] = seqp; | 119 | hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; |
76 | hctx->seqbufc++; | 120 | hctx->ccid2hctx_seqbufc++; |
77 | 121 | ||
78 | return 0; | 122 | return 0; |
79 | } | 123 | } |
80 | 124 | ||
81 | static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | 125 | static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) |
82 | { | 126 | { |
83 | if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) | 127 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
84 | return CCID_PACKET_WILL_DEQUEUE_LATER; | 128 | |
85 | return CCID_PACKET_SEND_AT_ONCE; | 129 | if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) |
130 | return 0; | ||
131 | |||
132 | return 1; /* XXX CCID should dequeue when ready instead of polling */ | ||
86 | } | 133 | } |
87 | 134 | ||
88 | static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | 135 | static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) |
89 | { | 136 | { |
90 | struct dccp_sock *dp = dccp_sk(sk); | 137 | struct dccp_sock *dp = dccp_sk(sk); |
91 | u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); | 138 | u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); |
92 | 139 | ||
93 | /* | 140 | /* |
94 | * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from | 141 | * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from |
@@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | |||
100 | DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); | 147 | DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); |
101 | val = max_ratio; | 148 | val = max_ratio; |
102 | } | 149 | } |
103 | if (val > DCCPF_ACK_RATIO_MAX) | 150 | if (val > 0xFFFF) /* RFC 4340, 11.3 */ |
104 | val = DCCPF_ACK_RATIO_MAX; | 151 | val = 0xFFFF; |
105 | 152 | ||
106 | if (val == dp->dccps_l_ack_ratio) | 153 | if (val == dp->dccps_l_ack_ratio) |
107 | return; | 154 | return; |
@@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) | |||
110 | dp->dccps_l_ack_ratio = val; | 157 | dp->dccps_l_ack_ratio = val; |
111 | } | 158 | } |
112 | 159 | ||
160 | static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) | ||
161 | { | ||
162 | ccid2_pr_debug("change SRTT to %ld\n", val); | ||
163 | hctx->ccid2hctx_srtt = val; | ||
164 | } | ||
165 | |||
166 | static void ccid2_start_rto_timer(struct sock *sk); | ||
167 | |||
113 | static void ccid2_hc_tx_rto_expire(unsigned long data) | 168 | static void ccid2_hc_tx_rto_expire(unsigned long data) |
114 | { | 169 | { |
115 | struct sock *sk = (struct sock *)data; | 170 | struct sock *sk = (struct sock *)data; |
116 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 171 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
117 | const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); | 172 | long s; |
118 | 173 | ||
119 | bh_lock_sock(sk); | 174 | bh_lock_sock(sk); |
120 | if (sock_owned_by_user(sk)) { | 175 | if (sock_owned_by_user(sk)) { |
121 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); | 176 | sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, |
177 | jiffies + HZ / 5); | ||
122 | goto out; | 178 | goto out; |
123 | } | 179 | } |
124 | 180 | ||
125 | ccid2_pr_debug("RTO_EXPIRE\n"); | 181 | ccid2_pr_debug("RTO_EXPIRE\n"); |
126 | 182 | ||
183 | ccid2_hc_tx_check_sanity(hctx); | ||
184 | |||
127 | /* back-off timer */ | 185 | /* back-off timer */ |
128 | hctx->rto <<= 1; | 186 | hctx->ccid2hctx_rto <<= 1; |
129 | if (hctx->rto > DCCP_RTO_MAX) | 187 | |
130 | hctx->rto = DCCP_RTO_MAX; | 188 | s = hctx->ccid2hctx_rto / HZ; |
189 | if (s > 60) | ||
190 | hctx->ccid2hctx_rto = 60 * HZ; | ||
191 | |||
192 | ccid2_start_rto_timer(sk); | ||
131 | 193 | ||
132 | /* adjust pipe, cwnd etc */ | 194 | /* adjust pipe, cwnd etc */ |
133 | hctx->ssthresh = hctx->cwnd / 2; | 195 | hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; |
134 | if (hctx->ssthresh < 2) | 196 | if (hctx->ccid2hctx_ssthresh < 2) |
135 | hctx->ssthresh = 2; | 197 | hctx->ccid2hctx_ssthresh = 2; |
136 | hctx->cwnd = 1; | 198 | hctx->ccid2hctx_cwnd = 1; |
137 | hctx->pipe = 0; | 199 | hctx->ccid2hctx_pipe = 0; |
138 | 200 | ||
139 | /* clear state about stuff we sent */ | 201 | /* clear state about stuff we sent */ |
140 | hctx->seqt = hctx->seqh; | 202 | hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; |
141 | hctx->packets_acked = 0; | 203 | hctx->ccid2hctx_packets_acked = 0; |
142 | 204 | ||
143 | /* clear ack ratio state. */ | 205 | /* clear ack ratio state. */ |
144 | hctx->rpseq = 0; | 206 | hctx->ccid2hctx_rpseq = 0; |
145 | hctx->rpdupack = -1; | 207 | hctx->ccid2hctx_rpdupack = -1; |
146 | ccid2_change_l_ack_ratio(sk, 1); | 208 | ccid2_change_l_ack_ratio(sk, 1); |
147 | 209 | ccid2_hc_tx_check_sanity(hctx); | |
148 | /* if we were blocked before, we may now send cwnd=1 packet */ | ||
149 | if (sender_was_blocked) | ||
150 | tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); | ||
151 | /* restart backed-off timer */ | ||
152 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); | ||
153 | out: | 210 | out: |
154 | bh_unlock_sock(sk); | 211 | bh_unlock_sock(sk); |
155 | sock_put(sk); | 212 | sock_put(sk); |
156 | } | 213 | } |
157 | 214 | ||
158 | static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) | 215 | static void ccid2_start_rto_timer(struct sock *sk) |
216 | { | ||
217 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
218 | |||
219 | ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); | ||
220 | |||
221 | BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); | ||
222 | sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, | ||
223 | jiffies + hctx->ccid2hctx_rto); | ||
224 | } | ||
225 | |||
226 | static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) | ||
159 | { | 227 | { |
160 | struct dccp_sock *dp = dccp_sk(sk); | 228 | struct dccp_sock *dp = dccp_sk(sk); |
161 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 229 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
162 | struct ccid2_seq *next; | 230 | struct ccid2_seq *next; |
163 | 231 | ||
164 | hctx->pipe++; | 232 | hctx->ccid2hctx_pipe++; |
165 | 233 | ||
166 | hctx->seqh->ccid2s_seq = dp->dccps_gss; | 234 | hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; |
167 | hctx->seqh->ccid2s_acked = 0; | 235 | hctx->ccid2hctx_seqh->ccid2s_acked = 0; |
168 | hctx->seqh->ccid2s_sent = jiffies; | 236 | hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; |
169 | 237 | ||
170 | next = hctx->seqh->ccid2s_next; | 238 | next = hctx->ccid2hctx_seqh->ccid2s_next; |
171 | /* check if we need to alloc more space */ | 239 | /* check if we need to alloc more space */ |
172 | if (next == hctx->seqt) { | 240 | if (next == hctx->ccid2hctx_seqt) { |
173 | if (ccid2_hc_tx_alloc_seq(hctx)) { | 241 | if (ccid2_hc_tx_alloc_seq(hctx)) { |
174 | DCCP_CRIT("packet history - out of memory!"); | 242 | DCCP_CRIT("packet history - out of memory!"); |
175 | /* FIXME: find a more graceful way to bail out */ | 243 | /* FIXME: find a more graceful way to bail out */ |
176 | return; | 244 | return; |
177 | } | 245 | } |
178 | next = hctx->seqh->ccid2s_next; | 246 | next = hctx->ccid2hctx_seqh->ccid2s_next; |
179 | BUG_ON(next == hctx->seqt); | 247 | BUG_ON(next == hctx->ccid2hctx_seqt); |
180 | } | 248 | } |
181 | hctx->seqh = next; | 249 | hctx->ccid2hctx_seqh = next; |
182 | 250 | ||
183 | ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); | 251 | ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, |
252 | hctx->ccid2hctx_pipe); | ||
184 | 253 | ||
185 | /* | 254 | /* |
186 | * FIXME: The code below is broken and the variables have been removed | 255 | * FIXME: The code below is broken and the variables have been removed |
@@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) | |||
203 | */ | 272 | */ |
204 | #if 0 | 273 | #if 0 |
205 | /* Ack Ratio. Need to maintain a concept of how many windows we sent */ | 274 | /* Ack Ratio. Need to maintain a concept of how many windows we sent */ |
206 | hctx->arsent++; | 275 | hctx->ccid2hctx_arsent++; |
207 | /* We had an ack loss in this window... */ | 276 | /* We had an ack loss in this window... */ |
208 | if (hctx->ackloss) { | 277 | if (hctx->ccid2hctx_ackloss) { |
209 | if (hctx->arsent >= hctx->cwnd) { | 278 | if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { |
210 | hctx->arsent = 0; | 279 | hctx->ccid2hctx_arsent = 0; |
211 | hctx->ackloss = 0; | 280 | hctx->ccid2hctx_ackloss = 0; |
212 | } | 281 | } |
213 | } else { | 282 | } else { |
214 | /* No acks lost up to now... */ | 283 | /* No acks lost up to now... */ |
@@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) | |||
218 | int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - | 287 | int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - |
219 | dp->dccps_l_ack_ratio; | 288 | dp->dccps_l_ack_ratio; |
220 | 289 | ||
221 | denom = hctx->cwnd * hctx->cwnd / denom; | 290 | denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; |
222 | 291 | ||
223 | if (hctx->arsent >= denom) { | 292 | if (hctx->ccid2hctx_arsent >= denom) { |
224 | ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); | 293 | ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); |
225 | hctx->arsent = 0; | 294 | hctx->ccid2hctx_arsent = 0; |
226 | } | 295 | } |
227 | } else { | 296 | } else { |
228 | /* we can't increase ack ratio further [1] */ | 297 | /* we can't increase ack ratio further [1] */ |
229 | hctx->arsent = 0; /* or maybe set it to cwnd*/ | 298 | hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ |
230 | } | 299 | } |
231 | } | 300 | } |
232 | #endif | 301 | #endif |
233 | 302 | ||
234 | /* setup RTO timer */ | 303 | /* setup RTO timer */ |
235 | if (!timer_pending(&hctx->rtotimer)) | 304 | if (!timer_pending(&hctx->ccid2hctx_rtotimer)) |
236 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); | 305 | ccid2_start_rto_timer(sk); |
237 | 306 | ||
238 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 307 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
239 | do { | 308 | do { |
240 | struct ccid2_seq *seqp = hctx->seqt; | 309 | struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; |
241 | 310 | ||
242 | while (seqp != hctx->seqh) { | 311 | while (seqp != hctx->ccid2hctx_seqh) { |
243 | ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", | 312 | ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", |
244 | (unsigned long long)seqp->ccid2s_seq, | 313 | (unsigned long long)seqp->ccid2s_seq, |
245 | seqp->ccid2s_acked, seqp->ccid2s_sent); | 314 | seqp->ccid2s_acked, seqp->ccid2s_sent); |
@@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) | |||
247 | } | 316 | } |
248 | } while (0); | 317 | } while (0); |
249 | ccid2_pr_debug("=========\n"); | 318 | ccid2_pr_debug("=========\n"); |
319 | ccid2_hc_tx_check_sanity(hctx); | ||
250 | #endif | 320 | #endif |
251 | } | 321 | } |
252 | 322 | ||
253 | /** | 323 | /* XXX Lame code duplication! |
254 | * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm | 324 | * returns -1 if none was found. |
255 | * This code is almost identical with TCP's tcp_rtt_estimator(), since | 325 | * else returns the next offset to use in the function call. |
256 | * - it has a higher sampling frequency (recommended by RFC 1323), | ||
257 | * - the RTO does not collapse into RTT due to RTTVAR going towards zero, | ||
258 | * - it is simple (cf. more complex proposals such as Eifel timer or research | ||
259 | * which suggests that the gain should be set according to window size), | ||
260 | * - in tests it was found to work well with CCID2 [gerrit]. | ||
261 | */ | 326 | */ |
262 | static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) | 327 | static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, |
328 | unsigned char **vec, unsigned char *veclen) | ||
263 | { | 329 | { |
264 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 330 | const struct dccp_hdr *dh = dccp_hdr(skb); |
265 | long m = mrtt ? : 1; | 331 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); |
266 | 332 | unsigned char *opt_ptr; | |
267 | if (hctx->srtt == 0) { | 333 | const unsigned char *opt_end = (unsigned char *)dh + |
268 | /* First measurement m */ | 334 | (dh->dccph_doff * 4); |
269 | hctx->srtt = m << 3; | 335 | unsigned char opt, len; |
270 | hctx->mdev = m << 1; | 336 | unsigned char *value; |
271 | 337 | ||
272 | hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); | 338 | BUG_ON(offset < 0); |
273 | hctx->rttvar = hctx->mdev_max; | 339 | options += offset; |
274 | hctx->rtt_seq = dccp_sk(sk)->dccps_gss; | 340 | opt_ptr = options; |
275 | } else { | 341 | if (opt_ptr >= opt_end) |
276 | /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ | 342 | return -1; |
277 | m -= (hctx->srtt >> 3); | 343 | |
278 | hctx->srtt += m; | 344 | while (opt_ptr != opt_end) { |
279 | 345 | opt = *opt_ptr++; | |
280 | /* Similarly, update scaled mdev with regard to |m| */ | 346 | len = 0; |
281 | if (m < 0) { | 347 | value = NULL; |
282 | m = -m; | 348 | |
283 | m -= (hctx->mdev >> 2); | 349 | /* Check if this isn't a single byte option */ |
350 | if (opt > DCCPO_MAX_RESERVED) { | ||
351 | if (opt_ptr == opt_end) | ||
352 | goto out_invalid_option; | ||
353 | |||
354 | len = *opt_ptr++; | ||
355 | if (len < 3) | ||
356 | goto out_invalid_option; | ||
284 | /* | 357 | /* |
285 | * This neutralises RTO increase when RTT < SRTT - mdev | 358 | * Remove the type and len fields, leaving |
286 | * (see P. Sarolahti, A. Kuznetsov,"Congestion Control | 359 | * just the value size |
287 | * in Linux TCP", USENIX 2002, pp. 49-62). | ||
288 | */ | 360 | */ |
289 | if (m > 0) | 361 | len -= 2; |
290 | m >>= 3; | 362 | value = opt_ptr; |
291 | } else { | 363 | opt_ptr += len; |
292 | m -= (hctx->mdev >> 2); | ||
293 | } | ||
294 | hctx->mdev += m; | ||
295 | 364 | ||
296 | if (hctx->mdev > hctx->mdev_max) { | 365 | if (opt_ptr > opt_end) |
297 | hctx->mdev_max = hctx->mdev; | 366 | goto out_invalid_option; |
298 | if (hctx->mdev_max > hctx->rttvar) | ||
299 | hctx->rttvar = hctx->mdev_max; | ||
300 | } | 367 | } |
301 | 368 | ||
302 | /* | 369 | switch (opt) { |
303 | * Decay RTTVAR at most once per flight, exploiting that | 370 | case DCCPO_ACK_VECTOR_0: |
304 | * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) | 371 | case DCCPO_ACK_VECTOR_1: |
305 | * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) | 372 | *vec = value; |
306 | * GAR is a useful bound for FlightSize = pipe, AWL is probably | 373 | *veclen = len; |
307 | * too low as it over-estimates pipe. | 374 | return offset + (opt_ptr - options); |
308 | */ | ||
309 | if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { | ||
310 | if (hctx->mdev_max < hctx->rttvar) | ||
311 | hctx->rttvar -= (hctx->rttvar - | ||
312 | hctx->mdev_max) >> 2; | ||
313 | hctx->rtt_seq = dccp_sk(sk)->dccps_gss; | ||
314 | hctx->mdev_max = TCP_RTO_MIN; | ||
315 | } | 375 | } |
316 | } | 376 | } |
317 | 377 | ||
318 | /* | 378 | return -1; |
319 | * Set RTO from SRTT and RTTVAR | ||
320 | * Clock granularity is ignored since the minimum error for RTTVAR is | ||
321 | * clamped to 50msec (corresponding to HZ=20). This leads to a minimum | ||
322 | * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP | ||
323 | * does not retransmit data, DCCP does not require TCP's recommended | ||
324 | * minimum timeout of one second". | ||
325 | */ | ||
326 | hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; | ||
327 | 379 | ||
328 | if (hctx->rto > DCCP_RTO_MAX) | 380 | out_invalid_option: |
329 | hctx->rto = DCCP_RTO_MAX; | 381 | DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); |
382 | return -1; | ||
330 | } | 383 | } |
331 | 384 | ||
332 | static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, | 385 | static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) |
333 | unsigned int *maxincr) | ||
334 | { | 386 | { |
335 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 387 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
336 | 388 | ||
337 | if (hctx->cwnd < hctx->ssthresh) { | 389 | sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); |
338 | if (*maxincr > 0 && ++hctx->packets_acked == 2) { | 390 | ccid2_pr_debug("deleted RTO timer\n"); |
339 | hctx->cwnd += 1; | ||
340 | *maxincr -= 1; | ||
341 | hctx->packets_acked = 0; | ||
342 | } | ||
343 | } else if (++hctx->packets_acked >= hctx->cwnd) { | ||
344 | hctx->cwnd += 1; | ||
345 | hctx->packets_acked = 0; | ||
346 | } | ||
347 | /* | ||
348 | * FIXME: RTT is sampled several times per acknowledgment (for each | ||
349 | * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). | ||
350 | * This causes the RTT to be over-estimated, since the older entries | ||
351 | * in the Ack Vector have earlier sending times. | ||
352 | * The cleanest solution is to not use the ccid2s_sent field at all | ||
353 | * and instead use DCCP timestamps - need to be resolved at some time. | ||
354 | */ | ||
355 | ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); | ||
356 | } | 391 | } |
357 | 392 | ||
358 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) | 393 | static inline void ccid2_new_ack(struct sock *sk, |
394 | struct ccid2_seq *seqp, | ||
395 | unsigned int *maxincr) | ||
359 | { | 396 | { |
360 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 397 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
361 | 398 | ||
362 | if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { | 399 | if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { |
363 | ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); | 400 | if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { |
364 | return; | 401 | hctx->ccid2hctx_cwnd += 1; |
402 | *maxincr -= 1; | ||
403 | hctx->ccid2hctx_packets_acked = 0; | ||
404 | } | ||
405 | } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { | ||
406 | hctx->ccid2hctx_cwnd += 1; | ||
407 | hctx->ccid2hctx_packets_acked = 0; | ||
365 | } | 408 | } |
366 | 409 | ||
367 | hctx->last_cong = jiffies; | 410 | /* update RTO */ |
411 | if (hctx->ccid2hctx_srtt == -1 || | ||
412 | time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { | ||
413 | unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; | ||
414 | int s; | ||
415 | |||
416 | /* first measurement */ | ||
417 | if (hctx->ccid2hctx_srtt == -1) { | ||
418 | ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", | ||
419 | r, jiffies, | ||
420 | (unsigned long long)seqp->ccid2s_seq); | ||
421 | ccid2_change_srtt(hctx, r); | ||
422 | hctx->ccid2hctx_rttvar = r >> 1; | ||
423 | } else { | ||
424 | /* RTTVAR */ | ||
425 | long tmp = hctx->ccid2hctx_srtt - r; | ||
426 | long srtt; | ||
427 | |||
428 | if (tmp < 0) | ||
429 | tmp *= -1; | ||
430 | |||
431 | tmp >>= 2; | ||
432 | hctx->ccid2hctx_rttvar *= 3; | ||
433 | hctx->ccid2hctx_rttvar >>= 2; | ||
434 | hctx->ccid2hctx_rttvar += tmp; | ||
435 | |||
436 | /* SRTT */ | ||
437 | srtt = hctx->ccid2hctx_srtt; | ||
438 | srtt *= 7; | ||
439 | srtt >>= 3; | ||
440 | tmp = r >> 3; | ||
441 | srtt += tmp; | ||
442 | ccid2_change_srtt(hctx, srtt); | ||
443 | } | ||
444 | s = hctx->ccid2hctx_rttvar << 2; | ||
445 | /* clock granularity is 1 when based on jiffies */ | ||
446 | if (!s) | ||
447 | s = 1; | ||
448 | hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; | ||
449 | |||
450 | /* must be at least a second */ | ||
451 | s = hctx->ccid2hctx_rto / HZ; | ||
452 | /* DCCP doesn't require this [but I like it cuz my code sux] */ | ||
453 | #if 1 | ||
454 | if (s < 1) | ||
455 | hctx->ccid2hctx_rto = HZ; | ||
456 | #endif | ||
457 | /* max 60 seconds */ | ||
458 | if (s > 60) | ||
459 | hctx->ccid2hctx_rto = HZ * 60; | ||
368 | 460 | ||
369 | hctx->cwnd = hctx->cwnd / 2 ? : 1U; | 461 | hctx->ccid2hctx_lastrtt = jiffies; |
370 | hctx->ssthresh = max(hctx->cwnd, 2U); | ||
371 | 462 | ||
372 | /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ | 463 | ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", |
373 | if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) | 464 | hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, |
374 | ccid2_change_l_ack_ratio(sk, hctx->cwnd); | 465 | hctx->ccid2hctx_rto, HZ, r); |
466 | } | ||
467 | |||
468 | /* we got a new ack, so re-start RTO timer */ | ||
469 | ccid2_hc_tx_kill_rto_timer(sk); | ||
470 | ccid2_start_rto_timer(sk); | ||
375 | } | 471 | } |
376 | 472 | ||
377 | static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, | 473 | static void ccid2_hc_tx_dec_pipe(struct sock *sk) |
378 | u8 option, u8 *optval, u8 optlen) | ||
379 | { | 474 | { |
380 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 475 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
381 | 476 | ||
382 | switch (option) { | 477 | if (hctx->ccid2hctx_pipe == 0) |
383 | case DCCPO_ACK_VECTOR_0: | 478 | DCCP_BUG("pipe == 0"); |
384 | case DCCPO_ACK_VECTOR_1: | 479 | else |
385 | return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, | 480 | hctx->ccid2hctx_pipe--; |
386 | option - DCCPO_ACK_VECTOR_0); | 481 | |
482 | if (hctx->ccid2hctx_pipe == 0) | ||
483 | ccid2_hc_tx_kill_rto_timer(sk); | ||
484 | } | ||
485 | |||
486 | static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) | ||
487 | { | ||
488 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | ||
489 | |||
490 | if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { | ||
491 | ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); | ||
492 | return; | ||
387 | } | 493 | } |
388 | return 0; | 494 | |
495 | hctx->ccid2hctx_last_cong = jiffies; | ||
496 | |||
497 | hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; | ||
498 | hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); | ||
499 | |||
500 | /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ | ||
501 | if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) | ||
502 | ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); | ||
389 | } | 503 | } |
390 | 504 | ||
391 | static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | 505 | static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) |
392 | { | 506 | { |
393 | struct dccp_sock *dp = dccp_sk(sk); | 507 | struct dccp_sock *dp = dccp_sk(sk); |
394 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 508 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
395 | const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); | ||
396 | struct dccp_ackvec_parsed *avp; | ||
397 | u64 ackno, seqno; | 509 | u64 ackno, seqno; |
398 | struct ccid2_seq *seqp; | 510 | struct ccid2_seq *seqp; |
511 | unsigned char *vector; | ||
512 | unsigned char veclen; | ||
513 | int offset = 0; | ||
399 | int done = 0; | 514 | int done = 0; |
400 | unsigned int maxincr = 0; | 515 | unsigned int maxincr = 0; |
401 | 516 | ||
517 | ccid2_hc_tx_check_sanity(hctx); | ||
402 | /* check reverse path congestion */ | 518 | /* check reverse path congestion */ |
403 | seqno = DCCP_SKB_CB(skb)->dccpd_seq; | 519 | seqno = DCCP_SKB_CB(skb)->dccpd_seq; |
404 | 520 | ||
@@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
407 | * -sorbo. | 523 | * -sorbo. |
408 | */ | 524 | */ |
409 | /* need to bootstrap */ | 525 | /* need to bootstrap */ |
410 | if (hctx->rpdupack == -1) { | 526 | if (hctx->ccid2hctx_rpdupack == -1) { |
411 | hctx->rpdupack = 0; | 527 | hctx->ccid2hctx_rpdupack = 0; |
412 | hctx->rpseq = seqno; | 528 | hctx->ccid2hctx_rpseq = seqno; |
413 | } else { | 529 | } else { |
414 | /* check if packet is consecutive */ | 530 | /* check if packet is consecutive */ |
415 | if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) | 531 | if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) |
416 | hctx->rpseq = seqno; | 532 | hctx->ccid2hctx_rpseq = seqno; |
417 | /* it's a later packet */ | 533 | /* it's a later packet */ |
418 | else if (after48(seqno, hctx->rpseq)) { | 534 | else if (after48(seqno, hctx->ccid2hctx_rpseq)) { |
419 | hctx->rpdupack++; | 535 | hctx->ccid2hctx_rpdupack++; |
420 | 536 | ||
421 | /* check if we got enough dupacks */ | 537 | /* check if we got enough dupacks */ |
422 | if (hctx->rpdupack >= NUMDUPACK) { | 538 | if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { |
423 | hctx->rpdupack = -1; /* XXX lame */ | 539 | hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ |
424 | hctx->rpseq = 0; | 540 | hctx->ccid2hctx_rpseq = 0; |
425 | 541 | ||
426 | ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); | 542 | ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); |
427 | } | 543 | } |
@@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
429 | } | 545 | } |
430 | 546 | ||
431 | /* check forward path congestion */ | 547 | /* check forward path congestion */ |
432 | if (dccp_packet_without_ack(skb)) | 548 | /* still didn't send out new data packets */ |
549 | if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) | ||
433 | return; | 550 | return; |
434 | 551 | ||
435 | /* still didn't send out new data packets */ | 552 | switch (DCCP_SKB_CB(skb)->dccpd_type) { |
436 | if (hctx->seqh == hctx->seqt) | 553 | case DCCP_PKT_ACK: |
437 | goto done; | 554 | case DCCP_PKT_DATAACK: |
555 | break; | ||
556 | default: | ||
557 | return; | ||
558 | } | ||
438 | 559 | ||
439 | ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; | 560 | ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; |
440 | if (after48(ackno, hctx->high_ack)) | 561 | if (after48(ackno, hctx->ccid2hctx_high_ack)) |
441 | hctx->high_ack = ackno; | 562 | hctx->ccid2hctx_high_ack = ackno; |
442 | 563 | ||
443 | seqp = hctx->seqt; | 564 | seqp = hctx->ccid2hctx_seqt; |
444 | while (before48(seqp->ccid2s_seq, ackno)) { | 565 | while (before48(seqp->ccid2s_seq, ackno)) { |
445 | seqp = seqp->ccid2s_next; | 566 | seqp = seqp->ccid2s_next; |
446 | if (seqp == hctx->seqh) { | 567 | if (seqp == hctx->ccid2hctx_seqh) { |
447 | seqp = hctx->seqh->ccid2s_prev; | 568 | seqp = hctx->ccid2hctx_seqh->ccid2s_prev; |
448 | break; | 569 | break; |
449 | } | 570 | } |
450 | } | 571 | } |
@@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
454 | * packets per acknowledgement. Rounding up avoids that cwnd is not | 575 | * packets per acknowledgement. Rounding up avoids that cwnd is not |
455 | * advanced when Ack Ratio is 1 and gives a slight edge otherwise. | 576 | * advanced when Ack Ratio is 1 and gives a slight edge otherwise. |
456 | */ | 577 | */ |
457 | if (hctx->cwnd < hctx->ssthresh) | 578 | if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) |
458 | maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); | 579 | maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); |
459 | 580 | ||
460 | /* go through all ack vectors */ | 581 | /* go through all ack vectors */ |
461 | list_for_each_entry(avp, &hctx->av_chunks, node) { | 582 | while ((offset = ccid2_ackvector(sk, skb, offset, |
583 | &vector, &veclen)) != -1) { | ||
462 | /* go through this ack vector */ | 584 | /* go through this ack vector */ |
463 | for (; avp->len--; avp->vec++) { | 585 | while (veclen--) { |
464 | u64 ackno_end_rl = SUB48(ackno, | 586 | const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; |
465 | dccp_ackvec_runlen(avp->vec)); | 587 | u64 ackno_end_rl = SUB48(ackno, rl); |
466 | 588 | ||
467 | ccid2_pr_debug("ackvec %llu |%u,%u|\n", | 589 | ccid2_pr_debug("ackvec start:%llu end:%llu\n", |
468 | (unsigned long long)ackno, | 590 | (unsigned long long)ackno, |
469 | dccp_ackvec_state(avp->vec) >> 6, | 591 | (unsigned long long)ackno_end_rl); |
470 | dccp_ackvec_runlen(avp->vec)); | ||
471 | /* if the seqno we are analyzing is larger than the | 592 | /* if the seqno we are analyzing is larger than the |
472 | * current ackno, then move towards the tail of our | 593 | * current ackno, then move towards the tail of our |
473 | * seqnos. | 594 | * seqnos. |
474 | */ | 595 | */ |
475 | while (after48(seqp->ccid2s_seq, ackno)) { | 596 | while (after48(seqp->ccid2s_seq, ackno)) { |
476 | if (seqp == hctx->seqt) { | 597 | if (seqp == hctx->ccid2hctx_seqt) { |
477 | done = 1; | 598 | done = 1; |
478 | break; | 599 | break; |
479 | } | 600 | } |
@@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
486 | * run length | 607 | * run length |
487 | */ | 608 | */ |
488 | while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { | 609 | while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { |
489 | const u8 state = dccp_ackvec_state(avp->vec); | 610 | const u8 state = *vector & |
611 | DCCP_ACKVEC_STATE_MASK; | ||
490 | 612 | ||
491 | /* new packet received or marked */ | 613 | /* new packet received or marked */ |
492 | if (state != DCCPAV_NOT_RECEIVED && | 614 | if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && |
493 | !seqp->ccid2s_acked) { | 615 | !seqp->ccid2s_acked) { |
494 | if (state == DCCPAV_ECN_MARKED) | 616 | if (state == |
617 | DCCP_ACKVEC_STATE_ECN_MARKED) { | ||
495 | ccid2_congestion_event(sk, | 618 | ccid2_congestion_event(sk, |
496 | seqp); | 619 | seqp); |
497 | else | 620 | } else |
498 | ccid2_new_ack(sk, seqp, | 621 | ccid2_new_ack(sk, seqp, |
499 | &maxincr); | 622 | &maxincr); |
500 | 623 | ||
501 | seqp->ccid2s_acked = 1; | 624 | seqp->ccid2s_acked = 1; |
502 | ccid2_pr_debug("Got ack for %llu\n", | 625 | ccid2_pr_debug("Got ack for %llu\n", |
503 | (unsigned long long)seqp->ccid2s_seq); | 626 | (unsigned long long)seqp->ccid2s_seq); |
504 | hctx->pipe--; | 627 | ccid2_hc_tx_dec_pipe(sk); |
505 | } | 628 | } |
506 | if (seqp == hctx->seqt) { | 629 | if (seqp == hctx->ccid2hctx_seqt) { |
507 | done = 1; | 630 | done = 1; |
508 | break; | 631 | break; |
509 | } | 632 | } |
@@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
513 | break; | 636 | break; |
514 | 637 | ||
515 | ackno = SUB48(ackno_end_rl, 1); | 638 | ackno = SUB48(ackno_end_rl, 1); |
639 | vector++; | ||
516 | } | 640 | } |
517 | if (done) | 641 | if (done) |
518 | break; | 642 | break; |
@@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
521 | /* The state about what is acked should be correct now | 645 | /* The state about what is acked should be correct now |
522 | * Check for NUMDUPACK | 646 | * Check for NUMDUPACK |
523 | */ | 647 | */ |
524 | seqp = hctx->seqt; | 648 | seqp = hctx->ccid2hctx_seqt; |
525 | while (before48(seqp->ccid2s_seq, hctx->high_ack)) { | 649 | while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { |
526 | seqp = seqp->ccid2s_next; | 650 | seqp = seqp->ccid2s_next; |
527 | if (seqp == hctx->seqh) { | 651 | if (seqp == hctx->ccid2hctx_seqh) { |
528 | seqp = hctx->seqh->ccid2s_prev; | 652 | seqp = hctx->ccid2hctx_seqh->ccid2s_prev; |
529 | break; | 653 | break; |
530 | } | 654 | } |
531 | } | 655 | } |
@@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
536 | if (done == NUMDUPACK) | 660 | if (done == NUMDUPACK) |
537 | break; | 661 | break; |
538 | } | 662 | } |
539 | if (seqp == hctx->seqt) | 663 | if (seqp == hctx->ccid2hctx_seqt) |
540 | break; | 664 | break; |
541 | seqp = seqp->ccid2s_prev; | 665 | seqp = seqp->ccid2s_prev; |
542 | } | 666 | } |
@@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
557 | * one ack vector. | 681 | * one ack vector. |
558 | */ | 682 | */ |
559 | ccid2_congestion_event(sk, seqp); | 683 | ccid2_congestion_event(sk, seqp); |
560 | hctx->pipe--; | 684 | ccid2_hc_tx_dec_pipe(sk); |
561 | } | 685 | } |
562 | if (seqp == hctx->seqt) | 686 | if (seqp == hctx->ccid2hctx_seqt) |
563 | break; | 687 | break; |
564 | seqp = seqp->ccid2s_prev; | 688 | seqp = seqp->ccid2s_prev; |
565 | } | 689 | } |
566 | 690 | ||
567 | hctx->seqt = last_acked; | 691 | hctx->ccid2hctx_seqt = last_acked; |
568 | } | 692 | } |
569 | 693 | ||
570 | /* trim acked packets in tail */ | 694 | /* trim acked packets in tail */ |
571 | while (hctx->seqt != hctx->seqh) { | 695 | while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { |
572 | if (!hctx->seqt->ccid2s_acked) | 696 | if (!hctx->ccid2hctx_seqt->ccid2s_acked) |
573 | break; | 697 | break; |
574 | 698 | ||
575 | hctx->seqt = hctx->seqt->ccid2s_next; | 699 | hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; |
576 | } | 700 | } |
577 | 701 | ||
578 | /* restart RTO timer if not all outstanding data has been acked */ | 702 | ccid2_hc_tx_check_sanity(hctx); |
579 | if (hctx->pipe == 0) | ||
580 | sk_stop_timer(sk, &hctx->rtotimer); | ||
581 | else | ||
582 | sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); | ||
583 | done: | ||
584 | /* check if incoming Acks allow pending packets to be sent */ | ||
585 | if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) | ||
586 | tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); | ||
587 | dccp_ackvec_parsed_cleanup(&hctx->av_chunks); | ||
588 | } | 703 | } |
589 | 704 | ||
590 | static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | 705 | static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) |
@@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | |||
594 | u32 max_ratio; | 709 | u32 max_ratio; |
595 | 710 | ||
596 | /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ | 711 | /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ |
597 | hctx->ssthresh = ~0U; | 712 | hctx->ccid2hctx_ssthresh = ~0U; |
598 | 713 | ||
599 | /* Use larger initial windows (RFC 3390, rfc2581bis) */ | 714 | /* |
600 | hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); | 715 | * RFC 4341, 5: "The cwnd parameter is initialized to at most four |
716 | * packets for new connections, following the rules from [RFC3390]". | ||
717 | * We need to convert the bytes of RFC3390 into the packets of RFC 4341. | ||
718 | */ | ||
719 | hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); | ||
601 | 720 | ||
602 | /* Make sure that Ack Ratio is enabled and within bounds. */ | 721 | /* Make sure that Ack Ratio is enabled and within bounds. */ |
603 | max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); | 722 | max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); |
604 | if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) | 723 | if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) |
605 | dp->dccps_l_ack_ratio = max_ratio; | 724 | dp->dccps_l_ack_ratio = max_ratio; |
606 | 725 | ||
@@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) | |||
608 | if (ccid2_hc_tx_alloc_seq(hctx)) | 727 | if (ccid2_hc_tx_alloc_seq(hctx)) |
609 | return -ENOMEM; | 728 | return -ENOMEM; |
610 | 729 | ||
611 | hctx->rto = DCCP_TIMEOUT_INIT; | 730 | hctx->ccid2hctx_rto = 3 * HZ; |
612 | hctx->rpdupack = -1; | 731 | ccid2_change_srtt(hctx, -1); |
613 | hctx->last_cong = jiffies; | 732 | hctx->ccid2hctx_rttvar = -1; |
614 | setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); | 733 | hctx->ccid2hctx_rpdupack = -1; |
615 | INIT_LIST_HEAD(&hctx->av_chunks); | 734 | hctx->ccid2hctx_last_cong = jiffies; |
735 | setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, | ||
736 | (unsigned long)sk); | ||
737 | |||
738 | ccid2_hc_tx_check_sanity(hctx); | ||
616 | return 0; | 739 | return 0; |
617 | } | 740 | } |
618 | 741 | ||
@@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) | |||
621 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); | 744 | struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); |
622 | int i; | 745 | int i; |
623 | 746 | ||
624 | sk_stop_timer(sk, &hctx->rtotimer); | 747 | ccid2_hc_tx_kill_rto_timer(sk); |
625 | 748 | ||
626 | for (i = 0; i < hctx->seqbufc; i++) | 749 | for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) |
627 | kfree(hctx->seqbuf[i]); | 750 | kfree(hctx->ccid2hctx_seqbuf[i]); |
628 | hctx->seqbufc = 0; | 751 | hctx->ccid2hctx_seqbufc = 0; |
629 | } | 752 | } |
630 | 753 | ||
631 | static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | 754 | static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) |
@@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
636 | switch (DCCP_SKB_CB(skb)->dccpd_type) { | 759 | switch (DCCP_SKB_CB(skb)->dccpd_type) { |
637 | case DCCP_PKT_DATA: | 760 | case DCCP_PKT_DATA: |
638 | case DCCP_PKT_DATAACK: | 761 | case DCCP_PKT_DATAACK: |
639 | hcrx->data++; | 762 | hcrx->ccid2hcrx_data++; |
640 | if (hcrx->data >= dp->dccps_r_ack_ratio) { | 763 | if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { |
641 | dccp_send_ack(sk); | 764 | dccp_send_ack(sk); |
642 | hcrx->data = 0; | 765 | hcrx->ccid2hcrx_data = 0; |
643 | } | 766 | } |
644 | break; | 767 | break; |
645 | } | 768 | } |
646 | } | 769 | } |
647 | 770 | ||
648 | static struct ccid_operations ccid2 = { | 771 | static struct ccid_operations ccid2 = { |
649 | .ccid_id = DCCPC_CCID2, | 772 | .ccid_id = DCCPC_CCID2, |
650 | .ccid_name = "TCP-like", | 773 | .ccid_name = "TCP-like", |
651 | .ccid_owner = THIS_MODULE, | 774 | .ccid_owner = THIS_MODULE, |
652 | .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), | 775 | .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), |
653 | .ccid_hc_tx_init = ccid2_hc_tx_init, | 776 | .ccid_hc_tx_init = ccid2_hc_tx_init, |
654 | .ccid_hc_tx_exit = ccid2_hc_tx_exit, | 777 | .ccid_hc_tx_exit = ccid2_hc_tx_exit, |
655 | .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, | 778 | .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, |
656 | .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, | 779 | .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, |
657 | .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, | 780 | .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, |
658 | .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, | 781 | .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), |
659 | .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), | 782 | .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, |
660 | .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, | ||
661 | }; | 783 | }; |
662 | 784 | ||
663 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG | 785 | #ifdef CONFIG_IP_DCCP_CCID2_DEBUG |
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 8b7a2dee2f6d..2c94ca029010 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h | |||
@@ -42,49 +42,34 @@ struct ccid2_seq { | |||
42 | 42 | ||
43 | /** struct ccid2_hc_tx_sock - CCID2 TX half connection | 43 | /** struct ccid2_hc_tx_sock - CCID2 TX half connection |
44 | * | 44 | * |
45 | * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 | 45 | * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 |
46 | * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) | 46 | * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) |
47 | * @srtt: smoothed RTT estimate, scaled by 2^3 | 47 | * @ccid2hctx_lastrtt -time RTT was last measured |
48 | * @mdev: smoothed RTT variation, scaled by 2^2 | 48 | * @ccid2hctx_rpseq - last consecutive seqno |
49 | * @mdev_max: maximum of @mdev during one flight | 49 | * @ccid2hctx_rpdupack - dupacks since rpseq |
50 | * @rttvar: moving average/maximum of @mdev_max | 50 | */ |
51 | * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) | ||
52 | * @rtt_seq: to decay RTTVAR at most once per flight | ||
53 | * @rpseq: last consecutive seqno | ||
54 | * @rpdupack: dupacks since rpseq | ||
55 | * @av_chunks: list of Ack Vectors received on current skb | ||
56 | */ | ||
57 | struct ccid2_hc_tx_sock { | 51 | struct ccid2_hc_tx_sock { |
58 | u32 cwnd; | 52 | u32 ccid2hctx_cwnd; |
59 | u32 ssthresh; | 53 | u32 ccid2hctx_ssthresh; |
60 | u32 pipe; | 54 | u32 ccid2hctx_pipe; |
61 | u32 packets_acked; | 55 | u32 ccid2hctx_packets_acked; |
62 | struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; | 56 | struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; |
63 | int seqbufc; | 57 | int ccid2hctx_seqbufc; |
64 | struct ccid2_seq *seqh; | 58 | struct ccid2_seq *ccid2hctx_seqh; |
65 | struct ccid2_seq *seqt; | 59 | struct ccid2_seq *ccid2hctx_seqt; |
66 | /* RTT measurement: variables/principles are the same as in TCP */ | 60 | long ccid2hctx_rto; |
67 | u32 srtt, | 61 | long ccid2hctx_srtt; |
68 | mdev, | 62 | long ccid2hctx_rttvar; |
69 | mdev_max, | 63 | unsigned long ccid2hctx_lastrtt; |
70 | rttvar, | 64 | struct timer_list ccid2hctx_rtotimer; |
71 | rto; | 65 | u64 ccid2hctx_rpseq; |
72 | u64 rtt_seq:48; | 66 | int ccid2hctx_rpdupack; |
73 | struct timer_list rtotimer; | 67 | unsigned long ccid2hctx_last_cong; |
74 | u64 rpseq; | 68 | u64 ccid2hctx_high_ack; |
75 | int rpdupack; | ||
76 | unsigned long last_cong; | ||
77 | u64 high_ack; | ||
78 | struct list_head av_chunks; | ||
79 | }; | 69 | }; |
80 | 70 | ||
81 | static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) | ||
82 | { | ||
83 | return (hctx->pipe >= hctx->cwnd); | ||
84 | } | ||
85 | |||
86 | struct ccid2_hc_rx_sock { | 71 | struct ccid2_hc_rx_sock { |
87 | int data; | 72 | int ccid2hcrx_data; |
88 | }; | 73 | }; |
89 | 74 | ||
90 | static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) | 75 | static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) |
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 06cfdad84a6a..3b8bd7ca6761 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c | |||
@@ -49,41 +49,75 @@ static int ccid3_debug; | |||
49 | /* | 49 | /* |
50 | * Transmitter Half-Connection Routines | 50 | * Transmitter Half-Connection Routines |
51 | */ | 51 | */ |
52 | /* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ | 52 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG |
53 | static int do_osc_prev = true; | 53 | static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) |
54 | { | ||
55 | static char *ccid3_state_names[] = { | ||
56 | [TFRC_SSTATE_NO_SENT] = "NO_SENT", | ||
57 | [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", | ||
58 | [TFRC_SSTATE_FBACK] = "FBACK", | ||
59 | [TFRC_SSTATE_TERM] = "TERM", | ||
60 | }; | ||
61 | |||
62 | return ccid3_state_names[state]; | ||
63 | } | ||
64 | #endif | ||
65 | |||
66 | static void ccid3_hc_tx_set_state(struct sock *sk, | ||
67 | enum ccid3_hc_tx_states state) | ||
68 | { | ||
69 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | ||
70 | enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; | ||
71 | |||
72 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
73 | dccp_role(sk), sk, ccid3_tx_state_name(oldstate), | ||
74 | ccid3_tx_state_name(state)); | ||
75 | WARN_ON(state == oldstate); | ||
76 | hctx->ccid3hctx_state = state; | ||
77 | } | ||
54 | 78 | ||
55 | /* | 79 | /* |
56 | * Compute the initial sending rate X_init in the manner of RFC 3390: | 80 | * Compute the initial sending rate X_init in the manner of RFC 3390: |
57 | * | 81 | * |
58 | * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT | 82 | * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT |
59 | * | 83 | * |
84 | * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis | ||
85 | * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. | ||
60 | * For consistency with other parts of the code, X_init is scaled by 2^6. | 86 | * For consistency with other parts of the code, X_init is scaled by 2^6. |
61 | */ | 87 | */ |
62 | static inline u64 rfc3390_initial_rate(struct sock *sk) | 88 | static inline u64 rfc3390_initial_rate(struct sock *sk) |
63 | { | 89 | { |
64 | const u32 mps = dccp_sk(sk)->dccps_mss_cache, | 90 | const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
65 | w_init = clamp(4380U, 2 * mps, 4 * mps); | 91 | const __u32 w_init = clamp_t(__u32, 4380U, |
92 | 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); | ||
66 | 93 | ||
67 | return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); | 94 | return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); |
68 | } | 95 | } |
69 | 96 | ||
70 | /** | 97 | /* |
71 | * ccid3_update_send_interval - Calculate new t_ipi = s / X | 98 | * Recalculate t_ipi and delta (should be called whenever X changes) |
72 | * This respects the granularity of X (64 * bytes/second) and enforces the | ||
73 | * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. | ||
74 | */ | 99 | */ |
75 | static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) | 100 | static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) |
76 | { | 101 | { |
77 | if (unlikely(hctx->x <= hctx->s)) | 102 | /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ |
78 | hctx->x = hctx->s; | 103 | hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, |
79 | hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); | 104 | hctx->ccid3hctx_x); |
105 | |||
106 | /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ | ||
107 | hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, | ||
108 | TFRC_OPSYS_HALF_TIME_GRAN); | ||
109 | |||
110 | ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", | ||
111 | hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, | ||
112 | hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); | ||
113 | |||
80 | } | 114 | } |
81 | 115 | ||
82 | static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) | 116 | static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) |
83 | { | 117 | { |
84 | u32 delta = ktime_us_delta(now, hctx->t_last_win_count); | 118 | u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); |
85 | 119 | ||
86 | return delta / hctx->rtt; | 120 | return delta / hctx->ccid3hctx_rtt; |
87 | } | 121 | } |
88 | 122 | ||
89 | /** | 123 | /** |
@@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) | |||
99 | static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) | 133 | static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) |
100 | { | 134 | { |
101 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 135 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
102 | u64 min_rate = 2 * hctx->x_recv; | 136 | __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; |
103 | const u64 old_x = hctx->x; | 137 | const __u64 old_x = hctx->ccid3hctx_x; |
104 | ktime_t now = stamp ? *stamp : ktime_get_real(); | 138 | ktime_t now = stamp ? *stamp : ktime_get_real(); |
105 | 139 | ||
106 | /* | 140 | /* |
@@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) | |||
111 | */ | 145 | */ |
112 | if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { | 146 | if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { |
113 | min_rate = rfc3390_initial_rate(sk); | 147 | min_rate = rfc3390_initial_rate(sk); |
114 | min_rate = max(min_rate, 2 * hctx->x_recv); | 148 | min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); |
115 | } | 149 | } |
116 | 150 | ||
117 | if (hctx->p > 0) { | 151 | if (hctx->ccid3hctx_p > 0) { |
118 | 152 | ||
119 | hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); | 153 | hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, |
154 | min_rate); | ||
155 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, | ||
156 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
157 | TFRC_T_MBI); | ||
120 | 158 | ||
121 | } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { | 159 | } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) |
160 | - (s64)hctx->ccid3hctx_rtt >= 0) { | ||
122 | 161 | ||
123 | hctx->x = min(2 * hctx->x, min_rate); | 162 | hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); |
124 | hctx->x = max(hctx->x, | 163 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, |
125 | scaled_div(((u64)hctx->s) << 6, hctx->rtt)); | 164 | scaled_div(((__u64)hctx->ccid3hctx_s) << 6, |
126 | hctx->t_ld = now; | 165 | hctx->ccid3hctx_rtt)); |
166 | hctx->ccid3hctx_t_ld = now; | ||
127 | } | 167 | } |
128 | 168 | ||
129 | if (hctx->x != old_x) { | 169 | if (hctx->ccid3hctx_x != old_x) { |
130 | ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " | 170 | ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " |
131 | "X_recv=%u\n", (unsigned)(old_x >> 6), | 171 | "X_recv=%u\n", (unsigned)(old_x >> 6), |
132 | (unsigned)(hctx->x >> 6), hctx->x_calc, | 172 | (unsigned)(hctx->ccid3hctx_x >> 6), |
133 | (unsigned)(hctx->x_recv >> 6)); | 173 | hctx->ccid3hctx_x_calc, |
174 | (unsigned)(hctx->ccid3hctx_x_recv >> 6)); | ||
134 | 175 | ||
135 | ccid3_update_send_interval(hctx); | 176 | ccid3_update_send_interval(hctx); |
136 | } | 177 | } |
137 | } | 178 | } |
138 | 179 | ||
139 | /* | 180 | /* |
140 | * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) | 181 | * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) |
141 | * @new_len: DCCP payload size in bytes (not used by all methods) | 182 | * @len: DCCP packet payload size in bytes |
142 | */ | 183 | */ |
143 | static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) | 184 | static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) |
144 | { | 185 | { |
145 | #if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) | 186 | const u16 old_s = hctx->ccid3hctx_s; |
146 | return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); | 187 | |
147 | #elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) | 188 | hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); |
148 | return max(ccid3_hc_tx_sk(sk)->s, new_len); | 189 | |
149 | #else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ | 190 | if (hctx->ccid3hctx_s != old_s) |
150 | return dccp_sk(sk)->dccps_mss_cache; | 191 | ccid3_update_send_interval(hctx); |
151 | #endif | ||
152 | } | 192 | } |
153 | 193 | ||
154 | /* | 194 | /* |
@@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) | |||
158 | static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, | 198 | static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, |
159 | ktime_t now) | 199 | ktime_t now) |
160 | { | 200 | { |
161 | u32 delta = ktime_us_delta(now, hctx->t_last_win_count), | 201 | u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), |
162 | quarter_rtts = (4 * delta) / hctx->rtt; | 202 | quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; |
163 | 203 | ||
164 | if (quarter_rtts > 0) { | 204 | if (quarter_rtts > 0) { |
165 | hctx->t_last_win_count = now; | 205 | hctx->ccid3hctx_t_last_win_count = now; |
166 | hctx->last_win_count += min(quarter_rtts, 5U); | 206 | hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); |
167 | hctx->last_win_count &= 0xF; /* mod 16 */ | 207 | hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ |
168 | } | 208 | } |
169 | } | 209 | } |
170 | 210 | ||
@@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
181 | goto restart_timer; | 221 | goto restart_timer; |
182 | } | 222 | } |
183 | 223 | ||
184 | ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, | 224 | ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, |
185 | hctx->feedback ? "" : "out"); | 225 | ccid3_tx_state_name(hctx->ccid3hctx_state)); |
186 | 226 | ||
187 | /* Ignore and do not restart after leaving the established state */ | 227 | if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) |
188 | if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) | 228 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); |
229 | else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) | ||
189 | goto out; | 230 | goto out; |
190 | 231 | ||
191 | /* Reset feedback state to "no feedback received" */ | ||
192 | hctx->feedback = false; | ||
193 | |||
194 | /* | 232 | /* |
195 | * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 | 233 | * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 |
196 | * RTO is 0 if and only if no feedback has been received yet. | ||
197 | */ | 234 | */ |
198 | if (hctx->t_rto == 0 || hctx->p == 0) { | 235 | if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ |
236 | hctx->ccid3hctx_p == 0) { | ||
199 | 237 | ||
200 | /* halve send rate directly */ | 238 | /* halve send rate directly */ |
201 | hctx->x /= 2; | 239 | hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, |
240 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
241 | TFRC_T_MBI); | ||
202 | ccid3_update_send_interval(hctx); | 242 | ccid3_update_send_interval(hctx); |
203 | |||
204 | } else { | 243 | } else { |
205 | /* | 244 | /* |
206 | * Modify the cached value of X_recv | 245 | * Modify the cached value of X_recv |
@@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
212 | * | 251 | * |
213 | * Note that X_recv is scaled by 2^6 while X_calc is not | 252 | * Note that X_recv is scaled by 2^6 while X_calc is not |
214 | */ | 253 | */ |
215 | BUG_ON(hctx->p && !hctx->x_calc); | 254 | BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); |
216 | 255 | ||
217 | if (hctx->x_calc > (hctx->x_recv >> 5)) | 256 | if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) |
218 | hctx->x_recv /= 2; | 257 | hctx->ccid3hctx_x_recv = |
258 | max(hctx->ccid3hctx_x_recv / 2, | ||
259 | (((__u64)hctx->ccid3hctx_s) << 6) / | ||
260 | (2 * TFRC_T_MBI)); | ||
219 | else { | 261 | else { |
220 | hctx->x_recv = hctx->x_calc; | 262 | hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; |
221 | hctx->x_recv <<= 4; | 263 | hctx->ccid3hctx_x_recv <<= 4; |
222 | } | 264 | } |
223 | ccid3_hc_tx_update_x(sk, NULL); | 265 | ccid3_hc_tx_update_x(sk, NULL); |
224 | } | 266 | } |
225 | ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", | 267 | ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", |
226 | (unsigned long long)hctx->x); | 268 | (unsigned long long)hctx->ccid3hctx_x); |
227 | 269 | ||
228 | /* | 270 | /* |
229 | * Set new timeout for the nofeedback timer. | 271 | * Set new timeout for the nofeedback timer. |
230 | * See comments in packet_recv() regarding the value of t_RTO. | 272 | * See comments in packet_recv() regarding the value of t_RTO. |
231 | */ | 273 | */ |
232 | if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ | 274 | if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ |
233 | t_nfb = TFRC_INITIAL_TIMEOUT; | 275 | t_nfb = TFRC_INITIAL_TIMEOUT; |
234 | else | 276 | else |
235 | t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); | 277 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); |
236 | 278 | ||
237 | restart_timer: | 279 | restart_timer: |
238 | sk_reset_timer(sk, &hctx->no_feedback_timer, | 280 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, |
239 | jiffies + usecs_to_jiffies(t_nfb)); | 281 | jiffies + usecs_to_jiffies(t_nfb)); |
240 | out: | 282 | out: |
241 | bh_unlock_sock(sk); | 283 | bh_unlock_sock(sk); |
242 | sock_put(sk); | 284 | sock_put(sk); |
243 | } | 285 | } |
244 | 286 | ||
245 | /** | 287 | /* |
246 | * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets | 288 | * returns |
247 | * @skb: next packet candidate to send on @sk | 289 | * > 0: delay (in msecs) that should pass before actually sending |
248 | * This function uses the convention of ccid_packet_dequeue_eval() and | 290 | * = 0: can send immediately |
249 | * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. | 291 | * < 0: error condition; do not send packet |
250 | */ | 292 | */ |
251 | static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | 293 | static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) |
252 | { | 294 | { |
@@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
263 | if (unlikely(skb->len == 0)) | 305 | if (unlikely(skb->len == 0)) |
264 | return -EBADMSG; | 306 | return -EBADMSG; |
265 | 307 | ||
266 | if (hctx->s == 0) { | 308 | switch (hctx->ccid3hctx_state) { |
267 | sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + | 309 | case TFRC_SSTATE_NO_SENT: |
310 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, | ||
311 | (jiffies + | ||
268 | usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); | 312 | usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); |
269 | hctx->last_win_count = 0; | 313 | hctx->ccid3hctx_last_win_count = 0; |
270 | hctx->t_last_win_count = now; | 314 | hctx->ccid3hctx_t_last_win_count = now; |
271 | 315 | ||
272 | /* Set t_0 for initial packet */ | 316 | /* Set t_0 for initial packet */ |
273 | hctx->t_nom = now; | 317 | hctx->ccid3hctx_t_nom = now; |
318 | |||
319 | hctx->ccid3hctx_s = skb->len; | ||
274 | 320 | ||
275 | /* | 321 | /* |
276 | * Use initial RTT sample when available: recommended by erratum | 322 | * Use initial RTT sample when available: recommended by erratum |
@@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
279 | */ | 325 | */ |
280 | if (dp->dccps_syn_rtt) { | 326 | if (dp->dccps_syn_rtt) { |
281 | ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); | 327 | ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); |
282 | hctx->rtt = dp->dccps_syn_rtt; | 328 | hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; |
283 | hctx->x = rfc3390_initial_rate(sk); | 329 | hctx->ccid3hctx_x = rfc3390_initial_rate(sk); |
284 | hctx->t_ld = now; | 330 | hctx->ccid3hctx_t_ld = now; |
285 | } else { | 331 | } else { |
286 | /* | 332 | /* |
287 | * Sender does not have RTT sample: | 333 | * Sender does not have RTT sample: |
@@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
289 | * is needed in several parts (e.g. window counter); | 335 | * is needed in several parts (e.g. window counter); |
290 | * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. | 336 | * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. |
291 | */ | 337 | */ |
292 | hctx->rtt = DCCP_FALLBACK_RTT; | 338 | hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; |
293 | hctx->x = dp->dccps_mss_cache; | 339 | hctx->ccid3hctx_x = hctx->ccid3hctx_s; |
294 | hctx->x <<= 6; | 340 | hctx->ccid3hctx_x <<= 6; |
295 | } | 341 | } |
296 | |||
297 | /* Compute t_ipi = s / X */ | ||
298 | hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); | ||
299 | ccid3_update_send_interval(hctx); | 342 | ccid3_update_send_interval(hctx); |
300 | 343 | ||
301 | /* Seed value for Oscillation Prevention (sec. 4.5) */ | 344 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); |
302 | hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); | 345 | break; |
303 | 346 | case TFRC_SSTATE_NO_FBACK: | |
304 | } else { | 347 | case TFRC_SSTATE_FBACK: |
305 | delay = ktime_us_delta(hctx->t_nom, now); | 348 | delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); |
306 | ccid3_pr_debug("delay=%ld\n", (long)delay); | 349 | ccid3_pr_debug("delay=%ld\n", (long)delay); |
307 | /* | 350 | /* |
308 | * Scheduling of packet transmissions [RFC 3448, 4.6] | 351 | * Scheduling of packet transmissions [RFC 3448, 4.6] |
@@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
312 | * else | 355 | * else |
313 | * // send the packet in (t_nom - t_now) milliseconds. | 356 | * // send the packet in (t_nom - t_now) milliseconds. |
314 | */ | 357 | */ |
315 | if (delay >= TFRC_T_DELTA) | 358 | if (delay - (s64)hctx->ccid3hctx_delta >= 1000) |
316 | return (u32)delay / USEC_PER_MSEC; | 359 | return (u32)delay / 1000L; |
317 | 360 | ||
318 | ccid3_hc_tx_update_win_count(hctx, now); | 361 | ccid3_hc_tx_update_win_count(hctx, now); |
362 | break; | ||
363 | case TFRC_SSTATE_TERM: | ||
364 | DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); | ||
365 | return -EINVAL; | ||
319 | } | 366 | } |
320 | 367 | ||
321 | /* prepare to send now (add options etc.) */ | 368 | /* prepare to send now (add options etc.) */ |
322 | dp->dccps_hc_tx_insert_options = 1; | 369 | dp->dccps_hc_tx_insert_options = 1; |
323 | DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; | 370 | DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; |
324 | 371 | ||
325 | /* set the nominal send time for the next following packet */ | 372 | /* set the nominal send time for the next following packet */ |
326 | hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); | 373 | hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, |
327 | return CCID_PACKET_SEND_AT_ONCE; | 374 | hctx->ccid3hctx_t_ipi); |
375 | return 0; | ||
328 | } | 376 | } |
329 | 377 | ||
330 | static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) | 378 | static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, |
379 | unsigned int len) | ||
331 | { | 380 | { |
332 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 381 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
333 | 382 | ||
334 | /* Changes to s will become effective the next time X is computed */ | 383 | ccid3_hc_tx_update_s(hctx, len); |
335 | hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); | ||
336 | 384 | ||
337 | if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) | 385 | if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) |
338 | DCCP_CRIT("packet history - out of memory!"); | 386 | DCCP_CRIT("packet history - out of memory!"); |
339 | } | 387 | } |
340 | 388 | ||
341 | static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | 389 | static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) |
342 | { | 390 | { |
343 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 391 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
344 | struct tfrc_tx_hist_entry *acked; | 392 | struct ccid3_options_received *opt_recv; |
345 | ktime_t now; | 393 | ktime_t now; |
346 | unsigned long t_nfb; | 394 | unsigned long t_nfb; |
347 | u32 r_sample; | 395 | u32 pinv, r_sample; |
348 | 396 | ||
349 | /* we are only interested in ACKs */ | 397 | /* we are only interested in ACKs */ |
350 | if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || | 398 | if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || |
351 | DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) | 399 | DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) |
352 | return; | 400 | return; |
353 | /* | 401 | /* ... and only in the established state */ |
354 | * Locate the acknowledged packet in the TX history. | 402 | if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && |
355 | * | 403 | hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) |
356 | * Returning "entry not found" here can for instance happen when | 404 | return; |
357 | * - the host has not sent out anything (e.g. a passive server), | 405 | |
358 | * - the Ack is outdated (packet with higher Ack number was received), | 406 | opt_recv = &hctx->ccid3hctx_options_received; |
359 | * - it is a bogus Ack (for a packet not sent on this connection). | 407 | now = ktime_get_real(); |
360 | */ | 408 | |
361 | acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); | 409 | /* Estimate RTT from history if ACK number is valid */ |
362 | if (acked == NULL) | 410 | r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, |
411 | DCCP_SKB_CB(skb)->dccpd_ack_seq, now); | ||
412 | if (r_sample == 0) { | ||
413 | DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, | ||
414 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), | ||
415 | (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
363 | return; | 416 | return; |
364 | /* For the sake of RTT sampling, ignore/remove all older entries */ | 417 | } |
365 | tfrc_tx_hist_purge(&acked->next); | ||
366 | 418 | ||
367 | /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ | 419 | /* Update receive rate in units of 64 * bytes/second */ |
368 | now = ktime_get_real(); | 420 | hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; |
369 | r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); | 421 | hctx->ccid3hctx_x_recv <<= 6; |
370 | hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); | ||
371 | 422 | ||
423 | /* Update loss event rate (which is scaled by 1e6) */ | ||
424 | pinv = opt_recv->ccid3or_loss_event_rate; | ||
425 | if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ | ||
426 | hctx->ccid3hctx_p = 0; | ||
427 | else /* can not exceed 100% */ | ||
428 | hctx->ccid3hctx_p = scaled_div(1, pinv); | ||
429 | /* | ||
430 | * Validate new RTT sample and update moving average | ||
431 | */ | ||
432 | r_sample = dccp_sample_rtt(sk, r_sample); | ||
433 | hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); | ||
372 | /* | 434 | /* |
373 | * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 | 435 | * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 |
374 | */ | 436 | */ |
375 | if (!hctx->feedback) { | 437 | if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { |
376 | hctx->feedback = true; | 438 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); |
377 | 439 | ||
378 | if (hctx->t_rto == 0) { | 440 | if (hctx->ccid3hctx_t_rto == 0) { |
379 | /* | 441 | /* |
380 | * Initial feedback packet: Larger Initial Windows (4.2) | 442 | * Initial feedback packet: Larger Initial Windows (4.2) |
381 | */ | 443 | */ |
382 | hctx->x = rfc3390_initial_rate(sk); | 444 | hctx->ccid3hctx_x = rfc3390_initial_rate(sk); |
383 | hctx->t_ld = now; | 445 | hctx->ccid3hctx_t_ld = now; |
384 | 446 | ||
385 | ccid3_update_send_interval(hctx); | 447 | ccid3_update_send_interval(hctx); |
386 | 448 | ||
387 | goto done_computing_x; | 449 | goto done_computing_x; |
388 | } else if (hctx->p == 0) { | 450 | } else if (hctx->ccid3hctx_p == 0) { |
389 | /* | 451 | /* |
390 | * First feedback after nofeedback timer expiry (4.3) | 452 | * First feedback after nofeedback timer expiry (4.3) |
391 | */ | 453 | */ |
@@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
394 | } | 456 | } |
395 | 457 | ||
396 | /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ | 458 | /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ |
397 | if (hctx->p > 0) | 459 | if (hctx->ccid3hctx_p > 0) |
398 | hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); | 460 | hctx->ccid3hctx_x_calc = |
461 | tfrc_calc_x(hctx->ccid3hctx_s, | ||
462 | hctx->ccid3hctx_rtt, | ||
463 | hctx->ccid3hctx_p); | ||
399 | ccid3_hc_tx_update_x(sk, &now); | 464 | ccid3_hc_tx_update_x(sk, &now); |
400 | 465 | ||
401 | done_computing_x: | 466 | done_computing_x: |
402 | ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " | 467 | ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " |
403 | "p=%u, X_calc=%u, X_recv=%u, X=%u\n", | 468 | "p=%u, X_calc=%u, X_recv=%u, X=%u\n", |
404 | dccp_role(sk), sk, hctx->rtt, r_sample, | 469 | dccp_role(sk), |
405 | hctx->s, hctx->p, hctx->x_calc, | 470 | sk, hctx->ccid3hctx_rtt, r_sample, |
406 | (unsigned)(hctx->x_recv >> 6), | 471 | hctx->ccid3hctx_s, hctx->ccid3hctx_p, |
407 | (unsigned)(hctx->x >> 6)); | 472 | hctx->ccid3hctx_x_calc, |
408 | /* | 473 | (unsigned)(hctx->ccid3hctx_x_recv >> 6), |
409 | * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to | 474 | (unsigned)(hctx->ccid3hctx_x >> 6)); |
410 | * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This | ||
411 | * can be useful if few connections share a link, avoiding that buffer | ||
412 | * fill levels (RTT) oscillate as a result of frequent adjustments to X. | ||
413 | * A useful presentation with background information is in | ||
414 | * Joerg Widmer, "Equation-Based Congestion Control", | ||
415 | * MSc Thesis, University of Mannheim, Germany, 2000 | ||
416 | * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). | ||
417 | */ | ||
418 | if (do_osc_prev) { | ||
419 | r_sample = tfrc_scaled_sqrt(r_sample); | ||
420 | /* | ||
421 | * The modulation can work in both ways: increase/decrease t_ipi | ||
422 | * according to long-term increases/decreases of the RTT. The | ||
423 | * former is a useful measure, since it works against queue | ||
424 | * build-up. The latter temporarily increases the sending rate, | ||
425 | * so that buffers fill up more quickly. This in turn causes | ||
426 | * the RTT to increase, so that either later reduction becomes | ||
427 | * necessary or the RTT stays at a very high level. Decreasing | ||
428 | * t_ipi is therefore not supported. | ||
429 | * Furthermore, during the initial slow-start phase the RTT | ||
430 | * naturally increases, where using the algorithm would cause | ||
431 | * delays. Hence it is disabled during the initial slow-start. | ||
432 | */ | ||
433 | if (r_sample > hctx->r_sqmean && hctx->p > 0) | ||
434 | hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, | ||
435 | hctx->r_sqmean); | ||
436 | hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); | ||
437 | /* update R_sqmean _after_ computing the modulation factor */ | ||
438 | hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); | ||
439 | } | ||
440 | 475 | ||
441 | /* unschedule no feedback timer */ | 476 | /* unschedule no feedback timer */ |
442 | sk_stop_timer(sk, &hctx->no_feedback_timer); | 477 | sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); |
443 | 478 | ||
444 | /* | 479 | /* |
445 | * As we have calculated new ipi, delta, t_nom it is possible | 480 | * As we have calculated new ipi, delta, t_nom it is possible |
@@ -453,66 +488,95 @@ done_computing_x: | |||
453 | * This can help avoid triggering the nofeedback timer too | 488 | * This can help avoid triggering the nofeedback timer too |
454 | * often ('spinning') on LANs with small RTTs. | 489 | * often ('spinning') on LANs with small RTTs. |
455 | */ | 490 | */ |
456 | hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * | 491 | hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, |
457 | (USEC_PER_SEC / 1000))); | 492 | (CONFIG_IP_DCCP_CCID3_RTO * |
493 | (USEC_PER_SEC / 1000))); | ||
458 | /* | 494 | /* |
459 | * Schedule no feedback timer to expire in | 495 | * Schedule no feedback timer to expire in |
460 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) | 496 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) |
461 | */ | 497 | */ |
462 | t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); | 498 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); |
463 | 499 | ||
464 | ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " | 500 | ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " |
465 | "expire in %lu jiffies (%luus)\n", | 501 | "expire in %lu jiffies (%luus)\n", |
466 | dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); | 502 | dccp_role(sk), |
503 | sk, usecs_to_jiffies(t_nfb), t_nfb); | ||
467 | 504 | ||
468 | sk_reset_timer(sk, &hctx->no_feedback_timer, | 505 | sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, |
469 | jiffies + usecs_to_jiffies(t_nfb)); | 506 | jiffies + usecs_to_jiffies(t_nfb)); |
470 | } | 507 | } |
471 | 508 | ||
472 | static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, | 509 | static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, |
473 | u8 option, u8 *optval, u8 optlen) | 510 | unsigned char len, u16 idx, |
511 | unsigned char *value) | ||
474 | { | 512 | { |
513 | int rc = 0; | ||
514 | const struct dccp_sock *dp = dccp_sk(sk); | ||
475 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 515 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
516 | struct ccid3_options_received *opt_recv; | ||
476 | __be32 opt_val; | 517 | __be32 opt_val; |
477 | 518 | ||
478 | switch (option) { | 519 | opt_recv = &hctx->ccid3hctx_options_received; |
479 | case TFRC_OPT_RECEIVE_RATE: | ||
480 | case TFRC_OPT_LOSS_EVENT_RATE: | ||
481 | /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ | ||
482 | if (packet_type == DCCP_PKT_DATA) | ||
483 | break; | ||
484 | if (unlikely(optlen != 4)) { | ||
485 | DCCP_WARN("%s(%p), invalid len %d for %u\n", | ||
486 | dccp_role(sk), sk, optlen, option); | ||
487 | return -EINVAL; | ||
488 | } | ||
489 | opt_val = ntohl(get_unaligned((__be32 *)optval)); | ||
490 | 520 | ||
491 | if (option == TFRC_OPT_RECEIVE_RATE) { | 521 | if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { |
492 | /* Receive Rate is kept in units of 64 bytes/second */ | 522 | opt_recv->ccid3or_seqno = dp->dccps_gsr; |
493 | hctx->x_recv = opt_val; | 523 | opt_recv->ccid3or_loss_event_rate = ~0; |
494 | hctx->x_recv <<= 6; | 524 | opt_recv->ccid3or_loss_intervals_idx = 0; |
525 | opt_recv->ccid3or_loss_intervals_len = 0; | ||
526 | opt_recv->ccid3or_receive_rate = 0; | ||
527 | } | ||
495 | 528 | ||
496 | ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", | 529 | switch (option) { |
497 | dccp_role(sk), sk, opt_val); | 530 | case TFRC_OPT_LOSS_EVENT_RATE: |
531 | if (unlikely(len != 4)) { | ||
532 | DCCP_WARN("%s(%p), invalid len %d " | ||
533 | "for TFRC_OPT_LOSS_EVENT_RATE\n", | ||
534 | dccp_role(sk), sk, len); | ||
535 | rc = -EINVAL; | ||
498 | } else { | 536 | } else { |
499 | /* Update the fixpoint Loss Event Rate fraction */ | 537 | opt_val = get_unaligned((__be32 *)value); |
500 | hctx->p = tfrc_invert_loss_event_rate(opt_val); | 538 | opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); |
501 | |||
502 | ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", | 539 | ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", |
503 | dccp_role(sk), sk, opt_val); | 540 | dccp_role(sk), sk, |
541 | opt_recv->ccid3or_loss_event_rate); | ||
504 | } | 542 | } |
543 | break; | ||
544 | case TFRC_OPT_LOSS_INTERVALS: | ||
545 | opt_recv->ccid3or_loss_intervals_idx = idx; | ||
546 | opt_recv->ccid3or_loss_intervals_len = len; | ||
547 | ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", | ||
548 | dccp_role(sk), sk, | ||
549 | opt_recv->ccid3or_loss_intervals_idx, | ||
550 | opt_recv->ccid3or_loss_intervals_len); | ||
551 | break; | ||
552 | case TFRC_OPT_RECEIVE_RATE: | ||
553 | if (unlikely(len != 4)) { | ||
554 | DCCP_WARN("%s(%p), invalid len %d " | ||
555 | "for TFRC_OPT_RECEIVE_RATE\n", | ||
556 | dccp_role(sk), sk, len); | ||
557 | rc = -EINVAL; | ||
558 | } else { | ||
559 | opt_val = get_unaligned((__be32 *)value); | ||
560 | opt_recv->ccid3or_receive_rate = ntohl(opt_val); | ||
561 | ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", | ||
562 | dccp_role(sk), sk, | ||
563 | opt_recv->ccid3or_receive_rate); | ||
564 | } | ||
565 | break; | ||
505 | } | 566 | } |
506 | return 0; | 567 | |
568 | return rc; | ||
507 | } | 569 | } |
508 | 570 | ||
509 | static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) | 571 | static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) |
510 | { | 572 | { |
511 | struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); | 573 | struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); |
512 | 574 | ||
513 | hctx->hist = NULL; | 575 | hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; |
514 | setup_timer(&hctx->no_feedback_timer, | 576 | hctx->ccid3hctx_hist = NULL; |
515 | ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); | 577 | setup_timer(&hctx->ccid3hctx_no_feedback_timer, |
578 | ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); | ||
579 | |||
516 | return 0; | 580 | return 0; |
517 | } | 581 | } |
518 | 582 | ||
@@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk) | |||
520 | { | 584 | { |
521 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 585 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
522 | 586 | ||
523 | sk_stop_timer(sk, &hctx->no_feedback_timer); | 587 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); |
524 | tfrc_tx_hist_purge(&hctx->hist); | 588 | sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); |
589 | |||
590 | tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); | ||
525 | } | 591 | } |
526 | 592 | ||
527 | static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) | 593 | static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) |
528 | { | 594 | { |
529 | info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; | 595 | struct ccid3_hc_tx_sock *hctx; |
530 | info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; | 596 | |
597 | /* Listen socks doesn't have a private CCID block */ | ||
598 | if (sk->sk_state == DCCP_LISTEN) | ||
599 | return; | ||
600 | |||
601 | hctx = ccid3_hc_tx_sk(sk); | ||
602 | info->tcpi_rto = hctx->ccid3hctx_t_rto; | ||
603 | info->tcpi_rtt = hctx->ccid3hctx_rtt; | ||
531 | } | 604 | } |
532 | 605 | ||
533 | static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, | 606 | static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, |
534 | u32 __user *optval, int __user *optlen) | 607 | u32 __user *optval, int __user *optlen) |
535 | { | 608 | { |
536 | const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 609 | const struct ccid3_hc_tx_sock *hctx; |
537 | struct tfrc_tx_info tfrc; | ||
538 | const void *val; | 610 | const void *val; |
539 | 611 | ||
612 | /* Listen socks doesn't have a private CCID block */ | ||
613 | if (sk->sk_state == DCCP_LISTEN) | ||
614 | return -EINVAL; | ||
615 | |||
616 | hctx = ccid3_hc_tx_sk(sk); | ||
540 | switch (optname) { | 617 | switch (optname) { |
541 | case DCCP_SOCKOPT_CCID_TX_INFO: | 618 | case DCCP_SOCKOPT_CCID_TX_INFO: |
542 | if (len < sizeof(tfrc)) | 619 | if (len < sizeof(hctx->ccid3hctx_tfrc)) |
543 | return -EINVAL; | 620 | return -EINVAL; |
544 | tfrc.tfrctx_x = hctx->x; | 621 | len = sizeof(hctx->ccid3hctx_tfrc); |
545 | tfrc.tfrctx_x_recv = hctx->x_recv; | 622 | val = &hctx->ccid3hctx_tfrc; |
546 | tfrc.tfrctx_x_calc = hctx->x_calc; | ||
547 | tfrc.tfrctx_rtt = hctx->rtt; | ||
548 | tfrc.tfrctx_p = hctx->p; | ||
549 | tfrc.tfrctx_rto = hctx->t_rto; | ||
550 | tfrc.tfrctx_ipi = hctx->t_ipi; | ||
551 | len = sizeof(tfrc); | ||
552 | val = &tfrc; | ||
553 | break; | 623 | break; |
554 | default: | 624 | default: |
555 | return -ENOPROTOOPT; | 625 | return -ENOPROTOOPT; |
@@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, | |||
564 | /* | 634 | /* |
565 | * Receiver Half-Connection Routines | 635 | * Receiver Half-Connection Routines |
566 | */ | 636 | */ |
637 | |||
638 | /* CCID3 feedback types */ | ||
639 | enum ccid3_fback_type { | ||
640 | CCID3_FBACK_NONE = 0, | ||
641 | CCID3_FBACK_INITIAL, | ||
642 | CCID3_FBACK_PERIODIC, | ||
643 | CCID3_FBACK_PARAM_CHANGE | ||
644 | }; | ||
645 | |||
646 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG | ||
647 | static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) | ||
648 | { | ||
649 | static char *ccid3_rx_state_names[] = { | ||
650 | [TFRC_RSTATE_NO_DATA] = "NO_DATA", | ||
651 | [TFRC_RSTATE_DATA] = "DATA", | ||
652 | [TFRC_RSTATE_TERM] = "TERM", | ||
653 | }; | ||
654 | |||
655 | return ccid3_rx_state_names[state]; | ||
656 | } | ||
657 | #endif | ||
658 | |||
659 | static void ccid3_hc_rx_set_state(struct sock *sk, | ||
660 | enum ccid3_hc_rx_states state) | ||
661 | { | ||
662 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | ||
663 | enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; | ||
664 | |||
665 | ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", | ||
666 | dccp_role(sk), sk, ccid3_rx_state_name(oldstate), | ||
667 | ccid3_rx_state_name(state)); | ||
668 | WARN_ON(state == oldstate); | ||
669 | hcrx->ccid3hcrx_state = state; | ||
670 | } | ||
671 | |||
567 | static void ccid3_hc_rx_send_feedback(struct sock *sk, | 672 | static void ccid3_hc_rx_send_feedback(struct sock *sk, |
568 | const struct sk_buff *skb, | 673 | const struct sk_buff *skb, |
569 | enum ccid3_fback_type fbtype) | 674 | enum ccid3_fback_type fbtype) |
570 | { | 675 | { |
571 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 676 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
677 | struct dccp_sock *dp = dccp_sk(sk); | ||
678 | ktime_t now; | ||
679 | s64 delta = 0; | ||
680 | |||
681 | if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) | ||
682 | return; | ||
683 | |||
684 | now = ktime_get_real(); | ||
572 | 685 | ||
573 | switch (fbtype) { | 686 | switch (fbtype) { |
574 | case CCID3_FBACK_INITIAL: | 687 | case CCID3_FBACK_INITIAL: |
575 | hcrx->x_recv = 0; | 688 | hcrx->ccid3hcrx_x_recv = 0; |
576 | hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ | 689 | hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ |
577 | break; | 690 | break; |
578 | case CCID3_FBACK_PARAM_CHANGE: | 691 | case CCID3_FBACK_PARAM_CHANGE: |
579 | if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { | ||
580 | /* | ||
581 | * rfc3448bis-06, 6.3.1: First packet(s) lost or marked | ||
582 | * FIXME: in rfc3448bis the receiver returns X_recv=0 | ||
583 | * here as it normally would in the first feedback packet. | ||
584 | * However this is not possible yet, since the code still | ||
585 | * uses RFC 3448, i.e. | ||
586 | * If (p > 0) | ||
587 | * Calculate X_calc using the TCP throughput equation. | ||
588 | * X = max(min(X_calc, 2*X_recv), s/t_mbi); | ||
589 | * would bring X down to s/t_mbi. That is why we return | ||
590 | * X_recv according to rfc3448bis-06 for the moment. | ||
591 | */ | ||
592 | u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), | ||
593 | rtt = tfrc_rx_hist_rtt(&hcrx->hist); | ||
594 | |||
595 | hcrx->x_recv = scaled_div32(s, 2 * rtt); | ||
596 | break; | ||
597 | } | ||
598 | /* | 692 | /* |
599 | * When parameters change (new loss or p > p_prev), we do not | 693 | * When parameters change (new loss or p > p_prev), we do not |
600 | * have a reliable estimate for R_m of [RFC 3448, 6.2] and so | 694 | * have a reliable estimate for R_m of [RFC 3448, 6.2] and so |
601 | * always check whether at least RTT time units were covered. | 695 | * need to reuse the previous value of X_recv. However, when |
696 | * X_recv was 0 (due to early loss), this would kill X down to | ||
697 | * s/t_mbi (i.e. one packet in 64 seconds). | ||
698 | * To avoid such drastic reduction, we approximate X_recv as | ||
699 | * the number of bytes since last feedback. | ||
700 | * This is a safe fallback, since X is bounded above by X_calc. | ||
602 | */ | 701 | */ |
603 | hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); | 702 | if (hcrx->ccid3hcrx_x_recv > 0) |
604 | break; | 703 | break; |
704 | /* fall through */ | ||
605 | case CCID3_FBACK_PERIODIC: | 705 | case CCID3_FBACK_PERIODIC: |
606 | /* | 706 | delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); |
607 | * Step (2) of rfc3448bis-06, 6.2: | 707 | if (delta <= 0) |
608 | * - if no data packets have been received, just restart timer | 708 | DCCP_BUG("delta (%ld) <= 0", (long)delta); |
609 | * - if data packets have been received, re-compute X_recv | 709 | else |
610 | */ | 710 | hcrx->ccid3hcrx_x_recv = |
611 | if (hcrx->hist.bytes_recvd == 0) | 711 | scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); |
612 | goto prepare_for_next_time; | ||
613 | hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); | ||
614 | break; | 712 | break; |
615 | default: | 713 | default: |
616 | return; | 714 | return; |
617 | } | 715 | } |
618 | 716 | ||
619 | ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); | 717 | ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, |
718 | hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); | ||
620 | 719 | ||
621 | dccp_sk(sk)->dccps_hc_rx_insert_options = 1; | 720 | hcrx->ccid3hcrx_tstamp_last_feedback = now; |
622 | dccp_send_ack(sk); | 721 | hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; |
722 | hcrx->ccid3hcrx_bytes_recv = 0; | ||
623 | 723 | ||
624 | prepare_for_next_time: | 724 | dp->dccps_hc_rx_insert_options = 1; |
625 | tfrc_rx_hist_restart_byte_counter(&hcrx->hist); | 725 | dccp_send_ack(sk); |
626 | hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; | ||
627 | hcrx->feedback = fbtype; | ||
628 | } | 726 | } |
629 | 727 | ||
630 | static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) | 728 | static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) |
631 | { | 729 | { |
632 | const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 730 | const struct ccid3_hc_rx_sock *hcrx; |
633 | __be32 x_recv, pinv; | 731 | __be32 x_recv, pinv; |
634 | 732 | ||
635 | if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) | 733 | if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) |
636 | return 0; | 734 | return 0; |
637 | 735 | ||
736 | hcrx = ccid3_hc_rx_sk(sk); | ||
737 | |||
638 | if (dccp_packet_without_ack(skb)) | 738 | if (dccp_packet_without_ack(skb)) |
639 | return 0; | 739 | return 0; |
640 | 740 | ||
641 | x_recv = htonl(hcrx->x_recv); | 741 | x_recv = htonl(hcrx->ccid3hcrx_x_recv); |
642 | pinv = htonl(hcrx->p_inverse); | 742 | pinv = htonl(hcrx->ccid3hcrx_pinv); |
643 | 743 | ||
644 | if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, | 744 | if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, |
645 | &pinv, sizeof(pinv)) || | 745 | &pinv, sizeof(pinv)) || |
@@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) | |||
662 | static u32 ccid3_first_li(struct sock *sk) | 762 | static u32 ccid3_first_li(struct sock *sk) |
663 | { | 763 | { |
664 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 764 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
665 | u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), | 765 | u32 x_recv, p, delta; |
666 | rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; | ||
667 | u64 fval; | 766 | u64 fval; |
668 | 767 | ||
669 | /* | 768 | if (hcrx->ccid3hcrx_rtt == 0) { |
670 | * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p | 769 | DCCP_WARN("No RTT estimate available, using fallback RTT\n"); |
671 | * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p | 770 | hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; |
672 | * is about 20.64%. This yields an interval length of 4.84 (rounded up). | 771 | } |
673 | */ | ||
674 | if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) | ||
675 | return 5; | ||
676 | 772 | ||
677 | x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); | 773 | delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); |
678 | if (x_recv == 0) | 774 | x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); |
679 | goto failed; | 775 | if (x_recv == 0) { /* would also trigger divide-by-zero */ |
776 | DCCP_WARN("X_recv==0\n"); | ||
777 | if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { | ||
778 | DCCP_BUG("stored value of X_recv is zero"); | ||
779 | return ~0U; | ||
780 | } | ||
781 | } | ||
680 | 782 | ||
681 | fval = scaled_div32(scaled_div(s, rtt), x_recv); | 783 | fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); |
784 | fval = scaled_div32(fval, x_recv); | ||
682 | p = tfrc_calc_x_reverse_lookup(fval); | 785 | p = tfrc_calc_x_reverse_lookup(fval); |
683 | 786 | ||
684 | ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " | 787 | ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " |
685 | "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); | 788 | "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); |
686 | 789 | ||
687 | if (p > 0) | 790 | return p == 0 ? ~0U : scaled_div(1, p); |
688 | return scaled_div(1, p); | ||
689 | failed: | ||
690 | return UINT_MAX; | ||
691 | } | 791 | } |
692 | 792 | ||
693 | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | 793 | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) |
694 | { | 794 | { |
695 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 795 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
796 | enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; | ||
696 | const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; | 797 | const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; |
697 | const bool is_data_packet = dccp_data_packet(skb); | 798 | const bool is_data_packet = dccp_data_packet(skb); |
698 | 799 | ||
800 | if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { | ||
801 | if (is_data_packet) { | ||
802 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
803 | do_feedback = CCID3_FBACK_INITIAL; | ||
804 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); | ||
805 | hcrx->ccid3hcrx_s = payload; | ||
806 | /* | ||
807 | * Not necessary to update ccid3hcrx_bytes_recv here, | ||
808 | * since X_recv = 0 for the first feedback packet (cf. | ||
809 | * RFC 3448, 6.3) -- gerrit | ||
810 | */ | ||
811 | } | ||
812 | goto update_records; | ||
813 | } | ||
814 | |||
815 | if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) | ||
816 | return; /* done receiving */ | ||
817 | |||
818 | if (is_data_packet) { | ||
819 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
820 | /* | ||
821 | * Update moving-average of s and the sum of received payload bytes | ||
822 | */ | ||
823 | hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); | ||
824 | hcrx->ccid3hcrx_bytes_recv += payload; | ||
825 | } | ||
826 | |||
699 | /* | 827 | /* |
700 | * Perform loss detection and handle pending losses | 828 | * Perform loss detection and handle pending losses |
701 | */ | 829 | */ |
702 | if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, | 830 | if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, |
703 | skb, ndp, ccid3_first_li, sk)) | 831 | skb, ndp, ccid3_first_li, sk)) { |
704 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); | 832 | do_feedback = CCID3_FBACK_PARAM_CHANGE; |
833 | goto done_receiving; | ||
834 | } | ||
835 | |||
836 | if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) | ||
837 | return; /* done receiving */ | ||
838 | |||
705 | /* | 839 | /* |
706 | * Feedback for first non-empty data packet (RFC 3448, 6.3) | 840 | * Handle data packets: RTT sampling and monitoring p |
707 | */ | 841 | */ |
708 | else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) | 842 | if (unlikely(!is_data_packet)) |
709 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); | 843 | goto update_records; |
844 | |||
845 | if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { | ||
846 | const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); | ||
847 | /* | ||
848 | * Empty loss history: no loss so far, hence p stays 0. | ||
849 | * Sample RTT values, since an RTT estimate is required for the | ||
850 | * computation of p when the first loss occurs; RFC 3448, 6.3.1. | ||
851 | */ | ||
852 | if (sample != 0) | ||
853 | hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); | ||
854 | |||
855 | } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { | ||
856 | /* | ||
857 | * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean | ||
858 | * has decreased (resp. p has increased), send feedback now. | ||
859 | */ | ||
860 | do_feedback = CCID3_FBACK_PARAM_CHANGE; | ||
861 | } | ||
862 | |||
710 | /* | 863 | /* |
711 | * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 | 864 | * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 |
712 | */ | 865 | */ |
713 | else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && | 866 | if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) |
714 | SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) | 867 | do_feedback = CCID3_FBACK_PERIODIC; |
715 | ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); | 868 | |
869 | update_records: | ||
870 | tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); | ||
871 | |||
872 | done_receiving: | ||
873 | if (do_feedback) | ||
874 | ccid3_hc_rx_send_feedback(sk, skb, do_feedback); | ||
716 | } | 875 | } |
717 | 876 | ||
718 | static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) | 877 | static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) |
719 | { | 878 | { |
720 | struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); | 879 | struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); |
721 | 880 | ||
722 | tfrc_lh_init(&hcrx->li_hist); | 881 | hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; |
723 | return tfrc_rx_hist_init(&hcrx->hist, sk); | 882 | tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); |
883 | return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); | ||
724 | } | 884 | } |
725 | 885 | ||
726 | static void ccid3_hc_rx_exit(struct sock *sk) | 886 | static void ccid3_hc_rx_exit(struct sock *sk) |
727 | { | 887 | { |
728 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 888 | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); |
729 | 889 | ||
730 | tfrc_rx_hist_purge(&hcrx->hist); | 890 | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); |
731 | tfrc_lh_cleanup(&hcrx->li_hist); | 891 | |
892 | tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); | ||
893 | tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); | ||
732 | } | 894 | } |
733 | 895 | ||
734 | static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) | 896 | static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) |
735 | { | 897 | { |
898 | const struct ccid3_hc_rx_sock *hcrx; | ||
899 | |||
900 | /* Listen socks doesn't have a private CCID block */ | ||
901 | if (sk->sk_state == DCCP_LISTEN) | ||
902 | return; | ||
903 | |||
904 | hcrx = ccid3_hc_rx_sk(sk); | ||
905 | info->tcpi_ca_state = hcrx->ccid3hcrx_state; | ||
736 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 906 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
737 | info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); | 907 | info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; |
738 | } | 908 | } |
739 | 909 | ||
740 | static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, | 910 | static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, |
741 | u32 __user *optval, int __user *optlen) | 911 | u32 __user *optval, int __user *optlen) |
742 | { | 912 | { |
743 | const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | 913 | const struct ccid3_hc_rx_sock *hcrx; |
744 | struct tfrc_rx_info rx_info; | 914 | struct tfrc_rx_info rx_info; |
745 | const void *val; | 915 | const void *val; |
746 | 916 | ||
917 | /* Listen socks doesn't have a private CCID block */ | ||
918 | if (sk->sk_state == DCCP_LISTEN) | ||
919 | return -EINVAL; | ||
920 | |||
921 | hcrx = ccid3_hc_rx_sk(sk); | ||
747 | switch (optname) { | 922 | switch (optname) { |
748 | case DCCP_SOCKOPT_CCID_RX_INFO: | 923 | case DCCP_SOCKOPT_CCID_RX_INFO: |
749 | if (len < sizeof(rx_info)) | 924 | if (len < sizeof(rx_info)) |
750 | return -EINVAL; | 925 | return -EINVAL; |
751 | rx_info.tfrcrx_x_recv = hcrx->x_recv; | 926 | rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; |
752 | rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); | 927 | rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; |
753 | rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); | 928 | rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : |
929 | scaled_div(1, hcrx->ccid3hcrx_pinv); | ||
754 | len = sizeof(rx_info); | 930 | len = sizeof(rx_info); |
755 | val = &rx_info; | 931 | val = &rx_info; |
756 | break; | 932 | break; |
@@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = { | |||
786 | .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, | 962 | .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, |
787 | }; | 963 | }; |
788 | 964 | ||
789 | module_param(do_osc_prev, bool, 0644); | ||
790 | MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); | ||
791 | |||
792 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG | 965 | #ifdef CONFIG_IP_DCCP_CCID3_DEBUG |
793 | module_param(ccid3_debug, bool, 0644); | 966 | module_param(ccid3_debug, bool, 0644); |
794 | MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); | 967 | MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); |
@@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); | |||
796 | 969 | ||
797 | static __init int ccid3_module_init(void) | 970 | static __init int ccid3_module_init(void) |
798 | { | 971 | { |
799 | struct timespec tp; | ||
800 | |||
801 | /* | ||
802 | * Without a fine-grained clock resolution, RTTs/X_recv are not sampled | ||
803 | * correctly and feedback is sent either too early or too late. | ||
804 | */ | ||
805 | hrtimer_get_res(CLOCK_MONOTONIC, &tp); | ||
806 | if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { | ||
807 | printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" | ||
808 | " resolution - check your clocksource.\n", __func__, | ||
809 | tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); | ||
810 | return -ESOCKTNOSUPPORT; | ||
811 | } | ||
812 | return ccid_register(&ccid3); | 972 | return ccid_register(&ccid3); |
813 | } | 973 | } |
814 | module_init(ccid3_module_init); | 974 | module_init(ccid3_module_init); |
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index af6e1bf937d9..49ca32bd7e79 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h | |||
@@ -47,22 +47,11 @@ | |||
47 | /* Two seconds as per RFC 3448 4.2 */ | 47 | /* Two seconds as per RFC 3448 4.2 */ |
48 | #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) | 48 | #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) |
49 | 49 | ||
50 | /* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ | 50 | /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ |
51 | #define TFRC_T_MBI (64 * USEC_PER_SEC) | 51 | #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) |
52 | 52 | ||
53 | /* | 53 | /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ |
54 | * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are | 54 | #define TFRC_T_MBI 64 |
55 | * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. | ||
56 | * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse | ||
57 | * resolution of HZ < 500 means that the error is below one timer tick (t_gran) | ||
58 | * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). | ||
59 | */ | ||
60 | #if (HZ >= 500) | ||
61 | # define TFRC_T_DELTA USEC_PER_MSEC | ||
62 | #else | ||
63 | # define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) | ||
64 | #warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. | ||
65 | #endif | ||
66 | 55 | ||
67 | enum ccid3_options { | 56 | enum ccid3_options { |
68 | TFRC_OPT_LOSS_EVENT_RATE = 192, | 57 | TFRC_OPT_LOSS_EVENT_RATE = 192, |
@@ -70,43 +59,62 @@ enum ccid3_options { | |||
70 | TFRC_OPT_RECEIVE_RATE = 194, | 59 | TFRC_OPT_RECEIVE_RATE = 194, |
71 | }; | 60 | }; |
72 | 61 | ||
62 | struct ccid3_options_received { | ||
63 | u64 ccid3or_seqno:48, | ||
64 | ccid3or_loss_intervals_idx:16; | ||
65 | u16 ccid3or_loss_intervals_len; | ||
66 | u32 ccid3or_loss_event_rate; | ||
67 | u32 ccid3or_receive_rate; | ||
68 | }; | ||
69 | |||
70 | /* TFRC sender states */ | ||
71 | enum ccid3_hc_tx_states { | ||
72 | TFRC_SSTATE_NO_SENT = 1, | ||
73 | TFRC_SSTATE_NO_FBACK, | ||
74 | TFRC_SSTATE_FBACK, | ||
75 | TFRC_SSTATE_TERM, | ||
76 | }; | ||
77 | |||
73 | /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket | 78 | /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket |
74 | * | 79 | * |
75 | * @x - Current sending rate in 64 * bytes per second | 80 | * @ccid3hctx_x - Current sending rate in 64 * bytes per second |
76 | * @x_recv - Receive rate in 64 * bytes per second | 81 | * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second |
77 | * @x_calc - Calculated rate in bytes per second | 82 | * @ccid3hctx_x_calc - Calculated rate in bytes per second |
78 | * @rtt - Estimate of current round trip time in usecs | 83 | * @ccid3hctx_rtt - Estimate of current round trip time in usecs |
79 | * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) | 84 | * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 |
80 | * @p - Current loss event rate (0-1) scaled by 1000000 | 85 | * @ccid3hctx_s - Packet size in bytes |
81 | * @s - Packet size in bytes | 86 | * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs |
82 | * @t_rto - Nofeedback Timer setting in usecs | 87 | * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs |
83 | * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs | 88 | * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states |
84 | * @feedback - Whether feedback has been received or not | 89 | * @ccid3hctx_last_win_count - Last window counter sent |
85 | * @last_win_count - Last window counter sent | 90 | * @ccid3hctx_t_last_win_count - Timestamp of earliest packet |
86 | * @t_last_win_count - Timestamp of earliest packet with | 91 | * with last_win_count value sent |
87 | * last_win_count value sent | 92 | * @ccid3hctx_no_feedback_timer - Handle to no feedback timer |
88 | * @no_feedback_timer - Handle to no feedback timer | 93 | * @ccid3hctx_t_ld - Time last doubled during slow start |
89 | * @t_ld - Time last doubled during slow start | 94 | * @ccid3hctx_t_nom - Nominal send time of next packet |
90 | * @t_nom - Nominal send time of next packet | 95 | * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs |
91 | * @hist - Packet history | 96 | * @ccid3hctx_hist - Packet history |
97 | * @ccid3hctx_options_received - Parsed set of retrieved options | ||
92 | */ | 98 | */ |
93 | struct ccid3_hc_tx_sock { | 99 | struct ccid3_hc_tx_sock { |
94 | u64 x; | 100 | struct tfrc_tx_info ccid3hctx_tfrc; |
95 | u64 x_recv; | 101 | #define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x |
96 | u32 x_calc; | 102 | #define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv |
97 | u32 rtt; | 103 | #define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc |
98 | u16 r_sqmean; | 104 | #define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt |
99 | u32 p; | 105 | #define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p |
100 | u32 t_rto; | 106 | #define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto |
101 | u32 t_ipi; | 107 | #define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi |
102 | u16 s; | 108 | u16 ccid3hctx_s; |
103 | bool feedback:1; | 109 | enum ccid3_hc_tx_states ccid3hctx_state:8; |
104 | u8 last_win_count; | 110 | u8 ccid3hctx_last_win_count; |
105 | ktime_t t_last_win_count; | 111 | ktime_t ccid3hctx_t_last_win_count; |
106 | struct timer_list no_feedback_timer; | 112 | struct timer_list ccid3hctx_no_feedback_timer; |
107 | ktime_t t_ld; | 113 | ktime_t ccid3hctx_t_ld; |
108 | ktime_t t_nom; | 114 | ktime_t ccid3hctx_t_nom; |
109 | struct tfrc_tx_hist_entry *hist; | 115 | u32 ccid3hctx_delta; |
116 | struct tfrc_tx_hist_entry *ccid3hctx_hist; | ||
117 | struct ccid3_options_received ccid3hctx_options_received; | ||
110 | }; | 118 | }; |
111 | 119 | ||
112 | static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) | 120 | static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) |
@@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) | |||
116 | return hctx; | 124 | return hctx; |
117 | } | 125 | } |
118 | 126 | ||
119 | 127 | /* TFRC receiver states */ | |
120 | enum ccid3_fback_type { | 128 | enum ccid3_hc_rx_states { |
121 | CCID3_FBACK_NONE = 0, | 129 | TFRC_RSTATE_NO_DATA = 1, |
122 | CCID3_FBACK_INITIAL, | 130 | TFRC_RSTATE_DATA, |
123 | CCID3_FBACK_PERIODIC, | 131 | TFRC_RSTATE_TERM = 127, |
124 | CCID3_FBACK_PARAM_CHANGE | ||
125 | }; | 132 | }; |
126 | 133 | ||
127 | /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket | 134 | /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket |
128 | * | 135 | * |
129 | * @last_counter - Tracks window counter (RFC 4342, 8.1) | 136 | * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) |
130 | * @feedback - The type of the feedback last sent | 137 | * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) |
131 | * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) | 138 | * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) |
132 | * @tstamp_last_feedback - Time at which last feedback was sent | 139 | * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) |
133 | * @hist - Packet history (loss detection + RTT sampling) | 140 | * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states |
134 | * @li_hist - Loss Interval database | 141 | * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes |
135 | * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) | 142 | * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) |
143 | * @ccid3hcrx_rtt - Receiver estimate of RTT | ||
144 | * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent | ||
145 | * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent | ||
146 | * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) | ||
147 | * @ccid3hcrx_li_hist - Loss Interval database | ||
148 | * @ccid3hcrx_s - Received packet size in bytes | ||
149 | * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) | ||
136 | */ | 150 | */ |
137 | struct ccid3_hc_rx_sock { | 151 | struct ccid3_hc_rx_sock { |
138 | u8 last_counter:4; | 152 | u8 ccid3hcrx_last_counter:4; |
139 | enum ccid3_fback_type feedback:4; | 153 | enum ccid3_hc_rx_states ccid3hcrx_state:8; |
140 | u32 x_recv; | 154 | u32 ccid3hcrx_bytes_recv; |
141 | ktime_t tstamp_last_feedback; | 155 | u32 ccid3hcrx_x_recv; |
142 | struct tfrc_rx_hist hist; | 156 | u32 ccid3hcrx_rtt; |
143 | struct tfrc_loss_hist li_hist; | 157 | ktime_t ccid3hcrx_tstamp_last_feedback; |
144 | #define p_inverse li_hist.i_mean | 158 | struct tfrc_rx_hist ccid3hcrx_hist; |
159 | struct tfrc_loss_hist ccid3hcrx_li_hist; | ||
160 | u16 ccid3hcrx_s; | ||
161 | #define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean | ||
145 | }; | 162 | }; |
146 | 163 | ||
147 | static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) | 164 | static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) |
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b1ae8f8259e5..5b3ce0688c5c 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c | |||
@@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) | |||
86 | 86 | ||
87 | /** | 87 | /** |
88 | * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 | 88 | * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 |
89 | * This updates I_mean as the sequence numbers increase. As a consequence, the | 89 | * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev |
90 | * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) | ||
91 | * decreases, and thus there is no need to send renewed feedback. | ||
92 | */ | 90 | */ |
93 | void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) | 91 | u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) |
94 | { | 92 | { |
95 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); | 93 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); |
94 | u32 old_i_mean = lh->i_mean; | ||
96 | s64 len; | 95 | s64 len; |
97 | 96 | ||
98 | if (cur == NULL) /* not initialised */ | 97 | if (cur == NULL) /* not initialised */ |
99 | return; | 98 | return 0; |
100 | |||
101 | /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ | ||
102 | if (!dccp_data_packet(skb)) | ||
103 | return; | ||
104 | 99 | ||
105 | len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; | 100 | len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; |
106 | 101 | ||
107 | if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ | 102 | if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ |
108 | return; | 103 | return 0; |
109 | 104 | ||
110 | if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) | 105 | if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) |
111 | /* | 106 | /* |
@@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) | |||
119 | cur->li_is_closed = 1; | 114 | cur->li_is_closed = 1; |
120 | 115 | ||
121 | if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ | 116 | if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ |
122 | return; | 117 | return 0; |
123 | 118 | ||
124 | cur->li_length = len; | 119 | cur->li_length = len; |
125 | tfrc_lh_calc_i_mean(lh); | 120 | tfrc_lh_calc_i_mean(lh); |
121 | |||
122 | return (lh->i_mean < old_i_mean); | ||
126 | } | 123 | } |
124 | EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); | ||
127 | 125 | ||
128 | /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ | 126 | /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ |
129 | static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, | 127 | static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, |
@@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, | |||
140 | * @sk: Used by @calc_first_li in caller-specific way (subtyping) | 138 | * @sk: Used by @calc_first_li in caller-specific way (subtyping) |
141 | * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. | 139 | * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. |
142 | */ | 140 | */ |
143 | bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, | 141 | int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, |
144 | u32 (*calc_first_li)(struct sock *), struct sock *sk) | 142 | u32 (*calc_first_li)(struct sock *), struct sock *sk) |
145 | { | 143 | { |
146 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; | 144 | struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; |
147 | 145 | ||
148 | if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) | 146 | if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) |
149 | return false; | 147 | return 0; |
150 | 148 | ||
151 | new = tfrc_lh_demand_next(lh); | 149 | new = tfrc_lh_demand_next(lh); |
152 | if (unlikely(new == NULL)) { | 150 | if (unlikely(new == NULL)) { |
153 | DCCP_CRIT("Cannot allocate/add loss record."); | 151 | DCCP_CRIT("Cannot allocate/add loss record."); |
154 | return false; | 152 | return 0; |
155 | } | 153 | } |
156 | 154 | ||
157 | new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; | 155 | new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; |
@@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, | |||
169 | 167 | ||
170 | tfrc_lh_calc_i_mean(lh); | 168 | tfrc_lh_calc_i_mean(lh); |
171 | } | 169 | } |
172 | return true; | 170 | return 1; |
173 | } | 171 | } |
174 | EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); | 172 | EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); |
175 | 173 | ||
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index d08a226db43e..246018a3b269 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h | |||
@@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) | |||
67 | 67 | ||
68 | struct tfrc_rx_hist; | 68 | struct tfrc_rx_hist; |
69 | 69 | ||
70 | extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, | 70 | extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, |
71 | u32 (*first_li)(struct sock *), struct sock *); | 71 | u32 (*first_li)(struct sock *), struct sock *); |
72 | extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); | 72 | extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); |
73 | extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); | 73 | extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); |
74 | 74 | ||
75 | #endif /* _DCCP_LI_HIST_ */ | 75 | #endif /* _DCCP_LI_HIST_ */ |
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index cce9f03bda3e..6cc108afdc3b 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c | |||
@@ -40,6 +40,18 @@ | |||
40 | #include "packet_history.h" | 40 | #include "packet_history.h" |
41 | #include "../../dccp.h" | 41 | #include "../../dccp.h" |
42 | 42 | ||
43 | /** | ||
44 | * tfrc_tx_hist_entry - Simple singly-linked TX history list | ||
45 | * @next: next oldest entry (LIFO order) | ||
46 | * @seqno: sequence number of this entry | ||
47 | * @stamp: send time of packet with sequence number @seqno | ||
48 | */ | ||
49 | struct tfrc_tx_hist_entry { | ||
50 | struct tfrc_tx_hist_entry *next; | ||
51 | u64 seqno; | ||
52 | ktime_t stamp; | ||
53 | }; | ||
54 | |||
43 | /* | 55 | /* |
44 | * Transmitter History Routines | 56 | * Transmitter History Routines |
45 | */ | 57 | */ |
@@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void) | |||
61 | } | 73 | } |
62 | } | 74 | } |
63 | 75 | ||
76 | static struct tfrc_tx_hist_entry * | ||
77 | tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) | ||
78 | { | ||
79 | while (head != NULL && head->seqno != seqno) | ||
80 | head = head->next; | ||
81 | |||
82 | return head; | ||
83 | } | ||
84 | |||
64 | int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) | 85 | int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) |
65 | { | 86 | { |
66 | struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); | 87 | struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); |
@@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) | |||
90 | } | 111 | } |
91 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); | 112 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); |
92 | 113 | ||
114 | u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, | ||
115 | const ktime_t now) | ||
116 | { | ||
117 | u32 rtt = 0; | ||
118 | struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); | ||
119 | |||
120 | if (packet != NULL) { | ||
121 | rtt = ktime_us_delta(now, packet->stamp); | ||
122 | /* | ||
123 | * Garbage-collect older (irrelevant) entries: | ||
124 | */ | ||
125 | tfrc_tx_hist_purge(&packet->next); | ||
126 | } | ||
127 | |||
128 | return rtt; | ||
129 | } | ||
130 | EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); | ||
131 | |||
132 | |||
93 | /* | 133 | /* |
94 | * Receiver History Routines | 134 | * Receiver History Routines |
95 | */ | 135 | */ |
@@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) | |||
151 | } | 191 | } |
152 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); | 192 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); |
153 | 193 | ||
154 | |||
155 | static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) | ||
156 | { | ||
157 | struct tfrc_rx_hist_entry *tmp = h->ring[a]; | ||
158 | |||
159 | h->ring[a] = h->ring[b]; | ||
160 | h->ring[b] = tmp; | ||
161 | } | ||
162 | |||
163 | static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) | 194 | static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) |
164 | { | 195 | { |
165 | __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), | 196 | const u8 idx_a = tfrc_rx_hist_index(h, a), |
166 | tfrc_rx_hist_index(h, b)); | 197 | idx_b = tfrc_rx_hist_index(h, b); |
167 | } | 198 | struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; |
168 | 199 | ||
169 | /** | 200 | h->ring[idx_a] = h->ring[idx_b]; |
170 | * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling | 201 | h->ring[idx_b] = tmp; |
171 | * This is called after loss detection has finished, when the history entry | ||
172 | * with the index of `loss_count' holds the highest-received sequence number. | ||
173 | * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). | ||
174 | */ | ||
175 | static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) | ||
176 | { | ||
177 | __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); | ||
178 | h->loss_count = h->loss_start = 0; | ||
179 | } | 202 | } |
180 | 203 | ||
181 | /* | 204 | /* |
@@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) | |||
192 | u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, | 215 | u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, |
193 | s1 = DCCP_SKB_CB(skb)->dccpd_seq; | 216 | s1 = DCCP_SKB_CB(skb)->dccpd_seq; |
194 | 217 | ||
195 | if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ | 218 | if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ |
196 | h->loss_count = 1; | 219 | h->loss_count = 1; |
220 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); | ||
221 | } | ||
197 | } | 222 | } |
198 | 223 | ||
199 | static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) | 224 | static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) |
@@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 | |||
215 | 240 | ||
216 | if (dccp_loss_free(s2, s1, n1)) { | 241 | if (dccp_loss_free(s2, s1, n1)) { |
217 | /* hole is filled: S0, S2, and S1 are consecutive */ | 242 | /* hole is filled: S0, S2, and S1 are consecutive */ |
218 | tfrc_rx_hist_resume_rtt_sampling(h); | 243 | h->loss_count = 0; |
244 | h->loss_start = tfrc_rx_hist_index(h, 1); | ||
219 | } else | 245 | } else |
220 | /* gap between S2 and S1: just update loss_prev */ | 246 | /* gap between S2 and S1: just update loss_prev */ |
221 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); | 247 | tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); |
@@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) | |||
268 | 294 | ||
269 | if (dccp_loss_free(s1, s2, n2)) { | 295 | if (dccp_loss_free(s1, s2, n2)) { |
270 | /* entire hole filled by S0, S3, S1, S2 */ | 296 | /* entire hole filled by S0, S3, S1, S2 */ |
271 | tfrc_rx_hist_resume_rtt_sampling(h); | 297 | h->loss_start = tfrc_rx_hist_index(h, 2); |
298 | h->loss_count = 0; | ||
272 | } else { | 299 | } else { |
273 | /* gap remains between S1 and S2 */ | 300 | /* gap remains between S1 and S2 */ |
274 | h->loss_start = tfrc_rx_hist_index(h, 1); | 301 | h->loss_start = tfrc_rx_hist_index(h, 1); |
@@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
312 | 339 | ||
313 | if (dccp_loss_free(s2, s3, n3)) { | 340 | if (dccp_loss_free(s2, s3, n3)) { |
314 | /* no gap between S2 and S3: entire hole is filled */ | 341 | /* no gap between S2 and S3: entire hole is filled */ |
315 | tfrc_rx_hist_resume_rtt_sampling(h); | 342 | h->loss_start = tfrc_rx_hist_index(h, 3); |
343 | h->loss_count = 0; | ||
316 | } else { | 344 | } else { |
317 | /* gap between S2 and S3 */ | 345 | /* gap between S2 and S3 */ |
318 | h->loss_start = tfrc_rx_hist_index(h, 2); | 346 | h->loss_start = tfrc_rx_hist_index(h, 2); |
@@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
326 | } | 354 | } |
327 | 355 | ||
328 | /** | 356 | /** |
329 | * tfrc_rx_congestion_event - Loss detection and further processing | 357 | * tfrc_rx_handle_loss - Loss detection and further processing |
330 | * @h: The non-empty RX history object | 358 | * @h: The non-empty RX history object |
331 | * @lh: Loss Intervals database to update | 359 | * @lh: Loss Intervals database to update |
332 | * @skb: Currently received packet | 360 | * @skb: Currently received packet |
333 | * @ndp: The NDP count belonging to @skb | 361 | * @ndp: The NDP count belonging to @skb |
334 | * @first_li: Caller-dependent computation of first loss interval in @lh | 362 | * @calc_first_li: Caller-dependent computation of first loss interval in @lh |
335 | * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) | 363 | * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) |
336 | * Chooses action according to pending loss, updates LI database when a new | 364 | * Chooses action according to pending loss, updates LI database when a new |
337 | * loss was detected, and does required post-processing. Returns 1 when caller | 365 | * loss was detected, and does required post-processing. Returns 1 when caller |
338 | * should send feedback, 0 otherwise. | 366 | * should send feedback, 0 otherwise. |
@@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h) | |||
340 | * records accordingly, the caller should not perform any more RX history | 368 | * records accordingly, the caller should not perform any more RX history |
341 | * operations when loss_count is greater than 0 after calling this function. | 369 | * operations when loss_count is greater than 0 after calling this function. |
342 | */ | 370 | */ |
343 | bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, | 371 | int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, |
344 | struct tfrc_loss_hist *lh, | 372 | struct tfrc_loss_hist *lh, |
345 | struct sk_buff *skb, const u64 ndp, | 373 | struct sk_buff *skb, const u64 ndp, |
346 | u32 (*first_li)(struct sock *), struct sock *sk) | 374 | u32 (*calc_first_li)(struct sock *), struct sock *sk) |
347 | { | 375 | { |
348 | bool new_event = false; | 376 | int is_new_loss = 0; |
349 | |||
350 | if (tfrc_rx_hist_duplicate(h, skb)) | ||
351 | return 0; | ||
352 | 377 | ||
353 | if (h->loss_count == 0) { | 378 | if (h->loss_count == 0) { |
354 | __do_track_loss(h, skb, ndp); | 379 | __do_track_loss(h, skb, ndp); |
355 | tfrc_rx_hist_sample_rtt(h, skb); | ||
356 | tfrc_rx_hist_add_packet(h, skb, ndp); | ||
357 | } else if (h->loss_count == 1) { | 380 | } else if (h->loss_count == 1) { |
358 | __one_after_loss(h, skb, ndp); | 381 | __one_after_loss(h, skb, ndp); |
359 | } else if (h->loss_count != 2) { | 382 | } else if (h->loss_count != 2) { |
@@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, | |||
362 | /* | 385 | /* |
363 | * Update Loss Interval database and recycle RX records | 386 | * Update Loss Interval database and recycle RX records |
364 | */ | 387 | */ |
365 | new_event = tfrc_lh_interval_add(lh, h, first_li, sk); | 388 | is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); |
366 | __three_after_loss(h); | 389 | __three_after_loss(h); |
367 | } | 390 | } |
368 | 391 | return is_new_loss; | |
369 | /* | ||
370 | * Update moving-average of `s' and the sum of received payload bytes. | ||
371 | */ | ||
372 | if (dccp_data_packet(skb)) { | ||
373 | const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; | ||
374 | |||
375 | h->packet_size = tfrc_ewma(h->packet_size, payload, 9); | ||
376 | h->bytes_recvd += payload; | ||
377 | } | ||
378 | |||
379 | /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ | ||
380 | if (!new_event) | ||
381 | tfrc_lh_update_i_mean(lh, skb); | ||
382 | |||
383 | return new_event; | ||
384 | } | 392 | } |
385 | EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); | 393 | EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); |
386 | 394 | ||
387 | /* Compute the sending rate X_recv measured between feedback intervals */ | 395 | int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) |
388 | u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) | ||
389 | { | 396 | { |
390 | u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; | 397 | int i; |
391 | s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); | ||
392 | |||
393 | WARN_ON(delta <= 0); | ||
394 | /* | ||
395 | * Ensure that the sampling interval for X_recv is at least one RTT, | ||
396 | * by extending the sampling interval backwards in time, over the last | ||
397 | * R_(m-1) seconds, as per rfc3448bis-06, 6.2. | ||
398 | * To reduce noise (e.g. when the RTT changes often), this is only | ||
399 | * done when delta is smaller than RTT/2. | ||
400 | */ | ||
401 | if (last_x_recv > 0 && delta < last_rtt/2) { | ||
402 | tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", | ||
403 | (long)delta, (unsigned)last_rtt); | ||
404 | 398 | ||
405 | delta = (bytes ? delta : 0) + last_rtt; | 399 | for (i = 0; i <= TFRC_NDUPACK; i++) { |
406 | bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); | 400 | h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); |
401 | if (h->ring[i] == NULL) | ||
402 | goto out_free; | ||
407 | } | 403 | } |
408 | 404 | ||
409 | if (unlikely(bytes == 0)) { | 405 | h->loss_count = h->loss_start = 0; |
410 | DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); | 406 | return 0; |
411 | return last_x_recv; | 407 | |
408 | out_free: | ||
409 | while (i-- != 0) { | ||
410 | kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); | ||
411 | h->ring[i] = NULL; | ||
412 | } | 412 | } |
413 | return scaled_div32(bytes, delta); | 413 | return -ENOBUFS; |
414 | } | 414 | } |
415 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); | 415 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); |
416 | 416 | ||
417 | void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) | 417 | void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) |
418 | { | 418 | { |
@@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) | |||
426 | } | 426 | } |
427 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); | 427 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); |
428 | 428 | ||
429 | static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) | 429 | /** |
430 | * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against | ||
431 | */ | ||
432 | static inline struct tfrc_rx_hist_entry * | ||
433 | tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) | ||
430 | { | 434 | { |
431 | int i; | 435 | return h->ring[0]; |
432 | |||
433 | memset(h, 0, sizeof(*h)); | ||
434 | |||
435 | for (i = 0; i <= TFRC_NDUPACK; i++) { | ||
436 | h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); | ||
437 | if (h->ring[i] == NULL) { | ||
438 | tfrc_rx_hist_purge(h); | ||
439 | return -ENOBUFS; | ||
440 | } | ||
441 | } | ||
442 | return 0; | ||
443 | } | 436 | } |
444 | 437 | ||
445 | int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) | 438 | /** |
439 | * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry | ||
440 | */ | ||
441 | static inline struct tfrc_rx_hist_entry * | ||
442 | tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) | ||
446 | { | 443 | { |
447 | if (tfrc_rx_hist_alloc(h)) | 444 | return h->ring[h->rtt_sample_prev]; |
448 | return -ENOBUFS; | ||
449 | /* | ||
450 | * Initialise first entry with GSR to start loss detection as early as | ||
451 | * possible. Code using this must not use any other fields. The entry | ||
452 | * will be overwritten once the CCID updates its received packets. | ||
453 | */ | ||
454 | tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; | ||
455 | return 0; | ||
456 | } | 445 | } |
457 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); | ||
458 | 446 | ||
459 | /** | 447 | /** |
460 | * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal | 448 | * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal |
461 | * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss | 449 | * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able |
462 | * is pending and uses the following history entries (via rtt_sample_prev): | 450 | * to compute a sample with given data - calling function should check this. |
463 | * - h->ring[0] contains the most recent history entry prior to @skb; | ||
464 | * - h->ring[1] is an unused `dummy' entry when the current difference is 0; | ||
465 | */ | 451 | */ |
466 | void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) | 452 | u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) |
467 | { | 453 | { |
468 | struct tfrc_rx_hist_entry *last = h->ring[0]; | 454 | u32 sample = 0, |
469 | u32 sample, delta_v; | 455 | delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, |
470 | 456 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); | |
471 | /* | 457 | |
472 | * When not to sample: | 458 | if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ |
473 | * - on non-data packets | 459 | if (h->rtt_sample_prev == 2) { /* previous candidate stored */ |
474 | * (RFC 4342, 8.1: CCVal only fully defined for data packets); | 460 | sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, |
475 | * - when no data packets have been received yet | 461 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); |
476 | * (FIXME: using sampled packet size as indicator here); | 462 | if (sample) |
477 | * - as long as there are gaps in the sequence space (pending loss). | 463 | sample = 4 / sample * |
478 | */ | 464 | ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, |
479 | if (!dccp_data_packet(skb) || h->packet_size == 0 || | 465 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); |
480 | tfrc_rx_hist_loss_pending(h)) | 466 | else /* |
481 | return; | 467 | * FIXME: This condition is in principle not |
468 | * possible but occurs when CCID is used for | ||
469 | * two-way data traffic. I have tried to trace | ||
470 | * it, but the cause does not seem to be here. | ||
471 | */ | ||
472 | DCCP_BUG("please report to dccp@vger.kernel.org" | ||
473 | " => prev = %u, last = %u", | ||
474 | tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, | ||
475 | tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); | ||
476 | } else if (delta_v < 1) { | ||
477 | h->rtt_sample_prev = 1; | ||
478 | goto keep_ref_for_next_time; | ||
479 | } | ||
482 | 480 | ||
483 | h->rtt_sample_prev = 0; /* reset previous candidate */ | 481 | } else if (delta_v == 4) /* optimal match */ |
482 | sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); | ||
483 | else { /* suboptimal match */ | ||
484 | h->rtt_sample_prev = 2; | ||
485 | goto keep_ref_for_next_time; | ||
486 | } | ||
484 | 487 | ||
485 | delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); | 488 | if (unlikely(sample > DCCP_SANE_RTT_MAX)) { |
486 | if (delta_v == 0) { /* less than RTT/4 difference */ | 489 | DCCP_WARN("RTT sample %u too large, using max\n", sample); |
487 | h->rtt_sample_prev = 1; | 490 | sample = DCCP_SANE_RTT_MAX; |
488 | return; | ||
489 | } | 491 | } |
490 | sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); | ||
491 | 492 | ||
492 | if (delta_v <= 4) /* between RTT/4 and RTT */ | 493 | h->rtt_sample_prev = 0; /* use current entry as next reference */ |
493 | sample *= 4 / delta_v; | 494 | keep_ref_for_next_time: |
494 | else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) | ||
495 | /* | ||
496 | * Optimisation: CCVal difference is greater than 1 RTT, yet the | ||
497 | * sample is less than the local RTT estimate; which means that | ||
498 | * the RTT estimate is too high. | ||
499 | * To avoid noise, it is not done if the sample is below RTT/2. | ||
500 | */ | ||
501 | return; | ||
502 | 495 | ||
503 | /* Use a lower weight than usual to increase responsiveness */ | 496 | return sample; |
504 | h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); | ||
505 | } | 497 | } |
506 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); | 498 | EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); |
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 555e65cd73a0..461cc91cce88 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h | |||
@@ -40,28 +40,12 @@ | |||
40 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
41 | #include "tfrc.h" | 41 | #include "tfrc.h" |
42 | 42 | ||
43 | /** | 43 | struct tfrc_tx_hist_entry; |
44 | * tfrc_tx_hist_entry - Simple singly-linked TX history list | ||
45 | * @next: next oldest entry (LIFO order) | ||
46 | * @seqno: sequence number of this entry | ||
47 | * @stamp: send time of packet with sequence number @seqno | ||
48 | */ | ||
49 | struct tfrc_tx_hist_entry { | ||
50 | struct tfrc_tx_hist_entry *next; | ||
51 | u64 seqno; | ||
52 | ktime_t stamp; | ||
53 | }; | ||
54 | |||
55 | static inline struct tfrc_tx_hist_entry * | ||
56 | tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) | ||
57 | { | ||
58 | while (head != NULL && head->seqno != seqno) | ||
59 | head = head->next; | ||
60 | return head; | ||
61 | } | ||
62 | 44 | ||
63 | extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); | 45 | extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); |
64 | extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); | 46 | extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); |
47 | extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, | ||
48 | const u64 seqno, const ktime_t now); | ||
65 | 49 | ||
66 | /* Subtraction a-b modulo-16, respects circular wrap-around */ | 50 | /* Subtraction a-b modulo-16, respects circular wrap-around */ |
67 | #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) | 51 | #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) |
@@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry { | |||
91 | * @loss_count: Number of entries in circular history | 75 | * @loss_count: Number of entries in circular history |
92 | * @loss_start: Movable index (for loss detection) | 76 | * @loss_start: Movable index (for loss detection) |
93 | * @rtt_sample_prev: Used during RTT sampling, points to candidate entry | 77 | * @rtt_sample_prev: Used during RTT sampling, points to candidate entry |
94 | * @rtt_estimate: Receiver RTT estimate | ||
95 | * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) | ||
96 | * @bytes_recvd: Number of bytes received since @bytes_start | ||
97 | * @bytes_start: Start time for counting @bytes_recvd | ||
98 | */ | 78 | */ |
99 | struct tfrc_rx_hist { | 79 | struct tfrc_rx_hist { |
100 | struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; | 80 | struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; |
101 | u8 loss_count:2, | 81 | u8 loss_count:2, |
102 | loss_start:2; | 82 | loss_start:2; |
103 | /* Receiver RTT sampling */ | ||
104 | #define rtt_sample_prev loss_start | 83 | #define rtt_sample_prev loss_start |
105 | u32 rtt_estimate; | ||
106 | /* Receiver sampling of application payload lengths */ | ||
107 | u32 packet_size, | ||
108 | bytes_recvd; | ||
109 | ktime_t bytes_start; | ||
110 | }; | 84 | }; |
111 | 85 | ||
112 | /** | 86 | /** |
@@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) | |||
150 | return h->loss_count > 0; | 124 | return h->loss_count > 0; |
151 | } | 125 | } |
152 | 126 | ||
153 | /* | ||
154 | * Accessor functions to retrieve parameters sampled by the RX history | ||
155 | */ | ||
156 | static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) | ||
157 | { | ||
158 | if (h->packet_size == 0) { | ||
159 | DCCP_WARN("No sample for s, using fallback\n"); | ||
160 | return TCP_MIN_RCVMSS; | ||
161 | } | ||
162 | return h->packet_size; | ||
163 | |||
164 | } | ||
165 | static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) | ||
166 | { | ||
167 | if (h->rtt_estimate == 0) { | ||
168 | DCCP_WARN("No RTT estimate available, using fallback RTT\n"); | ||
169 | return DCCP_FALLBACK_RTT; | ||
170 | } | ||
171 | return h->rtt_estimate; | ||
172 | } | ||
173 | |||
174 | static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) | ||
175 | { | ||
176 | h->bytes_recvd = 0; | ||
177 | h->bytes_start = ktime_get_real(); | ||
178 | } | ||
179 | |||
180 | extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); | ||
181 | |||
182 | |||
183 | extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, | 127 | extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, |
184 | const struct sk_buff *skb, const u64 ndp); | 128 | const struct sk_buff *skb, const u64 ndp); |
185 | 129 | ||
186 | extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); | 130 | extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); |
187 | 131 | ||
188 | struct tfrc_loss_hist; | 132 | struct tfrc_loss_hist; |
189 | extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, | 133 | extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, |
190 | struct tfrc_loss_hist *lh, | 134 | struct tfrc_loss_hist *lh, |
191 | struct sk_buff *skb, const u64 ndp, | 135 | struct sk_buff *skb, const u64 ndp, |
192 | u32 (*first_li)(struct sock *sk), | 136 | u32 (*first_li)(struct sock *sk), |
193 | struct sock *sk); | 137 | struct sock *sk); |
194 | extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, | 138 | extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, |
195 | const struct sk_buff *skb); | 139 | const struct sk_buff *skb); |
196 | extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); | 140 | extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); |
197 | extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); | 141 | extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); |
198 | 142 | ||
199 | #endif /* _DCCP_PKT_HIST_ */ | 143 | #endif /* _DCCP_PKT_HIST_ */ |
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ede12f53de5a..ed9857527acf 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h | |||
@@ -48,21 +48,6 @@ static inline u32 scaled_div32(u64 a, u64 b) | |||
48 | } | 48 | } |
49 | 49 | ||
50 | /** | 50 | /** |
51 | * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 | ||
52 | * Uses scaling to improve accuracy of the integer approximation of sqrt(). The | ||
53 | * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for | ||
54 | * clamped RTT samples (dccp_sample_rtt). | ||
55 | * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the | ||
56 | * scaling factor is neutralised. For this purpose, it avoids returning zero. | ||
57 | */ | ||
58 | static inline u16 tfrc_scaled_sqrt(const u32 sample) | ||
59 | { | ||
60 | const unsigned long non_zero_sample = sample ? : 1; | ||
61 | |||
62 | return int_sqrt(non_zero_sample << 10); | ||
63 | } | ||
64 | |||
65 | /** | ||
66 | * tfrc_ewma - Exponentially weighted moving average | 51 | * tfrc_ewma - Exponentially weighted moving average |
67 | * @weight: Weight to be used as damping factor, in units of 1/10 | 52 | * @weight: Weight to be used as damping factor, in units of 1/10 |
68 | */ | 53 | */ |
@@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) | |||
73 | 58 | ||
74 | extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); | 59 | extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); |
75 | extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); | 60 | extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); |
76 | extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); | ||
77 | 61 | ||
78 | extern int tfrc_tx_packet_history_init(void); | 62 | extern int tfrc_tx_packet_history_init(void); |
79 | extern void tfrc_tx_packet_history_exit(void); | 63 | extern void tfrc_tx_packet_history_exit(void); |
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 38239c4d5e14..2f20a29cffe4 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c | |||
@@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) | |||
632 | 632 | ||
633 | if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ | 633 | if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ |
634 | if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ | 634 | if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ |
635 | /* | 635 | DCCP_WARN("Value of p (%d) below resolution. " |
636 | * In the congestion-avoidance phase p decays towards 0 | 636 | "Substituting %d\n", p, TFRC_SMALLEST_P); |
637 | * when there are no further losses, so this case is | ||
638 | * natural. Truncating to p_min = 0.01% means that the | ||
639 | * maximum achievable throughput is limited to about | ||
640 | * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. | ||
641 | * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. | ||
642 | */ | ||
643 | tfrc_pr_debug("Value of p (%d) below resolution. " | ||
644 | "Substituting %d\n", p, TFRC_SMALLEST_P); | ||
645 | index = 0; | 637 | index = 0; |
646 | } else /* 0.0001 <= p <= 0.05 */ | 638 | } else /* 0.0001 <= p <= 0.05 */ |
647 | index = p/TFRC_SMALLEST_P - 1; | 639 | index = p/TFRC_SMALLEST_P - 1; |
@@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) | |||
666 | result = scaled_div(s, R); | 658 | result = scaled_div(s, R); |
667 | return scaled_div32(result, f); | 659 | return scaled_div32(result, f); |
668 | } | 660 | } |
661 | |||
669 | EXPORT_SYMBOL_GPL(tfrc_calc_x); | 662 | EXPORT_SYMBOL_GPL(tfrc_calc_x); |
670 | 663 | ||
671 | /** | 664 | /** |
@@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) | |||
700 | index = tfrc_binsearch(fvalue, 0); | 693 | index = tfrc_binsearch(fvalue, 0); |
701 | return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; | 694 | return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; |
702 | } | 695 | } |
703 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); | ||
704 | 696 | ||
705 | /** | 697 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); |
706 | * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% | ||
707 | * When @loss_event_rate is large, there is a chance that p is truncated to 0. | ||
708 | * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. | ||
709 | */ | ||
710 | u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) | ||
711 | { | ||
712 | if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ | ||
713 | return 0; | ||
714 | if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ | ||
715 | return 1000000; | ||
716 | return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); | ||
717 | } | ||
718 | EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); | ||
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5281190aa19c..b4bc6e095a0e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h | |||
@@ -42,11 +42,9 @@ | |||
42 | extern int dccp_debug; | 42 | extern int dccp_debug; |
43 | #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) | 43 | #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) |
44 | #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) | 44 | #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) |
45 | #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) | ||
46 | #else | 45 | #else |
47 | #define dccp_pr_debug(format, a...) | 46 | #define dccp_pr_debug(format, a...) |
48 | #define dccp_pr_debug_cat(format, a...) | 47 | #define dccp_pr_debug_cat(format, a...) |
49 | #define dccp_debug(format, a...) | ||
50 | #endif | 48 | #endif |
51 | 49 | ||
52 | extern struct inet_hashinfo dccp_hashinfo; | 50 | extern struct inet_hashinfo dccp_hashinfo; |
@@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
63 | * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields | 61 | * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields |
64 | * Hence a safe upper bound for the maximum option length is 1020-28 = 992 | 62 | * Hence a safe upper bound for the maximum option length is 1020-28 = 992 |
65 | */ | 63 | */ |
66 | #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) | 64 | #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) |
67 | #define DCCP_MAX_PACKET_HDR 28 | 65 | #define DCCP_MAX_PACKET_HDR 28 |
68 | #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) | 66 | #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) |
69 | #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) | 67 | #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) |
70 | 68 | ||
71 | /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ | ||
72 | #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) | ||
73 | |||
74 | #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT | 69 | #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT |
75 | * state, about 60 seconds */ | 70 | * state, about 60 seconds */ |
76 | 71 | ||
@@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
86 | */ | 81 | */ |
87 | #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) | 82 | #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) |
88 | 83 | ||
89 | /* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */ | ||
90 | #define DCCP_TIME_RESOLUTION 10 | ||
91 | |||
92 | /* | 84 | /* |
93 | * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 | 85 | * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 |
94 | */ | 86 | */ |
95 | #define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) | 87 | #define DCCP_SANE_RTT_MIN 100 |
96 | #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) | 88 | #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) |
97 | #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) | 89 | #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) |
98 | 90 | ||
@@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); | |||
103 | extern int sysctl_dccp_request_retries; | 95 | extern int sysctl_dccp_request_retries; |
104 | extern int sysctl_dccp_retries1; | 96 | extern int sysctl_dccp_retries1; |
105 | extern int sysctl_dccp_retries2; | 97 | extern int sysctl_dccp_retries2; |
98 | extern int sysctl_dccp_feat_sequence_window; | ||
99 | extern int sysctl_dccp_feat_rx_ccid; | ||
100 | extern int sysctl_dccp_feat_tx_ccid; | ||
101 | extern int sysctl_dccp_feat_ack_ratio; | ||
102 | extern int sysctl_dccp_feat_send_ack_vector; | ||
103 | extern int sysctl_dccp_feat_send_ndp_count; | ||
106 | extern int sysctl_dccp_tx_qlen; | 104 | extern int sysctl_dccp_tx_qlen; |
107 | extern int sysctl_dccp_sync_ratelimit; | 105 | extern int sysctl_dccp_sync_ratelimit; |
108 | 106 | ||
@@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
237 | extern void dccp_send_sync(struct sock *sk, const u64 seq, | 235 | extern void dccp_send_sync(struct sock *sk, const u64 seq, |
238 | const enum dccp_pkt_type pkt_type); | 236 | const enum dccp_pkt_type pkt_type); |
239 | 237 | ||
240 | /* | 238 | extern void dccp_write_xmit(struct sock *sk, int block); |
241 | * TX Packet Dequeueing Interface | ||
242 | */ | ||
243 | extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); | ||
244 | extern bool dccp_qpolicy_full(struct sock *sk); | ||
245 | extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); | ||
246 | extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); | ||
247 | extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); | ||
248 | extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); | ||
249 | |||
250 | /* | ||
251 | * TX Packet Output and TX Timers | ||
252 | */ | ||
253 | extern void dccp_write_xmit(struct sock *sk); | ||
254 | extern void dccp_write_space(struct sock *sk); | 239 | extern void dccp_write_space(struct sock *sk); |
255 | extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); | ||
256 | 240 | ||
257 | extern void dccp_init_xmit_timers(struct sock *sk); | 241 | extern void dccp_init_xmit_timers(struct sock *sk); |
258 | static inline void dccp_clear_xmit_timers(struct sock *sk) | 242 | static inline void dccp_clear_xmit_timers(struct sock *sk) |
@@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state); | |||
268 | extern void dccp_set_state(struct sock *sk, const int state); | 252 | extern void dccp_set_state(struct sock *sk, const int state); |
269 | extern void dccp_done(struct sock *sk); | 253 | extern void dccp_done(struct sock *sk); |
270 | 254 | ||
271 | extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, | 255 | extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); |
272 | struct sk_buff const *skb); | ||
273 | 256 | ||
274 | extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); | 257 | extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); |
275 | 258 | ||
@@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, | |||
334 | extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); | 317 | extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); |
335 | extern void dccp_send_close(struct sock *sk, const int active); | 318 | extern void dccp_send_close(struct sock *sk, const int active); |
336 | extern int dccp_invalid_packet(struct sk_buff *skb); | 319 | extern int dccp_invalid_packet(struct sk_buff *skb); |
337 | 320 | extern u32 dccp_sample_rtt(struct sock *sk, long delta); | |
338 | static inline u32 dccp_sane_rtt(long usec_sample) | ||
339 | { | ||
340 | if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) | ||
341 | DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); | ||
342 | return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); | ||
343 | } | ||
344 | extern u32 dccp_sample_rtt(struct sock *sk, long delta); | ||
345 | 321 | ||
346 | static inline int dccp_bad_service_code(const struct sock *sk, | 322 | static inline int dccp_bad_service_code(const struct sock *sk, |
347 | const __be32 service) | 323 | const __be32 service) |
@@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, | |||
435 | static inline void dccp_update_gsr(struct sock *sk, u64 seq) | 411 | static inline void dccp_update_gsr(struct sock *sk, u64 seq) |
436 | { | 412 | { |
437 | struct dccp_sock *dp = dccp_sk(sk); | 413 | struct dccp_sock *dp = dccp_sk(sk); |
414 | const struct dccp_minisock *dmsk = dccp_msk(sk); | ||
438 | 415 | ||
439 | dp->dccps_gsr = seq; | 416 | dp->dccps_gsr = seq; |
440 | /* Sequence validity window depends on remote Sequence Window (7.5.1) */ | 417 | dccp_set_seqno(&dp->dccps_swl, |
441 | dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); | 418 | dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); |
442 | /* | 419 | dccp_set_seqno(&dp->dccps_swh, |
443 | * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, | 420 | dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); |
444 | * 7.5.1 we perform this check beyond the initial handshake: W/W' are | ||
445 | * always > 32, so for the first W/W' packets in the lifetime of a | ||
446 | * connection we always have to adjust SWL. | ||
447 | * A second reason why we are doing this is that the window depends on | ||
448 | * the feature-remote value of Sequence Window: nothing stops the peer | ||
449 | * from updating this value while we are busy adjusting SWL for the | ||
450 | * first W packets (we would have to count from scratch again then). | ||
451 | * Therefore it is safer to always make sure that the Sequence Window | ||
452 | * is not artificially extended by a peer who grows SWL downwards by | ||
453 | * continually updating the feature-remote Sequence-Window. | ||
454 | * If sequence numbers wrap it is bad luck. But that will take a while | ||
455 | * (48 bit), and this measure prevents Sequence-number attacks. | ||
456 | */ | ||
457 | if (before48(dp->dccps_swl, dp->dccps_isr)) | ||
458 | dp->dccps_swl = dp->dccps_isr; | ||
459 | dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); | ||
460 | } | 421 | } |
461 | 422 | ||
462 | static inline void dccp_update_gss(struct sock *sk, u64 seq) | 423 | static inline void dccp_update_gss(struct sock *sk, u64 seq) |
463 | { | 424 | { |
464 | struct dccp_sock *dp = dccp_sk(sk); | 425 | struct dccp_sock *dp = dccp_sk(sk); |
465 | 426 | ||
466 | dp->dccps_gss = seq; | 427 | dp->dccps_awh = dp->dccps_gss = seq; |
467 | /* Ack validity window depends on local Sequence Window value (7.5.1) */ | 428 | dccp_set_seqno(&dp->dccps_awl, |
468 | dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); | 429 | (dp->dccps_gss - |
469 | /* Adjust AWL so that it is not below ISS - see comment above for SWL */ | 430 | dccp_msk(sk)->dccpms_sequence_window + 1)); |
470 | if (before48(dp->dccps_awl, dp->dccps_iss)) | ||
471 | dp->dccps_awl = dp->dccps_iss; | ||
472 | dp->dccps_awh = dp->dccps_gss; | ||
473 | } | ||
474 | |||
475 | static inline int dccp_ackvec_pending(const struct sock *sk) | ||
476 | { | ||
477 | return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && | ||
478 | !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); | ||
479 | } | 431 | } |
480 | 432 | ||
481 | static inline int dccp_ack_pending(const struct sock *sk) | 433 | static inline int dccp_ack_pending(const struct sock *sk) |
482 | { | 434 | { |
483 | return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); | 435 | const struct dccp_sock *dp = dccp_sk(sk); |
436 | return dp->dccps_timestamp_echo != 0 || | ||
437 | #ifdef CONFIG_IP_DCCP_ACKVEC | ||
438 | (dccp_msk(sk)->dccpms_send_ack_vector && | ||
439 | dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || | ||
440 | #endif | ||
441 | inet_csk_ack_scheduled(sk); | ||
484 | } | 442 | } |
485 | 443 | ||
486 | extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); | ||
487 | extern int dccp_feat_finalise_settings(struct dccp_sock *dp); | ||
488 | extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); | ||
489 | extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, | ||
490 | struct sk_buff *skb); | ||
491 | extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); | ||
492 | extern void dccp_feat_list_purge(struct list_head *fn_list); | ||
493 | |||
494 | extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); | 444 | extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); |
495 | extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); | 445 | extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); |
496 | extern int dccp_insert_option_elapsed_time(struct sock *sk, | 446 | extern int dccp_insert_option_elapsed_time(struct sock *sk, |
diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 93aae7c95550..d8a3509b26f6 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c | |||
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) | |||
29 | info->tcpi_backoff = icsk->icsk_backoff; | 29 | info->tcpi_backoff = icsk->icsk_backoff; |
30 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; | 30 | info->tcpi_pmtu = icsk->icsk_pmtu_cookie; |
31 | 31 | ||
32 | if (dp->dccps_hc_rx_ackvec != NULL) | 32 | if (dccp_msk(sk)->dccpms_send_ack_vector) |
33 | info->tcpi_options |= TCPI_OPT_SACK; | 33 | info->tcpi_options |= TCPI_OPT_SACK; |
34 | 34 | ||
35 | ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); | 35 | ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); |
diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f94c7c9d1a7f..933a0ecf8d46 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c | |||
@@ -1,19 +1,11 @@ | |||
1 | /* | 1 | /* |
2 | * net/dccp/feat.c | 2 | * net/dccp/feat.c |
3 | * | 3 | * |
4 | * Feature negotiation for the DCCP protocol (RFC 4340, section 6) | 4 | * An implementation of the DCCP protocol |
5 | * | 5 | * Andrea Bittau <a.bittau@cs.ucl.ac.uk> |
6 | * Copyright (c) 2008 The University of Aberdeen, Scotland, UK | ||
7 | * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> | ||
8 | * Rewrote from scratch, some bits from earlier code by | ||
9 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> | ||
10 | * | ||
11 | * | 6 | * |
12 | * ASSUMPTIONS | 7 | * ASSUMPTIONS |
13 | * ----------- | 8 | * ----------- |
14 | * o Feature negotiation is coordinated with connection setup (as in TCP), wild | ||
15 | * changes of parameters of an established connection are not supported. | ||
16 | * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. | ||
17 | * o All currently known SP features have 1-byte quantities. If in the future | 9 | * o All currently known SP features have 1-byte quantities. If in the future |
18 | * extensions of RFCs 4340..42 define features with item lengths larger than | 10 | * extensions of RFCs 4340..42 define features with item lengths larger than |
19 | * one byte, a feature-specific extension of the code will be required. | 11 | * one byte, a feature-specific extension of the code will be required. |
@@ -23,1510 +15,635 @@ | |||
23 | * as published by the Free Software Foundation; either version | 15 | * as published by the Free Software Foundation; either version |
24 | * 2 of the License, or (at your option) any later version. | 16 | * 2 of the License, or (at your option) any later version. |
25 | */ | 17 | */ |
18 | |||
26 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | |||
27 | #include "ccid.h" | 21 | #include "ccid.h" |
28 | #include "feat.h" | 22 | #include "feat.h" |
29 | 23 | ||
30 | /* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ | 24 | #define DCCP_FEAT_SP_NOAGREE (-123) |
31 | unsigned long sysctl_dccp_sequence_window __read_mostly = 100; | ||
32 | int sysctl_dccp_rx_ccid __read_mostly = 2, | ||
33 | sysctl_dccp_tx_ccid __read_mostly = 2; | ||
34 | 25 | ||
35 | /* | 26 | int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, |
36 | * Feature activation handlers. | 27 | u8 *val, u8 len, gfp_t gfp) |
37 | * | ||
38 | * These all use an u64 argument, to provide enough room for NN/SP features. At | ||
39 | * this stage the negotiated values have been checked to be within their range. | ||
40 | */ | ||
41 | static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) | ||
42 | { | 28 | { |
43 | struct dccp_sock *dp = dccp_sk(sk); | 29 | struct dccp_opt_pend *opt; |
44 | struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); | ||
45 | 30 | ||
46 | if (new_ccid == NULL) | 31 | dccp_feat_debug(type, feature, *val); |
47 | return -ENOMEM; | ||
48 | 32 | ||
49 | if (rx) { | 33 | if (len > 3) { |
50 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | 34 | DCCP_WARN("invalid length %d\n", len); |
51 | dp->dccps_hc_rx_ccid = new_ccid; | 35 | return -EINVAL; |
52 | } else { | 36 | } |
53 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | 37 | /* XXX add further sanity checks */ |
54 | dp->dccps_hc_tx_ccid = new_ccid; | 38 | |
39 | /* check if that feature is already being negotiated */ | ||
40 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | ||
41 | /* ok we found a negotiation for this option already */ | ||
42 | if (opt->dccpop_feat == feature && opt->dccpop_type == type) { | ||
43 | dccp_pr_debug("Replacing old\n"); | ||
44 | /* replace */ | ||
45 | BUG_ON(opt->dccpop_val == NULL); | ||
46 | kfree(opt->dccpop_val); | ||
47 | opt->dccpop_val = val; | ||
48 | opt->dccpop_len = len; | ||
49 | opt->dccpop_conf = 0; | ||
50 | return 0; | ||
51 | } | ||
55 | } | 52 | } |
56 | return 0; | ||
57 | } | ||
58 | 53 | ||
59 | static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) | 54 | /* negotiation for a new feature */ |
60 | { | 55 | opt = kmalloc(sizeof(*opt), gfp); |
61 | struct dccp_sock *dp = dccp_sk(sk); | 56 | if (opt == NULL) |
57 | return -ENOMEM; | ||
62 | 58 | ||
63 | if (rx) { | 59 | opt->dccpop_type = type; |
64 | dp->dccps_r_seq_win = seq_win; | 60 | opt->dccpop_feat = feature; |
65 | /* propagate changes to update SWL/SWH */ | 61 | opt->dccpop_len = len; |
66 | dccp_update_gsr(sk, dp->dccps_gsr); | 62 | opt->dccpop_val = val; |
67 | } else { | 63 | opt->dccpop_conf = 0; |
68 | dp->dccps_l_seq_win = seq_win; | 64 | opt->dccpop_sc = NULL; |
69 | /* propagate changes to update AWL */ | ||
70 | dccp_update_gss(sk, dp->dccps_gss); | ||
71 | } | ||
72 | return 0; | ||
73 | } | ||
74 | 65 | ||
75 | static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) | 66 | BUG_ON(opt->dccpop_val == NULL); |
76 | { | 67 | |
77 | #ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ | 68 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); |
78 | /* | ||
79 | * FIXME: This is required until several problems in the CCID-2 code are | ||
80 | * resolved. The CCID-2 code currently does not cope well; using dynamic | ||
81 | * Ack Ratios greater than 1 caused instabilities. These were manifest | ||
82 | * in hangups and long RTO timeouts (1...3 seconds). Until this has been | ||
83 | * stabilised, it is safer not to activate dynamic Ack Ratio changes. | ||
84 | */ | ||
85 | dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", | ||
86 | rx ? "RX" : "TX", (u16)ratio); | ||
87 | ratio = 1; | ||
88 | #endif | ||
89 | if (rx) | ||
90 | dccp_sk(sk)->dccps_r_ack_ratio = ratio; | ||
91 | else | ||
92 | dccp_sk(sk)->dccps_l_ack_ratio = ratio; | ||
93 | return 0; | 69 | return 0; |
94 | } | 70 | } |
95 | 71 | ||
96 | static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) | 72 | EXPORT_SYMBOL_GPL(dccp_feat_change); |
73 | |||
74 | static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) | ||
97 | { | 75 | { |
98 | struct dccp_sock *dp = dccp_sk(sk); | 76 | struct dccp_sock *dp = dccp_sk(sk); |
77 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
78 | /* figure out if we are changing our CCID or the peer's */ | ||
79 | const int rx = type == DCCPO_CHANGE_R; | ||
80 | const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; | ||
81 | struct ccid *new_ccid; | ||
82 | |||
83 | /* Check if nothing is being changed. */ | ||
84 | if (ccid_nr == new_ccid_nr) | ||
85 | return 0; | ||
86 | |||
87 | new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); | ||
88 | if (new_ccid == NULL) | ||
89 | return -ENOMEM; | ||
99 | 90 | ||
100 | if (rx) { | 91 | if (rx) { |
101 | if (enable && dp->dccps_hc_rx_ackvec == NULL) { | 92 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); |
102 | dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); | 93 | dp->dccps_hc_rx_ccid = new_ccid; |
103 | if (dp->dccps_hc_rx_ackvec == NULL) | 94 | dmsk->dccpms_rx_ccid = new_ccid_nr; |
104 | return -ENOMEM; | 95 | } else { |
105 | } else if (!enable) { | 96 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); |
106 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | 97 | dp->dccps_hc_tx_ccid = new_ccid; |
107 | dp->dccps_hc_rx_ackvec = NULL; | 98 | dmsk->dccpms_tx_ccid = new_ccid_nr; |
108 | } | ||
109 | } | 99 | } |
110 | return 0; | ||
111 | } | ||
112 | 100 | ||
113 | static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) | ||
114 | { | ||
115 | if (!rx) | ||
116 | dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); | ||
117 | return 0; | 101 | return 0; |
118 | } | 102 | } |
119 | 103 | ||
120 | /* | 104 | static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) |
121 | * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that | ||
122 | * `rx' holds when the sending peer informs about his partial coverage via a | ||
123 | * ChangeR() option. In the other case, we are the sender and the receiver | ||
124 | * announces its coverage via ChangeL() options. The policy here is to honour | ||
125 | * such communication by enabling the corresponding partial coverage - but only | ||
126 | * if it has not been set manually before; the warning here means that all | ||
127 | * packets will be dropped. | ||
128 | */ | ||
129 | static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) | ||
130 | { | 105 | { |
131 | struct dccp_sock *dp = dccp_sk(sk); | 106 | dccp_feat_debug(type, feat, val); |
132 | 107 | ||
133 | if (rx) | 108 | switch (feat) { |
134 | dp->dccps_pcrlen = cscov; | 109 | case DCCPF_CCID: |
135 | else { | 110 | return dccp_feat_update_ccid(sk, type, val); |
136 | if (dp->dccps_pcslen == 0) | 111 | default: |
137 | dp->dccps_pcslen = cscov; | 112 | dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", |
138 | else if (cscov > dp->dccps_pcslen) | 113 | dccp_feat_typename(type), feat); |
139 | DCCP_WARN("CsCov %u too small, peer requires >= %u\n", | 114 | break; |
140 | dp->dccps_pcslen, (u8)cscov); | ||
141 | } | 115 | } |
142 | return 0; | 116 | return 0; |
143 | } | 117 | } |
144 | 118 | ||
145 | static const struct { | 119 | static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, |
146 | u8 feat_num; /* DCCPF_xxx */ | 120 | u8 *rpref, u8 rlen) |
147 | enum dccp_feat_type rxtx; /* RX or TX */ | ||
148 | enum dccp_feat_type reconciliation; /* SP or NN */ | ||
149 | u8 default_value; /* as in 6.4 */ | ||
150 | int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); | ||
151 | /* | ||
152 | * Lookup table for location and type of features (from RFC 4340/4342) | ||
153 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
154 | * | Feature | Location | Reconc. | Initial | Section | | ||
155 | * | | RX | TX | SP | NN | Value | Reference | | ||
156 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
157 | * | DCCPF_CCID | | X | X | | 2 | 10 | | ||
158 | * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | | ||
159 | * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | | ||
160 | * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | | ||
161 | * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | | ||
162 | * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | | ||
163 | * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | | ||
164 | * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | | ||
165 | * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | | ||
166 | * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | | ||
167 | * +--------------------------+----+-----+----+----+---------+-----------+ | ||
168 | */ | ||
169 | } dccp_feat_table[] = { | ||
170 | { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, | ||
171 | { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, | ||
172 | { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, | ||
173 | { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
174 | { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, | ||
175 | { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, | ||
176 | { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, | ||
177 | { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, | ||
178 | { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
179 | { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, | ||
180 | }; | ||
181 | #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) | ||
182 | |||
183 | /** | ||
184 | * dccp_feat_index - Hash function to map feature number into array position | ||
185 | * Returns consecutive array index or -1 if the feature is not understood. | ||
186 | */ | ||
187 | static int dccp_feat_index(u8 feat_num) | ||
188 | { | 121 | { |
189 | /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ | 122 | struct dccp_sock *dp = dccp_sk(sk); |
190 | if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) | 123 | u8 *spref, slen, *res = NULL; |
191 | return feat_num - 1; | 124 | int i, j, rc, agree = 1; |
192 | 125 | ||
126 | BUG_ON(rpref == NULL); | ||
127 | |||
128 | /* check if we are the black sheep */ | ||
129 | if (dp->dccps_role == DCCP_ROLE_CLIENT) { | ||
130 | spref = rpref; | ||
131 | slen = rlen; | ||
132 | rpref = opt->dccpop_val; | ||
133 | rlen = opt->dccpop_len; | ||
134 | } else { | ||
135 | spref = opt->dccpop_val; | ||
136 | slen = opt->dccpop_len; | ||
137 | } | ||
193 | /* | 138 | /* |
194 | * Other features: add cases for new feature types here after adding | 139 | * Now we have server preference list in spref and client preference in |
195 | * them to the above table. | 140 | * rpref |
196 | */ | 141 | */ |
197 | switch (feat_num) { | 142 | BUG_ON(spref == NULL); |
198 | case DCCPF_SEND_LEV_RATE: | 143 | BUG_ON(rpref == NULL); |
199 | return DCCP_FEAT_SUPPORTED_MAX - 1; | ||
200 | } | ||
201 | return -1; | ||
202 | } | ||
203 | |||
204 | static u8 dccp_feat_type(u8 feat_num) | ||
205 | { | ||
206 | int idx = dccp_feat_index(feat_num); | ||
207 | |||
208 | if (idx < 0) | ||
209 | return FEAT_UNKNOWN; | ||
210 | return dccp_feat_table[idx].reconciliation; | ||
211 | } | ||
212 | 144 | ||
213 | static int dccp_feat_default_value(u8 feat_num) | 145 | /* FIXME sanity check vals */ |
214 | { | ||
215 | int idx = dccp_feat_index(feat_num); | ||
216 | 146 | ||
217 | return idx < 0 ? : dccp_feat_table[idx].default_value; | 147 | /* Are values in any order? XXX Lame "algorithm" here */ |
218 | } | 148 | for (i = 0; i < slen; i++) { |
219 | 149 | for (j = 0; j < rlen; j++) { | |
220 | /* | 150 | if (spref[i] == rpref[j]) { |
221 | * Debugging and verbose-printing section | 151 | res = &spref[i]; |
222 | */ | 152 | break; |
223 | static const char *dccp_feat_fname(const u8 feat) | 153 | } |
224 | { | 154 | } |
225 | static const char *feature_names[] = { | 155 | if (res) |
226 | [DCCPF_RESERVED] = "Reserved", | 156 | break; |
227 | [DCCPF_CCID] = "CCID", | ||
228 | [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", | ||
229 | [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", | ||
230 | [DCCPF_ECN_INCAPABLE] = "ECN Incapable", | ||
231 | [DCCPF_ACK_RATIO] = "Ack Ratio", | ||
232 | [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", | ||
233 | [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", | ||
234 | [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", | ||
235 | [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", | ||
236 | }; | ||
237 | if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) | ||
238 | return feature_names[DCCPF_RESERVED]; | ||
239 | |||
240 | if (feat == DCCPF_SEND_LEV_RATE) | ||
241 | return "Send Loss Event Rate"; | ||
242 | if (feat >= DCCPF_MIN_CCID_SPECIFIC) | ||
243 | return "CCID-specific"; | ||
244 | |||
245 | return feature_names[feat]; | ||
246 | } | ||
247 | |||
248 | static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", | ||
249 | "UNSTABLE", "STABLE" }; | ||
250 | |||
251 | #ifdef CONFIG_IP_DCCP_DEBUG | ||
252 | static const char *dccp_feat_oname(const u8 opt) | ||
253 | { | ||
254 | switch (opt) { | ||
255 | case DCCPO_CHANGE_L: return "Change_L"; | ||
256 | case DCCPO_CONFIRM_L: return "Confirm_L"; | ||
257 | case DCCPO_CHANGE_R: return "Change_R"; | ||
258 | case DCCPO_CONFIRM_R: return "Confirm_R"; | ||
259 | } | 157 | } |
260 | return NULL; | ||
261 | } | ||
262 | 158 | ||
263 | static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) | 159 | /* we didn't agree on anything */ |
264 | { | 160 | if (res == NULL) { |
265 | u8 i, type = dccp_feat_type(feat_num); | 161 | /* confirm previous value */ |
266 | 162 | switch (opt->dccpop_feat) { | |
267 | if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) | 163 | case DCCPF_CCID: |
268 | dccp_pr_debug_cat("(NULL)"); | 164 | /* XXX did i get this right? =P */ |
269 | else if (type == FEAT_SP) | 165 | if (opt->dccpop_type == DCCPO_CHANGE_L) |
270 | for (i = 0; i < val->sp.len; i++) | 166 | res = &dccp_msk(sk)->dccpms_tx_ccid; |
271 | dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); | 167 | else |
272 | else if (type == FEAT_NN) | 168 | res = &dccp_msk(sk)->dccpms_rx_ccid; |
273 | dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); | 169 | break; |
274 | else | ||
275 | dccp_pr_debug_cat("unknown type %u", type); | ||
276 | } | ||
277 | |||
278 | static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) | ||
279 | { | ||
280 | u8 type = dccp_feat_type(feat_num); | ||
281 | dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; | ||
282 | |||
283 | if (type == FEAT_NN) | ||
284 | fval.nn = dccp_decode_value_var(list, len); | ||
285 | dccp_feat_printval(feat_num, &fval); | ||
286 | } | ||
287 | 170 | ||
288 | static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) | 171 | default: |
289 | { | 172 | DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); |
290 | dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", | 173 | /* XXX implement res */ |
291 | dccp_feat_fname(entry->feat_num)); | 174 | return -EFAULT; |
292 | dccp_feat_printval(entry->feat_num, &entry->val); | 175 | } |
293 | dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], | ||
294 | entry->needs_confirm ? "(Confirm pending)" : ""); | ||
295 | } | ||
296 | 176 | ||
297 | #define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ | 177 | dccp_pr_debug("Don't agree... reconfirming %d\n", *res); |
298 | dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ | 178 | agree = 0; /* this is used for mandatory options... */ |
299 | dccp_feat_printvals(feat, val, len); \ | 179 | } |
300 | dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) | ||
301 | |||
302 | #define dccp_feat_print_fnlist(fn_list) { \ | ||
303 | const struct dccp_feat_entry *___entry; \ | ||
304 | \ | ||
305 | dccp_pr_debug("List Dump:\n"); \ | ||
306 | list_for_each_entry(___entry, fn_list, node) \ | ||
307 | dccp_feat_print_entry(___entry); \ | ||
308 | } | ||
309 | #else /* ! CONFIG_IP_DCCP_DEBUG */ | ||
310 | #define dccp_feat_print_opt(opt, feat, val, len, mandatory) | ||
311 | #define dccp_feat_print_fnlist(fn_list) | ||
312 | #endif | ||
313 | 180 | ||
314 | static int __dccp_feat_activate(struct sock *sk, const int idx, | 181 | /* need to put result and our preference list */ |
315 | const bool is_local, dccp_feat_val const *fval) | 182 | rlen = 1 + opt->dccpop_len; |
316 | { | 183 | rpref = kmalloc(rlen, GFP_ATOMIC); |
317 | bool rx; | 184 | if (rpref == NULL) |
318 | u64 val; | 185 | return -ENOMEM; |
319 | 186 | ||
320 | if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) | 187 | *rpref = *res; |
321 | return -1; | 188 | memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); |
322 | if (dccp_feat_table[idx].activation_hdlr == NULL) | ||
323 | return 0; | ||
324 | 189 | ||
325 | if (fval == NULL) { | 190 | /* put it in the "confirm queue" */ |
326 | val = dccp_feat_table[idx].default_value; | 191 | if (opt->dccpop_sc == NULL) { |
327 | } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { | 192 | opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); |
328 | if (fval->sp.vec == NULL) { | 193 | if (opt->dccpop_sc == NULL) { |
329 | /* | 194 | kfree(rpref); |
330 | * This can happen when an empty Confirm is sent | 195 | return -ENOMEM; |
331 | * for an SP (i.e. known) feature. In this case | ||
332 | * we would be using the default anyway. | ||
333 | */ | ||
334 | DCCP_CRIT("Feature #%d undefined: using default", idx); | ||
335 | val = dccp_feat_table[idx].default_value; | ||
336 | } else { | ||
337 | val = fval->sp.vec[0]; | ||
338 | } | 196 | } |
339 | } else { | 197 | } else { |
340 | val = fval->nn; | 198 | /* recycle the confirm slot */ |
199 | BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); | ||
200 | kfree(opt->dccpop_sc->dccpoc_val); | ||
201 | dccp_pr_debug("recycling confirm slot\n"); | ||
202 | } | ||
203 | memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); | ||
204 | |||
205 | opt->dccpop_sc->dccpoc_val = rpref; | ||
206 | opt->dccpop_sc->dccpoc_len = rlen; | ||
207 | |||
208 | /* update the option on our side [we are about to send the confirm] */ | ||
209 | rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); | ||
210 | if (rc) { | ||
211 | kfree(opt->dccpop_sc->dccpoc_val); | ||
212 | kfree(opt->dccpop_sc); | ||
213 | opt->dccpop_sc = NULL; | ||
214 | return rc; | ||
341 | } | 215 | } |
342 | 216 | ||
343 | /* Location is RX if this is a local-RX or remote-TX feature */ | 217 | dccp_pr_debug("Will confirm %d\n", *rpref); |
344 | rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); | ||
345 | |||
346 | dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", | ||
347 | dccp_feat_fname(dccp_feat_table[idx].feat_num), | ||
348 | fval ? "" : "default ", (unsigned long long)val); | ||
349 | |||
350 | return dccp_feat_table[idx].activation_hdlr(sk, val, rx); | ||
351 | } | ||
352 | |||
353 | /** | ||
354 | * dccp_feat_activate - Activate feature value on socket | ||
355 | * @sk: fully connected DCCP socket (after handshake is complete) | ||
356 | * @feat_num: feature to activate, one of %dccp_feature_numbers | ||
357 | * @local: whether local (1) or remote (0) @feat_num is meant | ||
358 | * @fval: the value (SP or NN) to activate, or NULL to use the default value | ||
359 | * For general use this function is preferable over __dccp_feat_activate(). | ||
360 | */ | ||
361 | static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, | ||
362 | dccp_feat_val const *fval) | ||
363 | { | ||
364 | return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); | ||
365 | } | ||
366 | |||
367 | /* Test for "Req'd" feature (RFC 4340, 6.4) */ | ||
368 | static inline int dccp_feat_must_be_understood(u8 feat_num) | ||
369 | { | ||
370 | return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || | ||
371 | feat_num == DCCPF_SEQUENCE_WINDOW; | ||
372 | } | ||
373 | 218 | ||
374 | /* copy constructor, fval must not already contain allocated memory */ | 219 | /* say we want to change to X but we just got a confirm X, suppress our |
375 | static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) | 220 | * change |
376 | { | 221 | */ |
377 | fval->sp.len = len; | 222 | if (!opt->dccpop_conf) { |
378 | if (fval->sp.len > 0) { | 223 | if (*opt->dccpop_val == *res) |
379 | fval->sp.vec = kmemdup(val, len, gfp_any()); | 224 | opt->dccpop_conf = 1; |
380 | if (fval->sp.vec == NULL) { | 225 | dccp_pr_debug("won't ask for change of same feature\n"); |
381 | fval->sp.len = 0; | ||
382 | return -ENOBUFS; | ||
383 | } | ||
384 | } | 226 | } |
385 | return 0; | ||
386 | } | ||
387 | 227 | ||
388 | static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) | 228 | return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ |
389 | { | ||
390 | if (unlikely(val == NULL)) | ||
391 | return; | ||
392 | if (dccp_feat_type(feat_num) == FEAT_SP) | ||
393 | kfree(val->sp.vec); | ||
394 | memset(val, 0, sizeof(*val)); | ||
395 | } | 229 | } |
396 | 230 | ||
397 | static struct dccp_feat_entry * | 231 | static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) |
398 | dccp_feat_clone_entry(struct dccp_feat_entry const *original) | ||
399 | { | 232 | { |
400 | struct dccp_feat_entry *new; | 233 | struct dccp_minisock *dmsk = dccp_msk(sk); |
401 | u8 type = dccp_feat_type(original->feat_num); | 234 | struct dccp_opt_pend *opt; |
402 | 235 | int rc = 1; | |
403 | if (type == FEAT_UNKNOWN) | 236 | u8 t; |
404 | return NULL; | ||
405 | 237 | ||
406 | new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); | 238 | /* |
407 | if (new == NULL) | 239 | * We received a CHANGE. We gotta match it against our own preference |
408 | return NULL; | 240 | * list. If we got a CHANGE_R it means it's a change for us, so we need |
241 | * to compare our CHANGE_L list. | ||
242 | */ | ||
243 | if (type == DCCPO_CHANGE_L) | ||
244 | t = DCCPO_CHANGE_R; | ||
245 | else | ||
246 | t = DCCPO_CHANGE_L; | ||
409 | 247 | ||
410 | if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, | 248 | /* find our preference list for this feature */ |
411 | original->val.sp.vec, | 249 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { |
412 | original->val.sp.len)) { | 250 | if (opt->dccpop_type != t || opt->dccpop_feat != feature) |
413 | kfree(new); | 251 | continue; |
414 | return NULL; | ||
415 | } | ||
416 | return new; | ||
417 | } | ||
418 | 252 | ||
419 | static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) | 253 | /* find the winner from the two preference lists */ |
420 | { | 254 | rc = dccp_feat_reconcile(sk, opt, val, len); |
421 | if (entry != NULL) { | 255 | break; |
422 | dccp_feat_val_destructor(entry->feat_num, &entry->val); | ||
423 | kfree(entry); | ||
424 | } | 256 | } |
425 | } | ||
426 | 257 | ||
427 | /* | 258 | /* We didn't deal with the change. This can happen if we have no |
428 | * List management functions | 259 | * preference list for the feature. In fact, it just shouldn't |
429 | * | 260 | * happen---if we understand a feature, we should have a preference list |
430 | * Feature negotiation lists rely on and maintain the following invariants: | 261 | * with at least the default value. |
431 | * - each feat_num in the list is known, i.e. we know its type and default value | 262 | */ |
432 | * - each feat_num/is_local combination is unique (old entries are overwritten) | 263 | BUG_ON(rc == 1); |
433 | * - SP values are always freshly allocated | ||
434 | * - list is sorted in increasing order of feature number (faster lookup) | ||
435 | */ | ||
436 | static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, | ||
437 | u8 feat_num, bool is_local) | ||
438 | { | ||
439 | struct dccp_feat_entry *entry; | ||
440 | 264 | ||
441 | list_for_each_entry(entry, fn_list, node) | 265 | return rc; |
442 | if (entry->feat_num == feat_num && entry->is_local == is_local) | ||
443 | return entry; | ||
444 | else if (entry->feat_num > feat_num) | ||
445 | break; | ||
446 | return NULL; | ||
447 | } | 266 | } |
448 | 267 | ||
449 | /** | 268 | static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) |
450 | * dccp_feat_entry_new - Central list update routine (called by all others) | ||
451 | * @head: list to add to | ||
452 | * @feat: feature number | ||
453 | * @local: whether the local (1) or remote feature with number @feat is meant | ||
454 | * This is the only constructor and serves to ensure the above invariants. | ||
455 | */ | ||
456 | static struct dccp_feat_entry * | ||
457 | dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) | ||
458 | { | 269 | { |
459 | struct dccp_feat_entry *entry; | 270 | struct dccp_opt_pend *opt; |
460 | 271 | struct dccp_minisock *dmsk = dccp_msk(sk); | |
461 | list_for_each_entry(entry, head, node) | 272 | u8 *copy; |
462 | if (entry->feat_num == feat && entry->is_local == local) { | 273 | int rc; |
463 | dccp_feat_val_destructor(entry->feat_num, &entry->val); | ||
464 | return entry; | ||
465 | } else if (entry->feat_num > feat) { | ||
466 | head = &entry->node; | ||
467 | break; | ||
468 | } | ||
469 | 274 | ||
470 | entry = kmalloc(sizeof(*entry), gfp_any()); | 275 | /* NN features must be Change L (sec. 6.3.2) */ |
471 | if (entry != NULL) { | 276 | if (type != DCCPO_CHANGE_L) { |
472 | entry->feat_num = feat; | 277 | dccp_pr_debug("received %s for NN feature %d\n", |
473 | entry->is_local = local; | 278 | dccp_feat_typename(type), feature); |
474 | list_add_tail(&entry->node, head); | 279 | return -EFAULT; |
475 | } | 280 | } |
476 | return entry; | ||
477 | } | ||
478 | 281 | ||
479 | /** | 282 | /* XXX sanity check opt val */ |
480 | * dccp_feat_push_change - Add/overwrite a Change option in the list | ||
481 | * @fn_list: feature-negotiation list to update | ||
482 | * @feat: one of %dccp_feature_numbers | ||
483 | * @local: whether local (1) or remote (0) @feat_num is meant | ||
484 | * @needs_mandatory: whether to use Mandatory feature negotiation options | ||
485 | * @fval: pointer to NN/SP value to be inserted (will be copied) | ||
486 | */ | ||
487 | static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, | ||
488 | u8 mandatory, dccp_feat_val *fval) | ||
489 | { | ||
490 | struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); | ||
491 | 283 | ||
492 | if (new == NULL) | 284 | /* copy option so we can confirm it */ |
285 | opt = kzalloc(sizeof(*opt), GFP_ATOMIC); | ||
286 | if (opt == NULL) | ||
493 | return -ENOMEM; | 287 | return -ENOMEM; |
494 | 288 | ||
495 | new->feat_num = feat; | 289 | copy = kmemdup(val, len, GFP_ATOMIC); |
496 | new->is_local = local; | 290 | if (copy == NULL) { |
497 | new->state = FEAT_INITIALISING; | 291 | kfree(opt); |
498 | new->needs_confirm = 0; | 292 | return -ENOMEM; |
499 | new->empty_confirm = 0; | 293 | } |
500 | new->val = *fval; | ||
501 | new->needs_mandatory = mandatory; | ||
502 | 294 | ||
503 | return 0; | 295 | opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ |
504 | } | 296 | opt->dccpop_feat = feature; |
297 | opt->dccpop_val = copy; | ||
298 | opt->dccpop_len = len; | ||
505 | 299 | ||
506 | /** | 300 | /* change feature */ |
507 | * dccp_feat_push_confirm - Add a Confirm entry to the FN list | 301 | rc = dccp_feat_update(sk, type, feature, *val); |
508 | * @fn_list: feature-negotiation list to add to | 302 | if (rc) { |
509 | * @feat: one of %dccp_feature_numbers | 303 | kfree(opt->dccpop_val); |
510 | * @local: whether local (1) or remote (0) @feat_num is being confirmed | 304 | kfree(opt); |
511 | * @fval: pointer to NN/SP value to be inserted or NULL | 305 | return rc; |
512 | * Returns 0 on success, a Reset code for further processing otherwise. | 306 | } |
513 | */ | ||
514 | static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, | ||
515 | dccp_feat_val *fval) | ||
516 | { | ||
517 | struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); | ||
518 | 307 | ||
519 | if (new == NULL) | 308 | dccp_feat_debug(type, feature, *copy); |
520 | return DCCP_RESET_CODE_TOO_BUSY; | ||
521 | 309 | ||
522 | new->feat_num = feat; | 310 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); |
523 | new->is_local = local; | ||
524 | new->state = FEAT_STABLE; /* transition in 6.6.2 */ | ||
525 | new->needs_confirm = 1; | ||
526 | new->empty_confirm = (fval == NULL); | ||
527 | new->val.nn = 0; /* zeroes the whole structure */ | ||
528 | if (!new->empty_confirm) | ||
529 | new->val = *fval; | ||
530 | new->needs_mandatory = 0; | ||
531 | 311 | ||
532 | return 0; | 312 | return 0; |
533 | } | 313 | } |
534 | 314 | ||
535 | static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) | 315 | static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, |
316 | u8 type, u8 feature) | ||
536 | { | 317 | { |
537 | return dccp_feat_push_confirm(fn_list, feat, local, NULL); | 318 | /* XXX check if other confirms for that are queued and recycle slot */ |
538 | } | 319 | struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); |
539 | 320 | ||
540 | static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) | 321 | if (opt == NULL) { |
541 | { | 322 | /* XXX what do we do? Ignoring should be fine. It's a change |
542 | list_del(&entry->node); | 323 | * after all =P |
543 | dccp_feat_entry_destructor(entry); | 324 | */ |
544 | } | 325 | return; |
545 | |||
546 | void dccp_feat_list_purge(struct list_head *fn_list) | ||
547 | { | ||
548 | struct dccp_feat_entry *entry, *next; | ||
549 | |||
550 | list_for_each_entry_safe(entry, next, fn_list, node) | ||
551 | dccp_feat_entry_destructor(entry); | ||
552 | INIT_LIST_HEAD(fn_list); | ||
553 | } | ||
554 | EXPORT_SYMBOL_GPL(dccp_feat_list_purge); | ||
555 | |||
556 | /* generate @to as full clone of @from - @to must not contain any nodes */ | ||
557 | int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) | ||
558 | { | ||
559 | struct dccp_feat_entry *entry, *new; | ||
560 | |||
561 | INIT_LIST_HEAD(to); | ||
562 | list_for_each_entry(entry, from, node) { | ||
563 | new = dccp_feat_clone_entry(entry); | ||
564 | if (new == NULL) | ||
565 | goto cloning_failed; | ||
566 | list_add_tail(&new->node, to); | ||
567 | } | 326 | } |
568 | return 0; | ||
569 | 327 | ||
570 | cloning_failed: | 328 | switch (type) { |
571 | dccp_feat_list_purge(to); | 329 | case DCCPO_CHANGE_L: |
572 | return -ENOMEM; | 330 | opt->dccpop_type = DCCPO_CONFIRM_R; |
573 | } | 331 | break; |
332 | case DCCPO_CHANGE_R: | ||
333 | opt->dccpop_type = DCCPO_CONFIRM_L; | ||
334 | break; | ||
335 | default: | ||
336 | DCCP_WARN("invalid type %d\n", type); | ||
337 | kfree(opt); | ||
338 | return; | ||
339 | } | ||
340 | opt->dccpop_feat = feature; | ||
341 | opt->dccpop_val = NULL; | ||
342 | opt->dccpop_len = 0; | ||
574 | 343 | ||
575 | /** | 344 | /* change feature */ |
576 | * dccp_feat_valid_nn_length - Enforce length constraints on NN options | 345 | dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); |
577 | * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, | ||
578 | * incoming options are accepted as long as their values are valid. | ||
579 | */ | ||
580 | static u8 dccp_feat_valid_nn_length(u8 feat_num) | ||
581 | { | ||
582 | if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ | ||
583 | return 2; | ||
584 | if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ | ||
585 | return 6; | ||
586 | return 0; | ||
587 | } | ||
588 | 346 | ||
589 | static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) | 347 | list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); |
590 | { | ||
591 | switch (feat_num) { | ||
592 | case DCCPF_ACK_RATIO: | ||
593 | return val <= DCCPF_ACK_RATIO_MAX; | ||
594 | case DCCPF_SEQUENCE_WINDOW: | ||
595 | return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; | ||
596 | } | ||
597 | return 0; /* feature unknown - so we can't tell */ | ||
598 | } | 348 | } |
599 | 349 | ||
600 | /* check that SP values are within the ranges defined in RFC 4340 */ | 350 | static void dccp_feat_flush_confirm(struct sock *sk) |
601 | static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) | ||
602 | { | 351 | { |
603 | switch (feat_num) { | 352 | struct dccp_minisock *dmsk = dccp_msk(sk); |
604 | case DCCPF_CCID: | 353 | /* Check if there is anything to confirm in the first place */ |
605 | return val == DCCPC_CCID2 || val == DCCPC_CCID3; | 354 | int yes = !list_empty(&dmsk->dccpms_conf); |
606 | /* Type-check Boolean feature values: */ | ||
607 | case DCCPF_SHORT_SEQNOS: | ||
608 | case DCCPF_ECN_INCAPABLE: | ||
609 | case DCCPF_SEND_ACK_VECTOR: | ||
610 | case DCCPF_SEND_NDP_COUNT: | ||
611 | case DCCPF_DATA_CHECKSUM: | ||
612 | case DCCPF_SEND_LEV_RATE: | ||
613 | return val < 2; | ||
614 | case DCCPF_MIN_CSUM_COVER: | ||
615 | return val < 16; | ||
616 | } | ||
617 | return 0; /* feature unknown */ | ||
618 | } | ||
619 | 355 | ||
620 | static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) | 356 | if (!yes) { |
621 | { | 357 | struct dccp_opt_pend *opt; |
622 | if (sp_list == NULL || sp_len < 1) | ||
623 | return 0; | ||
624 | while (sp_len--) | ||
625 | if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) | ||
626 | return 0; | ||
627 | return 1; | ||
628 | } | ||
629 | 358 | ||
630 | /** | 359 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { |
631 | * dccp_feat_insert_opts - Generate FN options from current list state | 360 | if (opt->dccpop_conf) { |
632 | * @skb: next sk_buff to be sent to the peer | 361 | yes = 1; |
633 | * @dp: for client during handshake and general negotiation | 362 | break; |
634 | * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) | ||
635 | */ | ||
636 | int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, | ||
637 | struct sk_buff *skb) | ||
638 | { | ||
639 | struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; | ||
640 | struct dccp_feat_entry *pos, *next; | ||
641 | u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; | ||
642 | bool rpt; | ||
643 | |||
644 | /* put entries into @skb in the order they appear in the list */ | ||
645 | list_for_each_entry_safe_reverse(pos, next, fn, node) { | ||
646 | opt = dccp_feat_genopt(pos); | ||
647 | type = dccp_feat_type(pos->feat_num); | ||
648 | rpt = false; | ||
649 | |||
650 | if (pos->empty_confirm) { | ||
651 | len = 0; | ||
652 | ptr = NULL; | ||
653 | } else { | ||
654 | if (type == FEAT_SP) { | ||
655 | len = pos->val.sp.len; | ||
656 | ptr = pos->val.sp.vec; | ||
657 | rpt = pos->needs_confirm; | ||
658 | } else if (type == FEAT_NN) { | ||
659 | len = dccp_feat_valid_nn_length(pos->feat_num); | ||
660 | ptr = nn_in_nbo; | ||
661 | dccp_encode_value_var(pos->val.nn, ptr, len); | ||
662 | } else { | ||
663 | DCCP_BUG("unknown feature %u", pos->feat_num); | ||
664 | return -1; | ||
665 | } | 363 | } |
666 | } | 364 | } |
667 | dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); | ||
668 | |||
669 | if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) | ||
670 | return -1; | ||
671 | if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) | ||
672 | return -1; | ||
673 | /* | ||
674 | * Enter CHANGING after transmitting the Change option (6.6.2). | ||
675 | */ | ||
676 | if (pos->state == FEAT_INITIALISING) | ||
677 | pos->state = FEAT_CHANGING; | ||
678 | } | 365 | } |
679 | return 0; | ||
680 | } | ||
681 | |||
682 | /** | ||
683 | * __feat_register_nn - Register new NN value on socket | ||
684 | * @fn: feature-negotiation list to register with | ||
685 | * @feat: an NN feature from %dccp_feature_numbers | ||
686 | * @mandatory: use Mandatory option if 1 | ||
687 | * @nn_val: value to register (restricted to 4 bytes) | ||
688 | * Note that NN features are local by definition (RFC 4340, 6.3.2). | ||
689 | */ | ||
690 | static int __feat_register_nn(struct list_head *fn, u8 feat, | ||
691 | u8 mandatory, u64 nn_val) | ||
692 | { | ||
693 | dccp_feat_val fval = { .nn = nn_val }; | ||
694 | |||
695 | if (dccp_feat_type(feat) != FEAT_NN || | ||
696 | !dccp_feat_is_valid_nn_val(feat, nn_val)) | ||
697 | return -EINVAL; | ||
698 | |||
699 | /* Don't bother with default values, they will be activated anyway. */ | ||
700 | if (nn_val - (u64)dccp_feat_default_value(feat) == 0) | ||
701 | return 0; | ||
702 | |||
703 | return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); | ||
704 | } | ||
705 | |||
706 | /** | ||
707 | * __feat_register_sp - Register new SP value/list on socket | ||
708 | * @fn: feature-negotiation list to register with | ||
709 | * @feat: an SP feature from %dccp_feature_numbers | ||
710 | * @is_local: whether the local (1) or the remote (0) @feat is meant | ||
711 | * @mandatory: use Mandatory option if 1 | ||
712 | * @sp_val: SP value followed by optional preference list | ||
713 | * @sp_len: length of @sp_val in bytes | ||
714 | */ | ||
715 | static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, | ||
716 | u8 mandatory, u8 const *sp_val, u8 sp_len) | ||
717 | { | ||
718 | dccp_feat_val fval; | ||
719 | 366 | ||
720 | if (dccp_feat_type(feat) != FEAT_SP || | 367 | if (!yes) |
721 | !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) | 368 | return; |
722 | return -EINVAL; | ||
723 | |||
724 | /* Avoid negotiating alien CCIDs by only advertising supported ones */ | ||
725 | if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) | ||
726 | return -EOPNOTSUPP; | ||
727 | |||
728 | if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) | ||
729 | return -ENOMEM; | ||
730 | 369 | ||
731 | return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); | 370 | /* OK there is something to confirm... */ |
371 | /* XXX check if packet is in flight? Send delayed ack?? */ | ||
372 | if (sk->sk_state == DCCP_OPEN) | ||
373 | dccp_send_ack(sk); | ||
732 | } | 374 | } |
733 | 375 | ||
734 | /** | 376 | int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) |
735 | * dccp_feat_register_sp - Register requests to change SP feature values | ||
736 | * @sk: client or listening socket | ||
737 | * @feat: one of %dccp_feature_numbers | ||
738 | * @is_local: whether the local (1) or remote (0) @feat is meant | ||
739 | * @list: array of preferred values, in descending order of preference | ||
740 | * @len: length of @list in bytes | ||
741 | */ | ||
742 | int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, | ||
743 | u8 const *list, u8 len) | ||
744 | { /* any changes must be registered before establishing the connection */ | ||
745 | if (sk->sk_state != DCCP_CLOSED) | ||
746 | return -EISCONN; | ||
747 | if (dccp_feat_type(feat) != FEAT_SP) | ||
748 | return -EINVAL; | ||
749 | return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, | ||
750 | 0, list, len); | ||
751 | } | ||
752 | |||
753 | /* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ | ||
754 | int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) | ||
755 | { | 377 | { |
756 | /* any changes must be registered before establishing the connection */ | 378 | int rc; |
757 | if (sk->sk_state != DCCP_CLOSED) | ||
758 | return -EISCONN; | ||
759 | if (dccp_feat_type(feat) != FEAT_NN) | ||
760 | return -EINVAL; | ||
761 | return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); | ||
762 | } | ||
763 | 379 | ||
764 | /** | 380 | dccp_feat_debug(type, feature, *val); |
765 | * dccp_feat_signal_nn_change - Update NN values for an established connection | ||
766 | * @sk: DCCP socket of an established connection | ||
767 | * @feat: NN feature number from %dccp_feature_numbers | ||
768 | * @nn_val: the new value to use | ||
769 | * This function is used to communicate NN updates out-of-band. The difference | ||
770 | * to feature negotiation during connection setup is that values are activated | ||
771 | * immediately after validation, i.e. we don't wait for the Confirm: either the | ||
772 | * value is accepted by the peer (and then the waiting is futile), or it is not | ||
773 | * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values | ||
774 | * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). | ||
775 | */ | ||
776 | int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) | ||
777 | { | ||
778 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | ||
779 | dccp_feat_val fval = { .nn = nn_val }; | ||
780 | struct dccp_feat_entry *entry; | ||
781 | 381 | ||
782 | if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) | 382 | /* figure out if it's SP or NN feature */ |
783 | return 0; | 383 | switch (feature) { |
384 | /* deal with SP features */ | ||
385 | case DCCPF_CCID: | ||
386 | rc = dccp_feat_sp(sk, type, feature, val, len); | ||
387 | break; | ||
784 | 388 | ||
785 | if (dccp_feat_type(feat) != FEAT_NN || | 389 | /* deal with NN features */ |
786 | !dccp_feat_is_valid_nn_val(feat, nn_val)) | 390 | case DCCPF_ACK_RATIO: |
787 | return -EINVAL; | 391 | rc = dccp_feat_nn(sk, type, feature, val, len); |
392 | break; | ||
788 | 393 | ||
789 | entry = dccp_feat_list_lookup(fn, feat, 1); | 394 | /* XXX implement other features */ |
790 | if (entry != NULL) { | 395 | default: |
791 | dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", | 396 | dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", |
792 | (unsigned long long)nn_val, | 397 | dccp_feat_typename(type), feature); |
793 | (unsigned long long)entry->val.nn, | 398 | rc = -EFAULT; |
794 | dccp_feat_sname[entry->state]); | 399 | break; |
795 | return 0; | ||
796 | } | 400 | } |
797 | 401 | ||
798 | if (dccp_feat_activate(sk, feat, 1, &fval)) | 402 | /* check if there were problems changing features */ |
799 | return -EADV; | 403 | if (rc) { |
800 | 404 | /* If we don't agree on SP, we sent a confirm for old value. | |
801 | inet_csk_schedule_ack(sk); | 405 | * However we propagate rc to caller in case option was |
802 | return dccp_feat_push_change(fn, feat, 1, 0, &fval); | 406 | * mandatory |
803 | } | ||
804 | EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); | ||
805 | |||
806 | /* | ||
807 | * Tracking features whose value depend on the choice of CCID | ||
808 | * | ||
809 | * This is designed with an extension in mind so that a list walk could be done | ||
810 | * before activating any features. However, the existing framework was found to | ||
811 | * work satisfactorily up until now, the automatic verification is left open. | ||
812 | * When adding new CCIDs, add a corresponding dependency table here. | ||
813 | */ | ||
814 | static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) | ||
815 | { | ||
816 | static const struct ccid_dependency ccid2_dependencies[2][2] = { | ||
817 | /* | ||
818 | * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX | ||
819 | * feature and Send Ack Vector is an RX feature, `is_local' | ||
820 | * needs to be reversed. | ||
821 | */ | 407 | */ |
822 | { /* Dependencies of the receiver-side (remote) CCID2 */ | 408 | if (rc != DCCP_FEAT_SP_NOAGREE) |
823 | { | 409 | dccp_feat_empty_confirm(dccp_msk(sk), type, feature); |
824 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
825 | .is_local = true, | ||
826 | .is_mandatory = true, | ||
827 | .val = 1 | ||
828 | }, | ||
829 | { 0, 0, 0, 0 } | ||
830 | }, | ||
831 | { /* Dependencies of the sender-side (local) CCID2 */ | ||
832 | { | ||
833 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
834 | .is_local = false, | ||
835 | .is_mandatory = true, | ||
836 | .val = 1 | ||
837 | }, | ||
838 | { 0, 0, 0, 0 } | ||
839 | } | ||
840 | }; | ||
841 | static const struct ccid_dependency ccid3_dependencies[2][5] = { | ||
842 | { /* | ||
843 | * Dependencies of the receiver-side CCID3 | ||
844 | */ | ||
845 | { /* locally disable Ack Vectors */ | ||
846 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
847 | .is_local = true, | ||
848 | .is_mandatory = false, | ||
849 | .val = 0 | ||
850 | }, | ||
851 | { /* see below why Send Loss Event Rate is on */ | ||
852 | .dependent_feat = DCCPF_SEND_LEV_RATE, | ||
853 | .is_local = true, | ||
854 | .is_mandatory = true, | ||
855 | .val = 1 | ||
856 | }, | ||
857 | { /* NDP Count is needed as per RFC 4342, 6.1.1 */ | ||
858 | .dependent_feat = DCCPF_SEND_NDP_COUNT, | ||
859 | .is_local = false, | ||
860 | .is_mandatory = true, | ||
861 | .val = 1 | ||
862 | }, | ||
863 | { 0, 0, 0, 0 }, | ||
864 | }, | ||
865 | { /* | ||
866 | * CCID3 at the TX side: we request that the HC-receiver | ||
867 | * will not send Ack Vectors (they will be ignored, so | ||
868 | * Mandatory is not set); we enable Send Loss Event Rate | ||
869 | * (Mandatory since the implementation does not support | ||
870 | * the Loss Intervals option of RFC 4342, 8.6). | ||
871 | * The last two options are for peer's information only. | ||
872 | */ | ||
873 | { | ||
874 | .dependent_feat = DCCPF_SEND_ACK_VECTOR, | ||
875 | .is_local = false, | ||
876 | .is_mandatory = false, | ||
877 | .val = 0 | ||
878 | }, | ||
879 | { | ||
880 | .dependent_feat = DCCPF_SEND_LEV_RATE, | ||
881 | .is_local = false, | ||
882 | .is_mandatory = true, | ||
883 | .val = 1 | ||
884 | }, | ||
885 | { /* this CCID does not support Ack Ratio */ | ||
886 | .dependent_feat = DCCPF_ACK_RATIO, | ||
887 | .is_local = true, | ||
888 | .is_mandatory = false, | ||
889 | .val = 0 | ||
890 | }, | ||
891 | { /* tell receiver we are sending NDP counts */ | ||
892 | .dependent_feat = DCCPF_SEND_NDP_COUNT, | ||
893 | .is_local = true, | ||
894 | .is_mandatory = false, | ||
895 | .val = 1 | ||
896 | }, | ||
897 | { 0, 0, 0, 0 } | ||
898 | } | ||
899 | }; | ||
900 | switch (ccid) { | ||
901 | case DCCPC_CCID2: | ||
902 | return ccid2_dependencies[is_local]; | ||
903 | case DCCPC_CCID3: | ||
904 | return ccid3_dependencies[is_local]; | ||
905 | default: | ||
906 | return NULL; | ||
907 | } | 410 | } |
908 | } | ||
909 | 411 | ||
910 | /** | 412 | /* generate the confirm [if required] */ |
911 | * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID | 413 | dccp_feat_flush_confirm(sk); |
912 | * @fn: feature-negotiation list to update | ||
913 | * @id: CCID number to track | ||
914 | * @is_local: whether TX CCID (1) or RX CCID (0) is meant | ||
915 | * This function needs to be called after registering all other features. | ||
916 | */ | ||
917 | static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) | ||
918 | { | ||
919 | const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); | ||
920 | int i, rc = (table == NULL); | ||
921 | |||
922 | for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) | ||
923 | if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) | ||
924 | rc = __feat_register_sp(fn, table[i].dependent_feat, | ||
925 | table[i].is_local, | ||
926 | table[i].is_mandatory, | ||
927 | &table[i].val, 1); | ||
928 | else | ||
929 | rc = __feat_register_nn(fn, table[i].dependent_feat, | ||
930 | table[i].is_mandatory, | ||
931 | table[i].val); | ||
932 | return rc; | ||
933 | } | ||
934 | |||
935 | /** | ||
936 | * dccp_feat_finalise_settings - Finalise settings before starting negotiation | ||
937 | * @dp: client or listening socket (settings will be inherited) | ||
938 | * This is called after all registrations (socket initialisation, sysctls, and | ||
939 | * sockopt calls), and before sending the first packet containing Change options | ||
940 | * (ie. client-Request or server-Response), to ensure internal consistency. | ||
941 | */ | ||
942 | int dccp_feat_finalise_settings(struct dccp_sock *dp) | ||
943 | { | ||
944 | struct list_head *fn = &dp->dccps_featneg; | ||
945 | struct dccp_feat_entry *entry; | ||
946 | int i = 2, ccids[2] = { -1, -1 }; | ||
947 | 414 | ||
948 | /* | 415 | return rc; |
949 | * Propagating CCIDs: | ||
950 | * 1) not useful to propagate CCID settings if this host advertises more | ||
951 | * than one CCID: the choice of CCID may still change - if this is | ||
952 | * the client, or if this is the server and the client sends | ||
953 | * singleton CCID values. | ||
954 | * 2) since is that propagate_ccid changes the list, we defer changing | ||
955 | * the sorted list until after the traversal. | ||
956 | */ | ||
957 | list_for_each_entry(entry, fn, node) | ||
958 | if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) | ||
959 | ccids[entry->is_local] = entry->val.sp.vec[0]; | ||
960 | while (i--) | ||
961 | if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) | ||
962 | return -1; | ||
963 | dccp_feat_print_fnlist(fn); | ||
964 | return 0; | ||
965 | } | 416 | } |
966 | 417 | ||
967 | /** | 418 | EXPORT_SYMBOL_GPL(dccp_feat_change_recv); |
968 | * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features | ||
969 | * It is the server which resolves the dependencies once the CCID has been | ||
970 | * fully negotiated. If no CCID has been negotiated, it uses the default CCID. | ||
971 | */ | ||
972 | int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) | ||
973 | { | ||
974 | struct list_head *fn = &dreq->dreq_featneg; | ||
975 | struct dccp_feat_entry *entry; | ||
976 | u8 is_local, ccid; | ||
977 | |||
978 | for (is_local = 0; is_local <= 1; is_local++) { | ||
979 | entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); | ||
980 | |||
981 | if (entry != NULL && !entry->empty_confirm) | ||
982 | ccid = entry->val.sp.vec[0]; | ||
983 | else | ||
984 | ccid = dccp_feat_default_value(DCCPF_CCID); | ||
985 | |||
986 | if (dccp_feat_propagate_ccid(fn, ccid, is_local)) | ||
987 | return -1; | ||
988 | } | ||
989 | return 0; | ||
990 | } | ||
991 | 419 | ||
992 | /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ | 420 | int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, |
993 | static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) | 421 | u8 *val, u8 len) |
994 | { | 422 | { |
995 | u8 c, s; | 423 | u8 t; |
424 | struct dccp_opt_pend *opt; | ||
425 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
426 | int found = 0; | ||
427 | int all_confirmed = 1; | ||
996 | 428 | ||
997 | for (s = 0; s < slen; s++) | 429 | dccp_feat_debug(type, feature, *val); |
998 | for (c = 0; c < clen; c++) | ||
999 | if (servlist[s] == clilist[c]) | ||
1000 | return servlist[s]; | ||
1001 | return -1; | ||
1002 | } | ||
1003 | 430 | ||
1004 | /** | 431 | /* locate our change request */ |
1005 | * dccp_feat_prefer - Move preferred entry to the start of array | 432 | switch (type) { |
1006 | * Reorder the @array_len elements in @array so that @preferred_value comes | 433 | case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; |
1007 | * first. Returns >0 to indicate that @preferred_value does occur in @array. | 434 | case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; |
1008 | */ | 435 | default: DCCP_WARN("invalid type %d\n", type); |
1009 | static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) | 436 | return 1; |
1010 | { | ||
1011 | u8 i, does_occur = 0; | ||
1012 | 437 | ||
1013 | if (array != NULL) { | ||
1014 | for (i = 0; i < array_len; i++) | ||
1015 | if (array[i] == preferred_value) { | ||
1016 | array[i] = array[0]; | ||
1017 | does_occur++; | ||
1018 | } | ||
1019 | if (does_occur) | ||
1020 | array[0] = preferred_value; | ||
1021 | } | 438 | } |
1022 | return does_occur; | 439 | /* XXX sanity check feature value */ |
1023 | } | ||
1024 | 440 | ||
1025 | /** | 441 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { |
1026 | * dccp_feat_reconcile - Reconcile SP preference lists | 442 | if (!opt->dccpop_conf && opt->dccpop_type == t && |
1027 | * @fval: SP list to reconcile into | 443 | opt->dccpop_feat == feature) { |
1028 | * @arr: received SP preference list | 444 | found = 1; |
1029 | * @len: length of @arr in bytes | 445 | dccp_pr_debug("feature %d found\n", opt->dccpop_feat); |
1030 | * @is_server: whether this side is the server (and @fv is the server's list) | ||
1031 | * @reorder: whether to reorder the list in @fv after reconciling with @arr | ||
1032 | * When successful, > 0 is returned and the reconciled list is in @fval. | ||
1033 | * A value of 0 means that negotiation failed (no shared entry). | ||
1034 | */ | ||
1035 | static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, | ||
1036 | bool is_server, bool reorder) | ||
1037 | { | ||
1038 | int rc; | ||
1039 | 446 | ||
1040 | if (!fv->sp.vec || !arr) { | 447 | /* XXX do sanity check */ |
1041 | DCCP_CRIT("NULL feature value or array"); | ||
1042 | return 0; | ||
1043 | } | ||
1044 | 448 | ||
1045 | if (is_server) | 449 | opt->dccpop_conf = 1; |
1046 | rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); | ||
1047 | else | ||
1048 | rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); | ||
1049 | |||
1050 | if (!reorder) | ||
1051 | return rc; | ||
1052 | if (rc < 0) | ||
1053 | return 0; | ||
1054 | 450 | ||
1055 | /* | 451 | /* We got a confirmation---change the option */ |
1056 | * Reorder list: used for activating features and in dccp_insert_fn_opt. | 452 | dccp_feat_update(sk, opt->dccpop_type, |
1057 | */ | 453 | opt->dccpop_feat, *val); |
1058 | return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); | ||
1059 | } | ||
1060 | 454 | ||
1061 | /** | 455 | /* XXX check the return value of dccp_feat_update */ |
1062 | * dccp_feat_change_recv - Process incoming ChangeL/R options | 456 | break; |
1063 | * @fn: feature-negotiation list to update | 457 | } |
1064 | * @is_mandatory: whether the Change was preceded by a Mandatory option | ||
1065 | * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R | ||
1066 | * @feat: one of %dccp_feature_numbers | ||
1067 | * @val: NN value or SP value/preference list | ||
1068 | * @len: length of @val in bytes | ||
1069 | * @server: whether this node is the server (1) or the client (0) | ||
1070 | */ | ||
1071 | static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, | ||
1072 | u8 feat, u8 *val, u8 len, const bool server) | ||
1073 | { | ||
1074 | u8 defval, type = dccp_feat_type(feat); | ||
1075 | const bool local = (opt == DCCPO_CHANGE_R); | ||
1076 | struct dccp_feat_entry *entry; | ||
1077 | dccp_feat_val fval; | ||
1078 | |||
1079 | if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ | ||
1080 | goto unknown_feature_or_value; | ||
1081 | |||
1082 | dccp_feat_print_opt(opt, feat, val, len, is_mandatory); | ||
1083 | |||
1084 | /* | ||
1085 | * Negotiation of NN features: Change R is invalid, so there is no | ||
1086 | * simultaneous negotiation; hence we do not look up in the list. | ||
1087 | */ | ||
1088 | if (type == FEAT_NN) { | ||
1089 | if (local || len > sizeof(fval.nn)) | ||
1090 | goto unknown_feature_or_value; | ||
1091 | |||
1092 | /* 6.3.2: "The feature remote MUST accept any valid value..." */ | ||
1093 | fval.nn = dccp_decode_value_var(val, len); | ||
1094 | if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) | ||
1095 | goto unknown_feature_or_value; | ||
1096 | 458 | ||
1097 | return dccp_feat_push_confirm(fn, feat, local, &fval); | 459 | if (!opt->dccpop_conf) |
460 | all_confirmed = 0; | ||
1098 | } | 461 | } |
1099 | 462 | ||
1100 | /* | 463 | /* fix re-transmit timer */ |
1101 | * Unidirectional/simultaneous negotiation of SP features (6.3.1) | 464 | /* XXX gotta make sure that no option negotiation occurs during |
465 | * connection shutdown. Consider that the CLOSEREQ is sent and timer is | ||
466 | * on. if all options are confirmed it might kill timer which should | ||
467 | * remain alive until close is received. | ||
1102 | */ | 468 | */ |
1103 | entry = dccp_feat_list_lookup(fn, feat, local); | 469 | if (all_confirmed) { |
1104 | if (entry == NULL) { | 470 | dccp_pr_debug("clear feat negotiation timer %p\n", sk); |
1105 | /* | 471 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
1106 | * No particular preferences have been registered. We deal with | ||
1107 | * this situation by assuming that all valid values are equally | ||
1108 | * acceptable, and apply the following checks: | ||
1109 | * - if the peer's list is a singleton, we accept a valid value; | ||
1110 | * - if we are the server, we first try to see if the peer (the | ||
1111 | * client) advertises the default value. If yes, we use it, | ||
1112 | * otherwise we accept the preferred value; | ||
1113 | * - else if we are the client, we use the first list element. | ||
1114 | */ | ||
1115 | if (dccp_feat_clone_sp_val(&fval, val, 1)) | ||
1116 | return DCCP_RESET_CODE_TOO_BUSY; | ||
1117 | |||
1118 | if (len > 1 && server) { | ||
1119 | defval = dccp_feat_default_value(feat); | ||
1120 | if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) | ||
1121 | fval.sp.vec[0] = defval; | ||
1122 | } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { | ||
1123 | kfree(fval.sp.vec); | ||
1124 | goto unknown_feature_or_value; | ||
1125 | } | ||
1126 | |||
1127 | /* Treat unsupported CCIDs like invalid values */ | ||
1128 | if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { | ||
1129 | kfree(fval.sp.vec); | ||
1130 | goto not_valid_or_not_known; | ||
1131 | } | ||
1132 | |||
1133 | return dccp_feat_push_confirm(fn, feat, local, &fval); | ||
1134 | |||
1135 | } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ | ||
1136 | return 0; | ||
1137 | } | 472 | } |
1138 | 473 | ||
1139 | if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { | 474 | if (!found) |
1140 | entry->empty_confirm = 0; | 475 | dccp_pr_debug("%s(%d, ...) never requested\n", |
1141 | } else if (is_mandatory) { | 476 | dccp_feat_typename(type), feature); |
1142 | return DCCP_RESET_CODE_MANDATORY_ERROR; | ||
1143 | } else if (entry->state == FEAT_INITIALISING) { | ||
1144 | /* | ||
1145 | * Failed simultaneous negotiation (server only): try to `save' | ||
1146 | * the connection by checking whether entry contains the default | ||
1147 | * value for @feat. If yes, send an empty Confirm to signal that | ||
1148 | * the received Change was not understood - which implies using | ||
1149 | * the default value. | ||
1150 | * If this also fails, we use Reset as the last resort. | ||
1151 | */ | ||
1152 | WARN_ON(!server); | ||
1153 | defval = dccp_feat_default_value(feat); | ||
1154 | if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) | ||
1155 | return DCCP_RESET_CODE_OPTION_ERROR; | ||
1156 | entry->empty_confirm = 1; | ||
1157 | } | ||
1158 | entry->needs_confirm = 1; | ||
1159 | entry->needs_mandatory = 0; | ||
1160 | entry->state = FEAT_STABLE; | ||
1161 | return 0; | 477 | return 0; |
1162 | |||
1163 | unknown_feature_or_value: | ||
1164 | if (!is_mandatory) | ||
1165 | return dccp_push_empty_confirm(fn, feat, local); | ||
1166 | |||
1167 | not_valid_or_not_known: | ||
1168 | return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR | ||
1169 | : DCCP_RESET_CODE_OPTION_ERROR; | ||
1170 | } | 478 | } |
1171 | 479 | ||
1172 | /** | 480 | EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); |
1173 | * dccp_feat_confirm_recv - Process received Confirm options | ||
1174 | * @fn: feature-negotiation list to update | ||
1175 | * @is_mandatory: whether @opt was preceded by a Mandatory option | ||
1176 | * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R | ||
1177 | * @feat: one of %dccp_feature_numbers | ||
1178 | * @val: NN value or SP value/preference list | ||
1179 | * @len: length of @val in bytes | ||
1180 | * @server: whether this node is server (1) or client (0) | ||
1181 | */ | ||
1182 | static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, | ||
1183 | u8 feat, u8 *val, u8 len, const bool server) | ||
1184 | { | ||
1185 | u8 *plist, plen, type = dccp_feat_type(feat); | ||
1186 | const bool local = (opt == DCCPO_CONFIRM_R); | ||
1187 | struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); | ||
1188 | |||
1189 | dccp_feat_print_opt(opt, feat, val, len, is_mandatory); | ||
1190 | |||
1191 | if (entry == NULL) { /* nothing queued: ignore or handle error */ | ||
1192 | if (is_mandatory && type == FEAT_UNKNOWN) | ||
1193 | return DCCP_RESET_CODE_MANDATORY_ERROR; | ||
1194 | |||
1195 | if (!local && type == FEAT_NN) /* 6.3.2 */ | ||
1196 | goto confirmation_failed; | ||
1197 | return 0; | ||
1198 | } | ||
1199 | |||
1200 | if (entry->state != FEAT_CHANGING) /* 6.6.2 */ | ||
1201 | return 0; | ||
1202 | |||
1203 | if (len == 0) { | ||
1204 | if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ | ||
1205 | goto confirmation_failed; | ||
1206 | /* | ||
1207 | * Empty Confirm during connection setup: this means reverting | ||
1208 | * to the `old' value, which in this case is the default. Since | ||
1209 | * we handle default values automatically when no other values | ||
1210 | * have been set, we revert to the old value by removing this | ||
1211 | * entry from the list. | ||
1212 | */ | ||
1213 | dccp_feat_list_pop(entry); | ||
1214 | return 0; | ||
1215 | } | ||
1216 | 481 | ||
1217 | if (type == FEAT_NN) { | 482 | void dccp_feat_clean(struct dccp_minisock *dmsk) |
1218 | if (len > sizeof(entry->val.nn)) | 483 | { |
1219 | goto confirmation_failed; | 484 | struct dccp_opt_pend *opt, *next; |
1220 | 485 | ||
1221 | if (entry->val.nn == dccp_decode_value_var(val, len)) | 486 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, |
1222 | goto confirmation_succeeded; | 487 | dccpop_node) { |
488 | BUG_ON(opt->dccpop_val == NULL); | ||
489 | kfree(opt->dccpop_val); | ||
1223 | 490 | ||
1224 | DCCP_WARN("Bogus Confirm for non-existing value\n"); | 491 | if (opt->dccpop_sc != NULL) { |
1225 | goto confirmation_failed; | 492 | BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); |
1226 | } | 493 | kfree(opt->dccpop_sc->dccpoc_val); |
494 | kfree(opt->dccpop_sc); | ||
495 | } | ||
1227 | 496 | ||
1228 | /* | 497 | kfree(opt); |
1229 | * Parsing SP Confirms: the first element of @val is the preferred | ||
1230 | * SP value which the peer confirms, the remainder depends on @len. | ||
1231 | * Note that only the confirmed value need to be a valid SP value. | ||
1232 | */ | ||
1233 | if (!dccp_feat_is_valid_sp_val(feat, *val)) | ||
1234 | goto confirmation_failed; | ||
1235 | |||
1236 | if (len == 1) { /* peer didn't supply a preference list */ | ||
1237 | plist = val; | ||
1238 | plen = len; | ||
1239 | } else { /* preferred value + preference list */ | ||
1240 | plist = val + 1; | ||
1241 | plen = len - 1; | ||
1242 | } | 498 | } |
499 | INIT_LIST_HEAD(&dmsk->dccpms_pending); | ||
1243 | 500 | ||
1244 | /* Check whether the peer got the reconciliation right (6.6.8) */ | 501 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { |
1245 | if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { | 502 | BUG_ON(opt == NULL); |
1246 | DCCP_WARN("Confirm selected the wrong value %u\n", *val); | 503 | if (opt->dccpop_val != NULL) |
1247 | return DCCP_RESET_CODE_OPTION_ERROR; | 504 | kfree(opt->dccpop_val); |
505 | kfree(opt); | ||
1248 | } | 506 | } |
1249 | entry->val.sp.vec[0] = *val; | 507 | INIT_LIST_HEAD(&dmsk->dccpms_conf); |
1250 | |||
1251 | confirmation_succeeded: | ||
1252 | entry->state = FEAT_STABLE; | ||
1253 | return 0; | ||
1254 | |||
1255 | confirmation_failed: | ||
1256 | DCCP_WARN("Confirmation failed\n"); | ||
1257 | return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR | ||
1258 | : DCCP_RESET_CODE_OPTION_ERROR; | ||
1259 | } | 508 | } |
1260 | 509 | ||
1261 | /** | 510 | EXPORT_SYMBOL_GPL(dccp_feat_clean); |
1262 | * dccp_feat_handle_nn_established - Fast-path reception of NN options | 511 | |
1263 | * @sk: socket of an established DCCP connection | 512 | /* this is to be called only when a listening sock creates its child. It is |
1264 | * @mandatory: whether @opt was preceded by a Mandatory option | 513 | * assumed by the function---the confirm is not duplicated, but rather it is |
1265 | * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) | 514 | * "passed on". |
1266 | * @feat: NN number, one of %dccp_feature_numbers | ||
1267 | * @val: NN value | ||
1268 | * @len: length of @val in bytes | ||
1269 | * This function combines the functionality of change_recv/confirm_recv, with | ||
1270 | * the following differences (reset codes are the same): | ||
1271 | * - cleanup after receiving the Confirm; | ||
1272 | * - values are directly activated after successful parsing; | ||
1273 | * - deliberately restricted to NN features. | ||
1274 | * The restriction to NN features is essential since SP features can have non- | ||
1275 | * predictable outcomes (depending on the remote configuration), and are inter- | ||
1276 | * dependent (CCIDs for instance cause further dependencies). | ||
1277 | */ | 515 | */ |
1278 | static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, | 516 | int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) |
1279 | u8 feat, u8 *val, u8 len) | ||
1280 | { | 517 | { |
1281 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | 518 | struct dccp_minisock *olddmsk = dccp_msk(oldsk); |
1282 | const bool local = (opt == DCCPO_CONFIRM_R); | 519 | struct dccp_minisock *newdmsk = dccp_msk(newsk); |
1283 | struct dccp_feat_entry *entry; | 520 | struct dccp_opt_pend *opt; |
1284 | u8 type = dccp_feat_type(feat); | 521 | int rc = 0; |
1285 | dccp_feat_val fval; | ||
1286 | 522 | ||
1287 | dccp_feat_print_opt(opt, feat, val, len, mandatory); | 523 | INIT_LIST_HEAD(&newdmsk->dccpms_pending); |
524 | INIT_LIST_HEAD(&newdmsk->dccpms_conf); | ||
1288 | 525 | ||
1289 | /* Ignore non-mandatory unknown and non-NN features */ | 526 | list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { |
1290 | if (type == FEAT_UNKNOWN) { | 527 | struct dccp_opt_pend *newopt; |
1291 | if (local && !mandatory) | 528 | /* copy the value of the option */ |
1292 | return 0; | 529 | u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); |
1293 | goto fast_path_unknown; | ||
1294 | } else if (type != FEAT_NN) { | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * We don't accept empty Confirms, since in fast-path feature | ||
1300 | * negotiation the values are enabled immediately after sending | ||
1301 | * the Change option. | ||
1302 | * Empty Changes on the other hand are invalid (RFC 4340, 6.1). | ||
1303 | */ | ||
1304 | if (len == 0 || len > sizeof(fval.nn)) | ||
1305 | goto fast_path_unknown; | ||
1306 | |||
1307 | if (opt == DCCPO_CHANGE_L) { | ||
1308 | fval.nn = dccp_decode_value_var(val, len); | ||
1309 | if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) | ||
1310 | goto fast_path_unknown; | ||
1311 | 530 | ||
1312 | if (dccp_feat_push_confirm(fn, feat, local, &fval) || | 531 | if (val == NULL) |
1313 | dccp_feat_activate(sk, feat, local, &fval)) | 532 | goto out_clean; |
1314 | return DCCP_RESET_CODE_TOO_BUSY; | ||
1315 | 533 | ||
1316 | /* set the `Ack Pending' flag to piggyback a Confirm */ | 534 | newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); |
1317 | inet_csk_schedule_ack(sk); | 535 | if (newopt == NULL) { |
1318 | 536 | kfree(val); | |
1319 | } else if (opt == DCCPO_CONFIRM_R) { | 537 | goto out_clean; |
1320 | entry = dccp_feat_list_lookup(fn, feat, local); | ||
1321 | if (entry == NULL || entry->state != FEAT_CHANGING) | ||
1322 | return 0; | ||
1323 | |||
1324 | fval.nn = dccp_decode_value_var(val, len); | ||
1325 | if (fval.nn != entry->val.nn) { | ||
1326 | DCCP_WARN("Bogus Confirm for non-existing value\n"); | ||
1327 | goto fast_path_failed; | ||
1328 | } | 538 | } |
1329 | 539 | ||
1330 | /* It has been confirmed - so remove the entry */ | 540 | /* insert the option */ |
1331 | dccp_feat_list_pop(entry); | 541 | newopt->dccpop_val = val; |
542 | list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); | ||
1332 | 543 | ||
1333 | } else { | 544 | /* XXX what happens with backlogs and multiple connections at |
1334 | DCCP_WARN("Received illegal option %u\n", opt); | 545 | * once... |
1335 | goto fast_path_failed; | 546 | */ |
547 | /* the master socket no longer needs to worry about confirms */ | ||
548 | opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ | ||
549 | |||
550 | /* reset state for a new socket */ | ||
551 | opt->dccpop_conf = 0; | ||
1336 | } | 552 | } |
1337 | return 0; | ||
1338 | 553 | ||
1339 | fast_path_unknown: | 554 | /* XXX not doing anything about the conf queue */ |
1340 | if (!mandatory) | 555 | |
1341 | return dccp_push_empty_confirm(fn, feat, local); | 556 | out: |
557 | return rc; | ||
1342 | 558 | ||
1343 | fast_path_failed: | 559 | out_clean: |
1344 | return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR | 560 | dccp_feat_clean(newdmsk); |
1345 | : DCCP_RESET_CODE_OPTION_ERROR; | 561 | rc = -ENOMEM; |
562 | goto out; | ||
1346 | } | 563 | } |
1347 | 564 | ||
1348 | /** | 565 | EXPORT_SYMBOL_GPL(dccp_feat_clone); |
1349 | * dccp_feat_parse_options - Process Feature-Negotiation Options | 566 | |
1350 | * @sk: for general use and used by the client during connection setup | 567 | static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, |
1351 | * @dreq: used by the server during connection setup | 568 | u8 *val, u8 len) |
1352 | * @mandatory: whether @opt was preceded by a Mandatory option | ||
1353 | * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R | ||
1354 | * @feat: one of %dccp_feature_numbers | ||
1355 | * @val: value contents of @opt | ||
1356 | * @len: length of @val in bytes | ||
1357 | * Returns 0 on success, a Reset code for ending the connection otherwise. | ||
1358 | */ | ||
1359 | int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | ||
1360 | u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) | ||
1361 | { | 569 | { |
1362 | struct dccp_sock *dp = dccp_sk(sk); | 570 | int rc = -ENOMEM; |
1363 | struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; | 571 | u8 *copy = kmemdup(val, len, GFP_KERNEL); |
1364 | bool server = false; | ||
1365 | 572 | ||
1366 | switch (sk->sk_state) { | 573 | if (copy != NULL) { |
1367 | /* | 574 | rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); |
1368 | * Negotiation during connection setup | 575 | if (rc) |
1369 | */ | 576 | kfree(copy); |
1370 | case DCCP_LISTEN: | ||
1371 | server = true; /* fall through */ | ||
1372 | case DCCP_REQUESTING: | ||
1373 | switch (opt) { | ||
1374 | case DCCPO_CHANGE_L: | ||
1375 | case DCCPO_CHANGE_R: | ||
1376 | return dccp_feat_change_recv(fn, mandatory, opt, feat, | ||
1377 | val, len, server); | ||
1378 | case DCCPO_CONFIRM_R: | ||
1379 | case DCCPO_CONFIRM_L: | ||
1380 | return dccp_feat_confirm_recv(fn, mandatory, opt, feat, | ||
1381 | val, len, server); | ||
1382 | } | ||
1383 | break; | ||
1384 | /* | ||
1385 | * Support for exchanging NN options on an established connection | ||
1386 | * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) | ||
1387 | */ | ||
1388 | case DCCP_OPEN: | ||
1389 | case DCCP_PARTOPEN: | ||
1390 | return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, | ||
1391 | val, len); | ||
1392 | } | 577 | } |
1393 | return 0; /* ignore FN options in all other states */ | 578 | return rc; |
1394 | } | 579 | } |
1395 | 580 | ||
1396 | /** | 581 | int dccp_feat_init(struct dccp_minisock *dmsk) |
1397 | * dccp_feat_init - Seed feature negotiation with host-specific defaults | ||
1398 | * This initialises global defaults, depending on the value of the sysctls. | ||
1399 | * These can later be overridden by registering changes via setsockopt calls. | ||
1400 | * The last link in the chain is finalise_settings, to make sure that between | ||
1401 | * here and the start of actual feature negotiation no inconsistencies enter. | ||
1402 | * | ||
1403 | * All features not appearing below use either defaults or are otherwise | ||
1404 | * later adjusted through dccp_feat_finalise_settings(). | ||
1405 | */ | ||
1406 | int dccp_feat_init(struct sock *sk) | ||
1407 | { | 582 | { |
1408 | struct list_head *fn = &dccp_sk(sk)->dccps_featneg; | ||
1409 | u8 on = 1, off = 0; | ||
1410 | int rc; | 583 | int rc; |
1411 | struct { | ||
1412 | u8 *val; | ||
1413 | u8 len; | ||
1414 | } tx, rx; | ||
1415 | |||
1416 | /* Non-negotiable (NN) features */ | ||
1417 | rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, | ||
1418 | sysctl_dccp_sequence_window); | ||
1419 | if (rc) | ||
1420 | return rc; | ||
1421 | 584 | ||
1422 | /* Server-priority (SP) features */ | 585 | INIT_LIST_HEAD(&dmsk->dccpms_pending); |
1423 | 586 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | |
1424 | /* Advertise that short seqnos are not supported (7.6.1) */ | ||
1425 | rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); | ||
1426 | if (rc) | ||
1427 | return rc; | ||
1428 | 587 | ||
1429 | /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ | 588 | /* CCID L */ |
1430 | rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); | 589 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, |
590 | &dmsk->dccpms_tx_ccid, 1); | ||
1431 | if (rc) | 591 | if (rc) |
1432 | return rc; | 592 | goto out; |
1433 | |||
1434 | /* | ||
1435 | * We advertise the available list of CCIDs and reorder according to | ||
1436 | * preferences, to avoid failure resulting from negotiating different | ||
1437 | * singleton values (which always leads to failure). | ||
1438 | * These settings can still (later) be overridden via sockopts. | ||
1439 | */ | ||
1440 | if (ccid_get_builtin_ccids(&tx.val, &tx.len) || | ||
1441 | ccid_get_builtin_ccids(&rx.val, &rx.len)) | ||
1442 | return -ENOBUFS; | ||
1443 | |||
1444 | /* Pre-load all CCID modules that are going to be advertised */ | ||
1445 | rc = -EUNATCH; | ||
1446 | if (ccid_request_modules(tx.val, tx.len)) | ||
1447 | goto free_ccid_lists; | ||
1448 | |||
1449 | if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || | ||
1450 | !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) | ||
1451 | goto free_ccid_lists; | ||
1452 | 593 | ||
1453 | rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); | 594 | /* CCID R */ |
595 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, | ||
596 | &dmsk->dccpms_rx_ccid, 1); | ||
1454 | if (rc) | 597 | if (rc) |
1455 | goto free_ccid_lists; | 598 | goto out; |
1456 | 599 | ||
1457 | rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); | 600 | /* Ack ratio */ |
1458 | 601 | rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, | |
1459 | free_ccid_lists: | 602 | &dmsk->dccpms_ack_ratio, 1); |
1460 | kfree(tx.val); | 603 | out: |
1461 | kfree(rx.val); | ||
1462 | return rc; | 604 | return rc; |
1463 | } | 605 | } |
1464 | 606 | ||
1465 | int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) | 607 | EXPORT_SYMBOL_GPL(dccp_feat_init); |
1466 | { | ||
1467 | struct dccp_sock *dp = dccp_sk(sk); | ||
1468 | struct dccp_feat_entry *cur, *next; | ||
1469 | int idx; | ||
1470 | dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { | ||
1471 | [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } | ||
1472 | }; | ||
1473 | |||
1474 | list_for_each_entry(cur, fn_list, node) { | ||
1475 | /* | ||
1476 | * An empty Confirm means that either an unknown feature type | ||
1477 | * or an invalid value was present. In the first case there is | ||
1478 | * nothing to activate, in the other the default value is used. | ||
1479 | */ | ||
1480 | if (cur->empty_confirm) | ||
1481 | continue; | ||
1482 | 608 | ||
1483 | idx = dccp_feat_index(cur->feat_num); | 609 | #ifdef CONFIG_IP_DCCP_DEBUG |
1484 | if (idx < 0) { | 610 | const char *dccp_feat_typename(const u8 type) |
1485 | DCCP_BUG("Unknown feature %u", cur->feat_num); | 611 | { |
1486 | goto activation_failed; | 612 | switch(type) { |
1487 | } | 613 | case DCCPO_CHANGE_L: return("ChangeL"); |
1488 | if (cur->state != FEAT_STABLE) { | 614 | case DCCPO_CONFIRM_L: return("ConfirmL"); |
1489 | DCCP_CRIT("Negotiation of %s %s failed in state %s", | 615 | case DCCPO_CHANGE_R: return("ChangeR"); |
1490 | cur->is_local ? "local" : "remote", | 616 | case DCCPO_CONFIRM_R: return("ConfirmR"); |
1491 | dccp_feat_fname(cur->feat_num), | 617 | /* the following case must not appear in feature negotation */ |
1492 | dccp_feat_sname[cur->state]); | 618 | default: dccp_pr_debug("unknown type %d [BUG!]\n", type); |
1493 | goto activation_failed; | ||
1494 | } | ||
1495 | fvals[idx][cur->is_local] = &cur->val; | ||
1496 | } | 619 | } |
620 | return NULL; | ||
621 | } | ||
1497 | 622 | ||
1498 | /* | 623 | EXPORT_SYMBOL_GPL(dccp_feat_typename); |
1499 | * Activate in decreasing order of index, so that the CCIDs are always | ||
1500 | * activated as the last feature. This avoids the case where a CCID | ||
1501 | * relies on the initialisation of one or more features that it depends | ||
1502 | * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). | ||
1503 | */ | ||
1504 | for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) | ||
1505 | if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || | ||
1506 | __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { | ||
1507 | DCCP_CRIT("Could not activate %d", idx); | ||
1508 | goto activation_failed; | ||
1509 | } | ||
1510 | 624 | ||
1511 | /* Clean up Change options which have been confirmed already */ | 625 | const char *dccp_feat_name(const u8 feat) |
1512 | list_for_each_entry_safe(cur, next, fn_list, node) | 626 | { |
1513 | if (!cur->needs_confirm) | 627 | static const char *feature_names[] = { |
1514 | dccp_feat_list_pop(cur); | 628 | [DCCPF_RESERVED] = "Reserved", |
629 | [DCCPF_CCID] = "CCID", | ||
630 | [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", | ||
631 | [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", | ||
632 | [DCCPF_ECN_INCAPABLE] = "ECN Incapable", | ||
633 | [DCCPF_ACK_RATIO] = "Ack Ratio", | ||
634 | [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", | ||
635 | [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", | ||
636 | [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", | ||
637 | [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", | ||
638 | }; | ||
639 | if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) | ||
640 | return feature_names[DCCPF_RESERVED]; | ||
1515 | 641 | ||
1516 | dccp_pr_debug("Activation OK\n"); | 642 | if (feat >= DCCPF_MIN_CCID_SPECIFIC) |
1517 | return 0; | 643 | return "CCID-specific"; |
1518 | 644 | ||
1519 | activation_failed: | 645 | return feature_names[feat]; |
1520 | /* | ||
1521 | * We clean up everything that may have been allocated, since | ||
1522 | * it is difficult to track at which stage negotiation failed. | ||
1523 | * This is ok, since all allocation functions below are robust | ||
1524 | * against NULL arguments. | ||
1525 | */ | ||
1526 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | ||
1527 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | ||
1528 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | ||
1529 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | ||
1530 | dp->dccps_hc_rx_ackvec = NULL; | ||
1531 | return -1; | ||
1532 | } | 646 | } |
647 | |||
648 | EXPORT_SYMBOL_GPL(dccp_feat_name); | ||
649 | #endif /* CONFIG_IP_DCCP_DEBUG */ | ||
diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2217066e22d7..e272222c7ace 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h | |||
@@ -3,134 +3,38 @@ | |||
3 | /* | 3 | /* |
4 | * net/dccp/feat.h | 4 | * net/dccp/feat.h |
5 | * | 5 | * |
6 | * Feature negotiation for the DCCP protocol (RFC 4340, section 6) | 6 | * An implementation of the DCCP protocol |
7 | * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> | ||
8 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> | 7 | * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> |
9 | * | 8 | * |
10 | * This program is free software; you can redistribute it and/or modify it | 9 | * This program is free software; you can redistribute it and/or modify it |
11 | * under the terms of the GNU General Public License version 2 as | 10 | * under the terms of the GNU General Public License version 2 as |
12 | * published by the Free Software Foundation. | 11 | * published by the Free Software Foundation. |
13 | */ | 12 | */ |
13 | |||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include "dccp.h" | 15 | #include "dccp.h" |
16 | 16 | ||
17 | /* | 17 | #ifdef CONFIG_IP_DCCP_DEBUG |
18 | * Known limit values | 18 | extern const char *dccp_feat_typename(const u8 type); |
19 | */ | 19 | extern const char *dccp_feat_name(const u8 feat); |
20 | /* Ack Ratio takes 2-byte integer values (11.3) */ | ||
21 | #define DCCPF_ACK_RATIO_MAX 0xFFFF | ||
22 | /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ | ||
23 | #define DCCPF_SEQ_WMIN 32 | ||
24 | #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull | ||
25 | /* Maximum number of SP values that fit in a single (Confirm) option */ | ||
26 | #define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) | ||
27 | |||
28 | enum dccp_feat_type { | ||
29 | FEAT_AT_RX = 1, /* located at RX side of half-connection */ | ||
30 | FEAT_AT_TX = 2, /* located at TX side of half-connection */ | ||
31 | FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ | ||
32 | FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ | ||
33 | FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ | ||
34 | }; | ||
35 | |||
36 | enum dccp_feat_state { | ||
37 | FEAT_DEFAULT = 0, /* using default values from 6.4 */ | ||
38 | FEAT_INITIALISING, /* feature is being initialised */ | ||
39 | FEAT_CHANGING, /* Change sent but not confirmed yet */ | ||
40 | FEAT_UNSTABLE, /* local modification in state CHANGING */ | ||
41 | FEAT_STABLE /* both ends (think they) agree */ | ||
42 | }; | ||
43 | 20 | ||
44 | /** | 21 | static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) |
45 | * dccp_feat_val - Container for SP or NN feature values | ||
46 | * @nn: single NN value | ||
47 | * @sp.vec: single SP value plus optional preference list | ||
48 | * @sp.len: length of @sp.vec in bytes | ||
49 | */ | ||
50 | typedef union { | ||
51 | u64 nn; | ||
52 | struct { | ||
53 | u8 *vec; | ||
54 | u8 len; | ||
55 | } sp; | ||
56 | } dccp_feat_val; | ||
57 | |||
58 | /** | ||
59 | * struct feat_entry - Data structure to perform feature negotiation | ||
60 | * @feat_num: one of %dccp_feature_numbers | ||
61 | * @val: feature's current value (SP features may have preference list) | ||
62 | * @state: feature's current state | ||
63 | * @needs_mandatory: whether Mandatory options should be sent | ||
64 | * @needs_confirm: whether to send a Confirm instead of a Change | ||
65 | * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) | ||
66 | * @is_local: feature location (1) or feature-remote (0) | ||
67 | * @node: list pointers, entries arranged in FIFO order | ||
68 | */ | ||
69 | struct dccp_feat_entry { | ||
70 | u8 feat_num; | ||
71 | dccp_feat_val val; | ||
72 | enum dccp_feat_state state:8; | ||
73 | bool needs_mandatory:1, | ||
74 | needs_confirm:1, | ||
75 | empty_confirm:1, | ||
76 | is_local:1; | ||
77 | |||
78 | struct list_head node; | ||
79 | }; | ||
80 | |||
81 | static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) | ||
82 | { | 22 | { |
83 | if (entry->needs_confirm) | 23 | dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), |
84 | return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; | 24 | dccp_feat_name(feat), feat, val); |
85 | return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; | ||
86 | } | 25 | } |
26 | #else | ||
27 | #define dccp_feat_debug(type, feat, val) | ||
28 | #endif /* CONFIG_IP_DCCP_DEBUG */ | ||
29 | |||
30 | extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, | ||
31 | u8 *val, u8 len, gfp_t gfp); | ||
32 | extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, | ||
33 | u8 *val, u8 len); | ||
34 | extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, | ||
35 | u8 *val, u8 len); | ||
36 | extern void dccp_feat_clean(struct dccp_minisock *dmsk); | ||
37 | extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); | ||
38 | extern int dccp_feat_init(struct dccp_minisock *dmsk); | ||
87 | 39 | ||
88 | /** | ||
89 | * struct ccid_dependency - Track changes resulting from choosing a CCID | ||
90 | * @dependent_feat: one of %dccp_feature_numbers | ||
91 | * @is_local: local (1) or remote (0) @dependent_feat | ||
92 | * @is_mandatory: whether presence of @dependent_feat is mission-critical or not | ||
93 | * @val: corresponding default value for @dependent_feat (u8 is sufficient here) | ||
94 | */ | ||
95 | struct ccid_dependency { | ||
96 | u8 dependent_feat; | ||
97 | bool is_local:1, | ||
98 | is_mandatory:1; | ||
99 | u8 val; | ||
100 | }; | ||
101 | |||
102 | /* | ||
103 | * Sysctls to seed defaults for feature negotiation | ||
104 | */ | ||
105 | extern unsigned long sysctl_dccp_sequence_window; | ||
106 | extern int sysctl_dccp_rx_ccid; | ||
107 | extern int sysctl_dccp_tx_ccid; | ||
108 | |||
109 | extern int dccp_feat_init(struct sock *sk); | ||
110 | extern void dccp_feat_initialise_sysctls(void); | ||
111 | extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, | ||
112 | u8 const *list, u8 len); | ||
113 | extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); | ||
114 | extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, | ||
115 | u8 mand, u8 opt, u8 feat, u8 *val, u8 len); | ||
116 | extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); | ||
117 | |||
118 | /* | ||
119 | * Encoding variable-length options and their maximum length. | ||
120 | * | ||
121 | * This affects NN options (SP options are all u8) and other variable-length | ||
122 | * options (see table 3 in RFC 4340). The limit is currently given the Sequence | ||
123 | * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other | ||
124 | * options consume less than 6 bytes (timestamps are 4 bytes). | ||
125 | * When updating this constant (e.g. due to new internet drafts / RFCs), make | ||
126 | * sure that you also update all code which refers to it. | ||
127 | */ | ||
128 | #define DCCP_OPTVAL_MAXLEN 6 | ||
129 | |||
130 | extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); | ||
131 | extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); | ||
132 | |||
133 | extern int dccp_insert_option_mandatory(struct sk_buff *skb); | ||
134 | extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, | ||
135 | u8 *val, u8 len, bool repeat_first); | ||
136 | #endif /* _DCCP_FEAT_H */ | 40 | #endif /* _DCCP_FEAT_H */ |
diff --git a/net/dccp/input.c b/net/dccp/input.c index df0e6714aa11..779d0ed9ae94 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c | |||
@@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) | |||
159 | dccp_time_wait(sk, DCCP_TIME_WAIT, 0); | 159 | dccp_time_wait(sk, DCCP_TIME_WAIT, 0); |
160 | } | 160 | } |
161 | 161 | ||
162 | static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) | 162 | static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) |
163 | { | 163 | { |
164 | struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; | 164 | struct dccp_sock *dp = dccp_sk(sk); |
165 | 165 | ||
166 | if (av == NULL) | 166 | if (dccp_msk(sk)->dccpms_send_ack_vector) |
167 | return; | 167 | dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, |
168 | if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) | 168 | DCCP_SKB_CB(skb)->dccpd_ack_seq); |
169 | dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); | ||
170 | dccp_ackvec_input(av, skb); | ||
171 | } | 169 | } |
172 | 170 | ||
173 | static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) | 171 | static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) |
@@ -366,13 +364,22 @@ discard: | |||
366 | int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, | 364 | int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, |
367 | const struct dccp_hdr *dh, const unsigned len) | 365 | const struct dccp_hdr *dh, const unsigned len) |
368 | { | 366 | { |
367 | struct dccp_sock *dp = dccp_sk(sk); | ||
368 | |||
369 | if (dccp_check_seqno(sk, skb)) | 369 | if (dccp_check_seqno(sk, skb)) |
370 | goto discard; | 370 | goto discard; |
371 | 371 | ||
372 | if (dccp_parse_options(sk, NULL, skb)) | 372 | if (dccp_parse_options(sk, NULL, skb)) |
373 | return 1; | 373 | return 1; |
374 | 374 | ||
375 | dccp_handle_ackvec_processing(sk, skb); | 375 | if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) |
376 | dccp_event_ack_recv(sk, skb); | ||
377 | |||
378 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
379 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
380 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
381 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
382 | goto discard; | ||
376 | dccp_deliver_input_to_ccids(sk, skb); | 383 | dccp_deliver_input_to_ccids(sk, skb); |
377 | 384 | ||
378 | return __dccp_rcv_established(sk, skb, dh, len); | 385 | return __dccp_rcv_established(sk, skb, dh, len); |
@@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, | |||
414 | goto out_invalid_packet; | 421 | goto out_invalid_packet; |
415 | } | 422 | } |
416 | 423 | ||
417 | /* | ||
418 | * If option processing (Step 8) failed, return 1 here so that | ||
419 | * dccp_v4_do_rcv() sends a Reset. The Reset code depends on | ||
420 | * the option type and is set in dccp_parse_options(). | ||
421 | */ | ||
422 | if (dccp_parse_options(sk, NULL, skb)) | 424 | if (dccp_parse_options(sk, NULL, skb)) |
423 | return 1; | 425 | goto out_invalid_packet; |
424 | 426 | ||
425 | /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ | 427 | /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ |
426 | if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) | 428 | if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) |
427 | dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - | 429 | dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - |
428 | dp->dccps_options_received.dccpor_timestamp_echo)); | 430 | dp->dccps_options_received.dccpor_timestamp_echo)); |
429 | 431 | ||
432 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
433 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
434 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
435 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
436 | goto out_invalid_packet; /* FIXME: change error code */ | ||
437 | |||
430 | /* Stop the REQUEST timer */ | 438 | /* Stop the REQUEST timer */ |
431 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 439 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
432 | WARN_ON(sk->sk_send_head == NULL); | 440 | WARN_ON(sk->sk_send_head == NULL); |
433 | kfree_skb(sk->sk_send_head); | 441 | kfree_skb(sk->sk_send_head); |
434 | sk->sk_send_head = NULL; | 442 | sk->sk_send_head = NULL; |
435 | 443 | ||
444 | dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; | ||
445 | dccp_update_gsr(sk, dp->dccps_isr); | ||
436 | /* | 446 | /* |
437 | * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect | 447 | * SWL and AWL are initially adjusted so that they are not less than |
438 | * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH | 448 | * the initial Sequence Numbers received and sent, respectively: |
439 | * is done as part of activating the feature values below, since | 449 | * SWL := max(GSR + 1 - floor(W/4), ISR), |
440 | * these settings depend on the local/remote Sequence Window | 450 | * AWL := max(GSS - W' + 1, ISS). |
441 | * features, which were undefined or not confirmed until now. | 451 | * These adjustments MUST be applied only at the beginning of the |
452 | * connection. | ||
453 | * | ||
454 | * AWL was adjusted in dccp_v4_connect -acme | ||
442 | */ | 455 | */ |
443 | dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; | 456 | dccp_set_seqno(&dp->dccps_swl, |
457 | max48(dp->dccps_swl, dp->dccps_isr)); | ||
444 | 458 | ||
445 | dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 459 | dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
446 | 460 | ||
@@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, | |||
461 | */ | 475 | */ |
462 | dccp_set_state(sk, DCCP_PARTOPEN); | 476 | dccp_set_state(sk, DCCP_PARTOPEN); |
463 | 477 | ||
464 | /* | ||
465 | * If feature negotiation was successful, activate features now; | ||
466 | * an activation failure means that this host could not activate | ||
467 | * one ore more features (e.g. insufficient memory), which would | ||
468 | * leave at least one feature in an undefined state. | ||
469 | */ | ||
470 | if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) | ||
471 | goto unable_to_proceed; | ||
472 | |||
473 | /* Make sure socket is routed, for correct metrics. */ | 478 | /* Make sure socket is routed, for correct metrics. */ |
474 | icsk->icsk_af_ops->rebuild_header(sk); | 479 | icsk->icsk_af_ops->rebuild_header(sk); |
475 | 480 | ||
@@ -504,16 +509,6 @@ out_invalid_packet: | |||
504 | /* dccp_v4_do_rcv will send a reset */ | 509 | /* dccp_v4_do_rcv will send a reset */ |
505 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; | 510 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; |
506 | return 1; | 511 | return 1; |
507 | |||
508 | unable_to_proceed: | ||
509 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; | ||
510 | /* | ||
511 | * We mark this socket as no longer usable, so that the loop in | ||
512 | * dccp_sendmsg() terminates and the application gets notified. | ||
513 | */ | ||
514 | dccp_set_state(sk, DCCP_CLOSED); | ||
515 | sk->sk_err = ECOMM; | ||
516 | return 1; | ||
517 | } | 512 | } |
518 | 513 | ||
519 | static int dccp_rcv_respond_partopen_state_process(struct sock *sk, | 514 | static int dccp_rcv_respond_partopen_state_process(struct sock *sk, |
@@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
595 | if (inet_csk(sk)->icsk_af_ops->conn_request(sk, | 590 | if (inet_csk(sk)->icsk_af_ops->conn_request(sk, |
596 | skb) < 0) | 591 | skb) < 0) |
597 | return 1; | 592 | return 1; |
593 | |||
594 | /* FIXME: do congestion control initialization */ | ||
598 | goto discard; | 595 | goto discard; |
599 | } | 596 | } |
600 | if (dh->dccph_type == DCCP_PKT_RESET) | 597 | if (dh->dccph_type == DCCP_PKT_RESET) |
@@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
603 | /* Caller (dccp_v4_do_rcv) will send Reset */ | 600 | /* Caller (dccp_v4_do_rcv) will send Reset */ |
604 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | 601 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; |
605 | return 1; | 602 | return 1; |
606 | } else if (sk->sk_state == DCCP_CLOSED) { | ||
607 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | ||
608 | return 1; | ||
609 | } | 603 | } |
610 | 604 | ||
611 | /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ | 605 | if (sk->sk_state != DCCP_REQUESTING) { |
612 | if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) | 606 | if (dccp_check_seqno(sk, skb)) |
613 | goto discard; | 607 | goto discard; |
614 | 608 | ||
615 | /* | 609 | /* |
616 | * Step 7: Check for unexpected packet types | 610 | * Step 8: Process options and mark acknowledgeable |
617 | * If (S.is_server and P.type == Response) | 611 | */ |
618 | * or (S.is_client and P.type == Request) | 612 | if (dccp_parse_options(sk, NULL, skb)) |
619 | * or (S.state == RESPOND and P.type == Data), | 613 | return 1; |
620 | * Send Sync packet acknowledging P.seqno | ||
621 | * Drop packet and return | ||
622 | */ | ||
623 | if ((dp->dccps_role != DCCP_ROLE_CLIENT && | ||
624 | dh->dccph_type == DCCP_PKT_RESPONSE) || | ||
625 | (dp->dccps_role == DCCP_ROLE_CLIENT && | ||
626 | dh->dccph_type == DCCP_PKT_REQUEST) || | ||
627 | (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { | ||
628 | dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); | ||
629 | goto discard; | ||
630 | } | ||
631 | 614 | ||
632 | /* Step 8: Process options */ | 615 | if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) |
633 | if (dccp_parse_options(sk, NULL, skb)) | 616 | dccp_event_ack_recv(sk, skb); |
634 | return 1; | 617 | |
618 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
619 | dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, | ||
620 | DCCP_SKB_CB(skb)->dccpd_seq, | ||
621 | DCCP_ACKVEC_STATE_RECEIVED)) | ||
622 | goto discard; | ||
623 | |||
624 | dccp_deliver_input_to_ccids(sk, skb); | ||
625 | } | ||
635 | 626 | ||
636 | /* | 627 | /* |
637 | * Step 9: Process Reset | 628 | * Step 9: Process Reset |
@@ -640,22 +631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
640 | * S.state := TIMEWAIT | 631 | * S.state := TIMEWAIT |
641 | * Set TIMEWAIT timer | 632 | * Set TIMEWAIT timer |
642 | * Drop packet and return | 633 | * Drop packet and return |
643 | */ | 634 | */ |
644 | if (dh->dccph_type == DCCP_PKT_RESET) { | 635 | if (dh->dccph_type == DCCP_PKT_RESET) { |
645 | dccp_rcv_reset(sk, skb); | 636 | dccp_rcv_reset(sk, skb); |
646 | return 0; | 637 | return 0; |
647 | } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ | 638 | /* |
639 | * Step 7: Check for unexpected packet types | ||
640 | * If (S.is_server and P.type == Response) | ||
641 | * or (S.is_client and P.type == Request) | ||
642 | * or (S.state == RESPOND and P.type == Data), | ||
643 | * Send Sync packet acknowledging P.seqno | ||
644 | * Drop packet and return | ||
645 | */ | ||
646 | } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && | ||
647 | dh->dccph_type == DCCP_PKT_RESPONSE) || | ||
648 | (dp->dccps_role == DCCP_ROLE_CLIENT && | ||
649 | dh->dccph_type == DCCP_PKT_REQUEST) || | ||
650 | (sk->sk_state == DCCP_RESPOND && | ||
651 | dh->dccph_type == DCCP_PKT_DATA)) { | ||
652 | dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); | ||
653 | goto discard; | ||
654 | } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { | ||
648 | if (dccp_rcv_closereq(sk, skb)) | 655 | if (dccp_rcv_closereq(sk, skb)) |
649 | return 0; | 656 | return 0; |
650 | goto discard; | 657 | goto discard; |
651 | } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ | 658 | } else if (dh->dccph_type == DCCP_PKT_CLOSE) { |
652 | if (dccp_rcv_close(sk, skb)) | 659 | if (dccp_rcv_close(sk, skb)) |
653 | return 0; | 660 | return 0; |
654 | goto discard; | 661 | goto discard; |
655 | } | 662 | } |
656 | 663 | ||
657 | switch (sk->sk_state) { | 664 | switch (sk->sk_state) { |
665 | case DCCP_CLOSED: | ||
666 | dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; | ||
667 | return 1; | ||
668 | |||
658 | case DCCP_REQUESTING: | 669 | case DCCP_REQUESTING: |
670 | /* FIXME: do congestion control initialization */ | ||
671 | |||
659 | queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); | 672 | queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); |
660 | if (queued >= 0) | 673 | if (queued >= 0) |
661 | return queued; | 674 | return queued; |
@@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
663 | __kfree_skb(skb); | 676 | __kfree_skb(skb); |
664 | return 0; | 677 | return 0; |
665 | 678 | ||
666 | case DCCP_PARTOPEN: | ||
667 | /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ | ||
668 | dccp_handle_ackvec_processing(sk, skb); | ||
669 | dccp_deliver_input_to_ccids(sk, skb); | ||
670 | /* fall through */ | ||
671 | case DCCP_RESPOND: | 679 | case DCCP_RESPOND: |
680 | case DCCP_PARTOPEN: | ||
672 | queued = dccp_rcv_respond_partopen_state_process(sk, skb, | 681 | queued = dccp_rcv_respond_partopen_state_process(sk, skb, |
673 | dh, len); | 682 | dh, len); |
674 | break; | 683 | break; |
@@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) | |||
707 | /* dccpor_elapsed_time is either zeroed out or set and > 0 */ | 716 | /* dccpor_elapsed_time is either zeroed out or set and > 0 */ |
708 | delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; | 717 | delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; |
709 | 718 | ||
710 | return dccp_sane_rtt(delta); | 719 | if (unlikely(delta <= 0)) { |
720 | DCCP_WARN("unusable RTT sample %ld, using min\n", delta); | ||
721 | return DCCP_SANE_RTT_MIN; | ||
722 | } | ||
723 | if (unlikely(delta > DCCP_SANE_RTT_MAX)) { | ||
724 | DCCP_WARN("RTT sample %ld too large, using max\n", delta); | ||
725 | return DCCP_SANE_RTT_MAX; | ||
726 | } | ||
727 | |||
728 | return delta; | ||
711 | } | 729 | } |
712 | 730 | ||
713 | EXPORT_SYMBOL_GPL(dccp_sample_rtt); | 731 | EXPORT_SYMBOL_GPL(dccp_sample_rtt); |
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b623f6b25482..882c5c4de69e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -545,7 +545,6 @@ out: | |||
545 | 545 | ||
546 | static void dccp_v4_reqsk_destructor(struct request_sock *req) | 546 | static void dccp_v4_reqsk_destructor(struct request_sock *req) |
547 | { | 547 | { |
548 | dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); | ||
549 | kfree(inet_rsk(req)->opt); | 548 | kfree(inet_rsk(req)->opt); |
550 | } | 549 | } |
551 | 550 | ||
@@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
596 | if (req == NULL) | 595 | if (req == NULL) |
597 | goto drop; | 596 | goto drop; |
598 | 597 | ||
599 | if (dccp_reqsk_init(req, dccp_sk(sk), skb)) | 598 | dccp_reqsk_init(req, skb); |
600 | goto drop_and_free; | ||
601 | 599 | ||
602 | dreq = dccp_rsk(req); | 600 | dreq = dccp_rsk(req); |
603 | if (dccp_parse_options(sk, dreq, skb)) | 601 | if (dccp_parse_options(sk, dreq, skb)) |
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad6212e00435..5e1ee0da2c40 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c | |||
@@ -302,7 +302,6 @@ done: | |||
302 | 302 | ||
303 | static void dccp_v6_reqsk_destructor(struct request_sock *req) | 303 | static void dccp_v6_reqsk_destructor(struct request_sock *req) |
304 | { | 304 | { |
305 | dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); | ||
306 | if (inet6_rsk(req)->pktopts != NULL) | 305 | if (inet6_rsk(req)->pktopts != NULL) |
307 | kfree_skb(inet6_rsk(req)->pktopts); | 306 | kfree_skb(inet6_rsk(req)->pktopts); |
308 | } | 307 | } |
@@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) | |||
425 | if (req == NULL) | 424 | if (req == NULL) |
426 | goto drop; | 425 | goto drop; |
427 | 426 | ||
428 | if (dccp_reqsk_init(req, dccp_sk(sk), skb)) | 427 | dccp_reqsk_init(req, skb); |
429 | goto drop_and_free; | ||
430 | 428 | ||
431 | dreq = dccp_rsk(req); | 429 | dreq = dccp_rsk(req); |
432 | if (dccp_parse_options(sk, dreq, skb)) | 430 | if (dccp_parse_options(sk, dreq, skb)) |
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index f4d9c8f60ede..b2804e2d1b8c 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c | |||
@@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = { | |||
42 | 42 | ||
43 | EXPORT_SYMBOL_GPL(dccp_death_row); | 43 | EXPORT_SYMBOL_GPL(dccp_death_row); |
44 | 44 | ||
45 | void dccp_minisock_init(struct dccp_minisock *dmsk) | ||
46 | { | ||
47 | dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; | ||
48 | dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; | ||
49 | dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; | ||
50 | dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; | ||
51 | dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; | ||
52 | dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; | ||
53 | } | ||
54 | |||
45 | void dccp_time_wait(struct sock *sk, int state, int timeo) | 55 | void dccp_time_wait(struct sock *sk, int state, int timeo) |
46 | { | 56 | { |
47 | struct inet_timewait_sock *tw = NULL; | 57 | struct inet_timewait_sock *tw = NULL; |
@@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk, | |||
102 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); | 112 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
103 | 113 | ||
104 | if (newsk != NULL) { | 114 | if (newsk != NULL) { |
105 | struct dccp_request_sock *dreq = dccp_rsk(req); | 115 | const struct dccp_request_sock *dreq = dccp_rsk(req); |
106 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 116 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
107 | struct dccp_sock *newdp = dccp_sk(newsk); | 117 | struct dccp_sock *newdp = dccp_sk(newsk); |
118 | struct dccp_minisock *newdmsk = dccp_msk(newsk); | ||
108 | 119 | ||
109 | newdp->dccps_role = DCCP_ROLE_SERVER; | 120 | newdp->dccps_role = DCCP_ROLE_SERVER; |
110 | newdp->dccps_hc_rx_ackvec = NULL; | 121 | newdp->dccps_hc_rx_ackvec = NULL; |
@@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk, | |||
114 | newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; | 125 | newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; |
115 | newicsk->icsk_rto = DCCP_TIMEOUT_INIT; | 126 | newicsk->icsk_rto = DCCP_TIMEOUT_INIT; |
116 | 127 | ||
117 | INIT_LIST_HEAD(&newdp->dccps_featneg); | 128 | if (dccp_feat_clone(sk, newsk)) |
129 | goto out_free; | ||
130 | |||
131 | if (newdmsk->dccpms_send_ack_vector) { | ||
132 | newdp->dccps_hc_rx_ackvec = | ||
133 | dccp_ackvec_alloc(GFP_ATOMIC); | ||
134 | if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) | ||
135 | goto out_free; | ||
136 | } | ||
137 | |||
138 | newdp->dccps_hc_rx_ccid = | ||
139 | ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, | ||
140 | newsk, GFP_ATOMIC); | ||
141 | newdp->dccps_hc_tx_ccid = | ||
142 | ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, | ||
143 | newsk, GFP_ATOMIC); | ||
144 | if (unlikely(newdp->dccps_hc_rx_ccid == NULL || | ||
145 | newdp->dccps_hc_tx_ccid == NULL)) { | ||
146 | dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); | ||
147 | ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); | ||
148 | ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); | ||
149 | out_free: | ||
150 | /* It is still raw copy of parent, so invalidate | ||
151 | * destructor and make plain sk_free() */ | ||
152 | newsk->sk_destruct = NULL; | ||
153 | sk_free(newsk); | ||
154 | return NULL; | ||
155 | } | ||
156 | |||
118 | /* | 157 | /* |
119 | * Step 3: Process LISTEN state | 158 | * Step 3: Process LISTEN state |
120 | * | 159 | * |
121 | * Choose S.ISS (initial seqno) or set from Init Cookies | 160 | * Choose S.ISS (initial seqno) or set from Init Cookies |
122 | * Initialize S.GAR := S.ISS | 161 | * Initialize S.GAR := S.ISS |
123 | * Set S.ISR, S.GSR from packet (or Init Cookies) | 162 | * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies |
124 | * | ||
125 | * Setting AWL/AWH and SWL/SWH happens as part of the feature | ||
126 | * activation below, as these windows all depend on the local | ||
127 | * and remote Sequence Window feature values (7.5.2). | ||
128 | */ | 163 | */ |
129 | newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; | 164 | |
130 | newdp->dccps_gar = newdp->dccps_iss; | 165 | /* See dccp_v4_conn_request */ |
131 | newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; | 166 | newdmsk->dccpms_sequence_window = req->rcv_wnd; |
167 | |||
168 | newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; | ||
169 | dccp_update_gss(newsk, dreq->dreq_iss); | ||
170 | |||
171 | newdp->dccps_isr = dreq->dreq_isr; | ||
172 | dccp_update_gsr(newsk, dreq->dreq_isr); | ||
132 | 173 | ||
133 | /* | 174 | /* |
134 | * Activate features: initialise CCIDs, sequence windows etc. | 175 | * SWL and AWL are initially adjusted so that they are not less than |
176 | * the initial Sequence Numbers received and sent, respectively: | ||
177 | * SWL := max(GSR + 1 - floor(W/4), ISR), | ||
178 | * AWL := max(GSS - W' + 1, ISS). | ||
179 | * These adjustments MUST be applied only at the beginning of the | ||
180 | * connection. | ||
135 | */ | 181 | */ |
136 | if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { | 182 | dccp_set_seqno(&newdp->dccps_swl, |
137 | /* It is still raw copy of parent, so invalidate | 183 | max48(newdp->dccps_swl, newdp->dccps_isr)); |
138 | * destructor and make plain sk_free() */ | 184 | dccp_set_seqno(&newdp->dccps_awl, |
139 | newsk->sk_destruct = NULL; | 185 | max48(newdp->dccps_awl, newdp->dccps_iss)); |
140 | sk_free(newsk); | 186 | |
141 | return NULL; | ||
142 | } | ||
143 | dccp_init_xmit_timers(newsk); | 187 | dccp_init_xmit_timers(newsk); |
144 | 188 | ||
145 | DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); | 189 | DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); |
@@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
260 | 304 | ||
261 | EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); | 305 | EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); |
262 | 306 | ||
263 | int dccp_reqsk_init(struct request_sock *req, | 307 | void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) |
264 | struct dccp_sock const *dp, struct sk_buff const *skb) | ||
265 | { | 308 | { |
266 | struct dccp_request_sock *dreq = dccp_rsk(req); | 309 | struct dccp_request_sock *dreq = dccp_rsk(req); |
267 | 310 | ||
268 | inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; | 311 | inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; |
269 | inet_rsk(req)->acked = 0; | 312 | inet_rsk(req)->acked = 0; |
313 | req->rcv_wnd = sysctl_dccp_feat_sequence_window; | ||
270 | dreq->dreq_timestamp_echo = 0; | 314 | dreq->dreq_timestamp_echo = 0; |
271 | |||
272 | /* inherit feature negotiation options from listening socket */ | ||
273 | return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); | ||
274 | } | 315 | } |
275 | 316 | ||
276 | EXPORT_SYMBOL_GPL(dccp_reqsk_init); | 317 | EXPORT_SYMBOL_GPL(dccp_reqsk_init); |
diff --git a/net/dccp/options.c b/net/dccp/options.c index e5a32979d7d7..0809b63cb055 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c | |||
@@ -23,20 +23,23 @@ | |||
23 | #include "dccp.h" | 23 | #include "dccp.h" |
24 | #include "feat.h" | 24 | #include "feat.h" |
25 | 25 | ||
26 | u64 dccp_decode_value_var(const u8 *bf, const u8 len) | 26 | int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; |
27 | int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; | ||
28 | int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; | ||
29 | int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; | ||
30 | int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; | ||
31 | int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; | ||
32 | |||
33 | static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) | ||
27 | { | 34 | { |
28 | u64 value = 0; | 35 | u32 value = 0; |
29 | 36 | ||
30 | if (len >= DCCP_OPTVAL_MAXLEN) | ||
31 | value += ((u64)*bf++) << 40; | ||
32 | if (len > 4) | ||
33 | value += ((u64)*bf++) << 32; | ||
34 | if (len > 3) | 37 | if (len > 3) |
35 | value += ((u64)*bf++) << 24; | 38 | value += *bf++ << 24; |
36 | if (len > 2) | 39 | if (len > 2) |
37 | value += ((u64)*bf++) << 16; | 40 | value += *bf++ << 16; |
38 | if (len > 1) | 41 | if (len > 1) |
39 | value += ((u64)*bf++) << 8; | 42 | value += *bf++ << 8; |
40 | if (len > 0) | 43 | if (len > 0) |
41 | value += *bf; | 44 | value += *bf; |
42 | 45 | ||
@@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
54 | struct dccp_sock *dp = dccp_sk(sk); | 57 | struct dccp_sock *dp = dccp_sk(sk); |
55 | const struct dccp_hdr *dh = dccp_hdr(skb); | 58 | const struct dccp_hdr *dh = dccp_hdr(skb); |
56 | const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; | 59 | const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; |
60 | u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; | ||
57 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); | 61 | unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); |
58 | unsigned char *opt_ptr = options; | 62 | unsigned char *opt_ptr = options; |
59 | const unsigned char *opt_end = (unsigned char *)dh + | 63 | const unsigned char *opt_end = (unsigned char *)dh + |
@@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
95 | } | 99 | } |
96 | 100 | ||
97 | /* | 101 | /* |
102 | * CCID-Specific Options (from RFC 4340, sec. 10.3): | ||
103 | * | ||
104 | * Option numbers 128 through 191 are for options sent from the | ||
105 | * HC-Sender to the HC-Receiver; option numbers 192 through 255 | ||
106 | * are for options sent from the HC-Receiver to the HC-Sender. | ||
107 | * | ||
98 | * CCID-specific options are ignored during connection setup, as | 108 | * CCID-specific options are ignored during connection setup, as |
99 | * negotiation may still be in progress (see RFC 4340, 10.3). | 109 | * negotiation may still be in progress (see RFC 4340, 10.3). |
100 | * The same applies to Ack Vectors, as these depend on the CCID. | 110 | * The same applies to Ack Vectors, as these depend on the CCID. |
111 | * | ||
101 | */ | 112 | */ |
102 | if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || | 113 | if (dreq != NULL && (opt >= 128 || |
103 | opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) | 114 | opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) |
104 | goto ignore_option; | 115 | goto ignore_option; |
105 | 116 | ||
@@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
120 | dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), | 131 | dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), |
121 | (unsigned long long)opt_recv->dccpor_ndp); | 132 | (unsigned long long)opt_recv->dccpor_ndp); |
122 | break; | 133 | break; |
123 | case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: | 134 | case DCCPO_CHANGE_L: |
124 | if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ | 135 | /* fall through */ |
136 | case DCCPO_CHANGE_R: | ||
137 | if (pkt_type == DCCP_PKT_DATA) | ||
125 | break; | 138 | break; |
126 | rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, | 139 | if (len < 2) |
127 | *value, value + 1, len - 1); | 140 | goto out_invalid_option; |
128 | if (rc) | 141 | rc = dccp_feat_change_recv(sk, opt, *value, value + 1, |
129 | goto out_featneg_failed; | 142 | len - 1); |
143 | /* | ||
144 | * When there is a change error, change_recv is | ||
145 | * responsible for dealing with it. i.e. reply with an | ||
146 | * empty confirm. | ||
147 | * If the change was mandatory, then we need to die. | ||
148 | */ | ||
149 | if (rc && mandatory) | ||
150 | goto out_invalid_option; | ||
151 | break; | ||
152 | case DCCPO_CONFIRM_L: | ||
153 | /* fall through */ | ||
154 | case DCCPO_CONFIRM_R: | ||
155 | if (pkt_type == DCCP_PKT_DATA) | ||
156 | break; | ||
157 | if (len < 2) /* FIXME this disallows empty confirm */ | ||
158 | goto out_invalid_option; | ||
159 | if (dccp_feat_confirm_recv(sk, opt, *value, | ||
160 | value + 1, len - 1)) | ||
161 | goto out_invalid_option; | ||
162 | break; | ||
163 | case DCCPO_ACK_VECTOR_0: | ||
164 | case DCCPO_ACK_VECTOR_1: | ||
165 | if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ | ||
166 | break; | ||
167 | |||
168 | if (dccp_msk(sk)->dccpms_send_ack_vector && | ||
169 | dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) | ||
170 | goto out_invalid_option; | ||
130 | break; | 171 | break; |
131 | case DCCPO_TIMESTAMP: | 172 | case DCCPO_TIMESTAMP: |
132 | if (len != 4) | 173 | if (len != 4) |
@@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
154 | dccp_role(sk), ntohl(opt_val), | 195 | dccp_role(sk), ntohl(opt_val), |
155 | (unsigned long long) | 196 | (unsigned long long) |
156 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | 197 | DCCP_SKB_CB(skb)->dccpd_ack_seq); |
157 | /* schedule an Ack in case this sender is quiescent */ | ||
158 | inet_csk_schedule_ack(sk); | ||
159 | break; | 198 | break; |
160 | case DCCPO_TIMESTAMP_ECHO: | 199 | case DCCPO_TIMESTAMP_ECHO: |
161 | if (len != 4 && len != 6 && len != 8) | 200 | if (len != 4 && len != 6 && len != 8) |
@@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, | |||
212 | dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", | 251 | dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", |
213 | dccp_role(sk), elapsed_time); | 252 | dccp_role(sk), elapsed_time); |
214 | break; | 253 | break; |
215 | case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: | 254 | case 128 ... 191: { |
255 | const u16 idx = value - options; | ||
256 | |||
216 | if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, | 257 | if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, |
217 | pkt_type, opt, value, len)) | 258 | opt, len, idx, |
259 | value) != 0) | ||
218 | goto out_invalid_option; | 260 | goto out_invalid_option; |
261 | } | ||
219 | break; | 262 | break; |
220 | case DCCPO_ACK_VECTOR_0: | 263 | case 192 ... 255: { |
221 | case DCCPO_ACK_VECTOR_1: | 264 | const u16 idx = value - options; |
222 | if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ | 265 | |
223 | break; | ||
224 | /* | ||
225 | * Ack vectors are processed by the TX CCID if it is | ||
226 | * interested. The RX CCID need not parse Ack Vectors, | ||
227 | * since it is only interested in clearing old state. | ||
228 | * Fall through. | ||
229 | */ | ||
230 | case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: | ||
231 | if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, | 266 | if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, |
232 | pkt_type, opt, value, len)) | 267 | opt, len, idx, |
268 | value) != 0) | ||
233 | goto out_invalid_option; | 269 | goto out_invalid_option; |
270 | } | ||
234 | break; | 271 | break; |
235 | default: | 272 | default: |
236 | DCCP_CRIT("DCCP(%p): option %d(len=%d) not " | 273 | DCCP_CRIT("DCCP(%p): option %d(len=%d) not " |
@@ -252,10 +289,8 @@ out_nonsensical_length: | |||
252 | 289 | ||
253 | out_invalid_option: | 290 | out_invalid_option: |
254 | DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); | 291 | DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); |
255 | rc = DCCP_RESET_CODE_OPTION_ERROR; | 292 | DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; |
256 | out_featneg_failed: | 293 | DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); |
257 | DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); | ||
258 | DCCP_SKB_CB(skb)->dccpd_reset_code = rc; | ||
259 | DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; | 294 | DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; |
260 | DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; | 295 | DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; |
261 | DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; | 296 | DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; |
@@ -264,12 +299,9 @@ out_featneg_failed: | |||
264 | 299 | ||
265 | EXPORT_SYMBOL_GPL(dccp_parse_options); | 300 | EXPORT_SYMBOL_GPL(dccp_parse_options); |
266 | 301 | ||
267 | void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) | 302 | static void dccp_encode_value_var(const u32 value, unsigned char *to, |
303 | const unsigned int len) | ||
268 | { | 304 | { |
269 | if (len >= DCCP_OPTVAL_MAXLEN) | ||
270 | *to++ = (value & 0xFF0000000000ull) >> 40; | ||
271 | if (len > 4) | ||
272 | *to++ = (value & 0xFF00000000ull) >> 32; | ||
273 | if (len > 3) | 305 | if (len > 3) |
274 | *to++ = (value & 0xFF000000) >> 24; | 306 | *to++ = (value & 0xFF000000) >> 24; |
275 | if (len > 2) | 307 | if (len > 2) |
@@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, | |||
429 | return 0; | 461 | return 0; |
430 | } | 462 | } |
431 | 463 | ||
432 | static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) | 464 | static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, |
465 | u8 *val, u8 len) | ||
433 | { | 466 | { |
434 | struct dccp_sock *dp = dccp_sk(sk); | 467 | u8 *to; |
435 | struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; | ||
436 | struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); | ||
437 | const u16 buflen = dccp_ackvec_buflen(av); | ||
438 | /* Figure out how many options do we need to represent the ackvec */ | ||
439 | const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); | ||
440 | u16 len = buflen + 2 * nr_opts; | ||
441 | u8 i, nonce = 0; | ||
442 | const unsigned char *tail, *from; | ||
443 | unsigned char *to; | ||
444 | 468 | ||
445 | if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { | 469 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { |
446 | DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, | 470 | DCCP_WARN("packet too small for feature %d option!\n", feat); |
447 | dccp_packet_name(dcb->dccpd_type)); | ||
448 | return -1; | 471 | return -1; |
449 | } | 472 | } |
450 | /* | ||
451 | * Since Ack Vectors are variable-length, we can not always predict | ||
452 | * their size. To catch exception cases where the space is running out | ||
453 | * on the skb, a separate Sync is scheduled to carry the Ack Vector. | ||
454 | */ | ||
455 | if (len > DCCPAV_MIN_OPTLEN && | ||
456 | len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { | ||
457 | DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " | ||
458 | "MPS=%u ==> reduce payload size?\n", len, skb->len, | ||
459 | dcb->dccpd_opt_len, dp->dccps_mss_cache); | ||
460 | dp->dccps_sync_scheduled = 1; | ||
461 | return 0; | ||
462 | } | ||
463 | dcb->dccpd_opt_len += len; | ||
464 | 473 | ||
465 | to = skb_push(skb, len); | 474 | DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; |
466 | len = buflen; | ||
467 | from = av->av_buf + av->av_buf_head; | ||
468 | tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; | ||
469 | 475 | ||
470 | for (i = 0; i < nr_opts; ++i) { | 476 | to = skb_push(skb, len + 3); |
471 | int copylen = len; | 477 | *to++ = type; |
472 | 478 | *to++ = len + 3; | |
473 | if (len > DCCP_SINGLE_OPT_MAXLEN) | 479 | *to++ = feat; |
474 | copylen = DCCP_SINGLE_OPT_MAXLEN; | ||
475 | |||
476 | /* | ||
477 | * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via | ||
478 | * its type; ack_nonce is the sum of all individual buf_nonce's. | ||
479 | */ | ||
480 | nonce ^= av->av_buf_nonce[i]; | ||
481 | |||
482 | *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; | ||
483 | *to++ = copylen + 2; | ||
484 | |||
485 | /* Check if buf_head wraps */ | ||
486 | if (from + copylen > tail) { | ||
487 | const u16 tailsize = tail - from; | ||
488 | |||
489 | memcpy(to, from, tailsize); | ||
490 | to += tailsize; | ||
491 | len -= tailsize; | ||
492 | copylen -= tailsize; | ||
493 | from = av->av_buf; | ||
494 | } | ||
495 | |||
496 | memcpy(to, from, copylen); | ||
497 | from += copylen; | ||
498 | to += copylen; | ||
499 | len -= copylen; | ||
500 | } | ||
501 | /* | ||
502 | * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. | ||
503 | */ | ||
504 | if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) | ||
505 | return -ENOBUFS; | ||
506 | return 0; | ||
507 | } | ||
508 | 480 | ||
509 | /** | 481 | if (len) |
510 | * dccp_insert_option_mandatory - Mandatory option (5.8.2) | 482 | memcpy(to, val, len); |
511 | * Note that since we are using skb_push, this function needs to be called | ||
512 | * _after_ inserting the option it is supposed to influence (stack order). | ||
513 | */ | ||
514 | int dccp_insert_option_mandatory(struct sk_buff *skb) | ||
515 | { | ||
516 | if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) | ||
517 | return -1; | ||
518 | 483 | ||
519 | DCCP_SKB_CB(skb)->dccpd_opt_len++; | 484 | dccp_pr_debug("%s(%s (%d), ...), length %d\n", |
520 | *skb_push(skb, 1) = DCCPO_MANDATORY; | 485 | dccp_feat_typename(type), |
486 | dccp_feat_name(feat), feat, len); | ||
521 | return 0; | 487 | return 0; |
522 | } | 488 | } |
523 | 489 | ||
524 | /** | 490 | static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) |
525 | * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb | ||
526 | * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R | ||
527 | * @feat: one out of %dccp_feature_numbers | ||
528 | * @val: NN value or SP array (preferred element first) to copy | ||
529 | * @len: true length of @val in bytes (excluding first element repetition) | ||
530 | * @repeat_first: whether to copy the first element of @val twice | ||
531 | * The last argument is used to construct Confirm options, where the preferred | ||
532 | * value and the preference list appear separately (RFC 4340, 6.3.1). Preference | ||
533 | * lists are kept such that the preferred entry is always first, so we only need | ||
534 | * to copy twice, and avoid the overhead of cloning into a bigger array. | ||
535 | */ | ||
536 | int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, | ||
537 | u8 *val, u8 len, bool repeat_first) | ||
538 | { | 491 | { |
539 | u8 tot_len, *to; | 492 | struct dccp_sock *dp = dccp_sk(sk); |
493 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
494 | struct dccp_opt_pend *opt, *next; | ||
495 | int change = 0; | ||
496 | |||
497 | /* confirm any options [NN opts] */ | ||
498 | list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { | ||
499 | dccp_insert_feat_opt(skb, opt->dccpop_type, | ||
500 | opt->dccpop_feat, opt->dccpop_val, | ||
501 | opt->dccpop_len); | ||
502 | /* fear empty confirms */ | ||
503 | if (opt->dccpop_val) | ||
504 | kfree(opt->dccpop_val); | ||
505 | kfree(opt); | ||
506 | } | ||
507 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | ||
508 | |||
509 | /* see which features we need to send */ | ||
510 | list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { | ||
511 | /* see if we need to send any confirm */ | ||
512 | if (opt->dccpop_sc) { | ||
513 | dccp_insert_feat_opt(skb, opt->dccpop_type + 1, | ||
514 | opt->dccpop_feat, | ||
515 | opt->dccpop_sc->dccpoc_val, | ||
516 | opt->dccpop_sc->dccpoc_len); | ||
517 | |||
518 | BUG_ON(!opt->dccpop_sc->dccpoc_val); | ||
519 | kfree(opt->dccpop_sc->dccpoc_val); | ||
520 | kfree(opt->dccpop_sc); | ||
521 | opt->dccpop_sc = NULL; | ||
522 | } | ||
540 | 523 | ||
541 | /* take the `Feature' field and possible repetition into account */ | 524 | /* any option not confirmed, re-send it */ |
542 | if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { | 525 | if (!opt->dccpop_conf) { |
543 | DCCP_WARN("length %u for feature %u too large\n", len, feat); | 526 | dccp_insert_feat_opt(skb, opt->dccpop_type, |
544 | return -1; | 527 | opt->dccpop_feat, opt->dccpop_val, |
528 | opt->dccpop_len); | ||
529 | change++; | ||
530 | } | ||
545 | } | 531 | } |
546 | 532 | ||
547 | if (unlikely(val == NULL || len == 0)) | 533 | /* Retransmit timer. |
548 | len = repeat_first = 0; | 534 | * If this is the master listening sock, we don't set a timer on it. It |
549 | tot_len = 3 + repeat_first + len; | 535 | * should be fine because if the dude doesn't receive our RESPONSE |
536 | * [which will contain the CHANGE] he will send another REQUEST which | ||
537 | * will "retransmit" the change. | ||
538 | */ | ||
539 | if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { | ||
540 | dccp_pr_debug("reset feat negotiation timer %p\n", sk); | ||
550 | 541 | ||
551 | if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { | 542 | /* XXX don't reset the timer on re-transmissions. I.e. reset it |
552 | DCCP_WARN("packet too small for feature %d option!\n", feat); | 543 | * only when sending new stuff I guess. Currently the timer |
553 | return -1; | 544 | * never backs off because on re-transmission it just resets it! |
545 | */ | ||
546 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
547 | inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); | ||
554 | } | 548 | } |
555 | DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; | ||
556 | |||
557 | to = skb_push(skb, tot_len); | ||
558 | *to++ = type; | ||
559 | *to++ = tot_len; | ||
560 | *to++ = feat; | ||
561 | 549 | ||
562 | if (repeat_first) | ||
563 | *to++ = *val; | ||
564 | if (len) | ||
565 | memcpy(to, val, len); | ||
566 | return 0; | 550 | return 0; |
567 | } | 551 | } |
568 | 552 | ||
@@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb) | |||
581 | int dccp_insert_options(struct sock *sk, struct sk_buff *skb) | 565 | int dccp_insert_options(struct sock *sk, struct sk_buff *skb) |
582 | { | 566 | { |
583 | struct dccp_sock *dp = dccp_sk(sk); | 567 | struct dccp_sock *dp = dccp_sk(sk); |
568 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
584 | 569 | ||
585 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; | 570 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; |
586 | 571 | ||
587 | if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) | 572 | if (dmsk->dccpms_send_ndp_count && |
573 | dccp_insert_option_ndp(sk, skb)) | ||
588 | return -1; | 574 | return -1; |
589 | 575 | ||
590 | if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { | 576 | if (!dccp_packet_without_ack(skb)) { |
591 | 577 | if (dmsk->dccpms_send_ack_vector && | |
592 | /* Feature Negotiation */ | 578 | dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && |
593 | if (dccp_feat_insert_opts(dp, NULL, skb)) | 579 | dccp_insert_option_ackvec(sk, skb)) |
594 | return -1; | 580 | return -1; |
595 | |||
596 | if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { | ||
597 | /* | ||
598 | * Obtain RTT sample from Request/Response exchange. | ||
599 | * This is currently used in CCID 3 initialisation. | ||
600 | */ | ||
601 | if (dccp_insert_option_timestamp(sk, skb)) | ||
602 | return -1; | ||
603 | |||
604 | } else if (dccp_ackvec_pending(sk) && | ||
605 | dccp_insert_option_ackvec(sk, skb)) { | ||
606 | return -1; | ||
607 | } | ||
608 | } | 581 | } |
609 | 582 | ||
610 | if (dp->dccps_hc_rx_insert_options) { | 583 | if (dp->dccps_hc_rx_insert_options) { |
@@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) | |||
613 | dp->dccps_hc_rx_insert_options = 0; | 586 | dp->dccps_hc_rx_insert_options = 0; |
614 | } | 587 | } |
615 | 588 | ||
589 | /* Feature negotiation */ | ||
590 | /* Data packets can't do feat negotiation */ | ||
591 | if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && | ||
592 | DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && | ||
593 | dccp_insert_options_feat(sk, skb)) | ||
594 | return -1; | ||
595 | |||
596 | /* | ||
597 | * Obtain RTT sample from Request/Response exchange. | ||
598 | * This is currently used in CCID 3 initialisation. | ||
599 | */ | ||
600 | if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && | ||
601 | dccp_insert_option_timestamp(sk, skb)) | ||
602 | return -1; | ||
603 | |||
616 | if (dp->dccps_timestamp_echo != 0 && | 604 | if (dp->dccps_timestamp_echo != 0 && |
617 | dccp_insert_option_timestamp_echo(dp, NULL, skb)) | 605 | dccp_insert_option_timestamp_echo(dp, NULL, skb)) |
618 | return -1; | 606 | return -1; |
@@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) | |||
625 | { | 613 | { |
626 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; | 614 | DCCP_SKB_CB(skb)->dccpd_opt_len = 0; |
627 | 615 | ||
628 | if (dccp_feat_insert_opts(NULL, dreq, skb)) | ||
629 | return -1; | ||
630 | |||
631 | if (dreq->dreq_timestamp_echo != 0 && | 616 | if (dreq->dreq_timestamp_echo != 0 && |
632 | dccp_insert_option_timestamp_echo(NULL, dreq, skb)) | 617 | dccp_insert_option_timestamp_echo(NULL, dreq, skb)) |
633 | return -1; | 618 | return -1; |
diff --git a/net/dccp/output.c b/net/dccp/output.c index 2532797a8009..d06945c7d3df 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c | |||
@@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk) | |||
26 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); | 26 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); |
27 | } | 27 | } |
28 | 28 | ||
29 | /* enqueue @skb on sk_send_head for retransmission, return clone to send now */ | 29 | static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) |
30 | static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) | ||
31 | { | 30 | { |
32 | skb_set_owner_w(skb, sk); | 31 | skb_set_owner_w(skb, sk); |
33 | WARN_ON(sk->sk_send_head); | 32 | WARN_ON(sk->sk_send_head); |
34 | sk->sk_send_head = skb; | 33 | sk->sk_send_head = skb; |
35 | return skb_clone(sk->sk_send_head, gfp_any()); | ||
36 | } | 34 | } |
37 | 35 | ||
38 | /* | 36 | /* |
@@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) | |||
163 | struct inet_connection_sock *icsk = inet_csk(sk); | 161 | struct inet_connection_sock *icsk = inet_csk(sk); |
164 | struct dccp_sock *dp = dccp_sk(sk); | 162 | struct dccp_sock *dp = dccp_sk(sk); |
165 | u32 ccmps = dccp_determine_ccmps(dp); | 163 | u32 ccmps = dccp_determine_ccmps(dp); |
166 | u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; | 164 | int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; |
167 | 165 | ||
168 | /* Account for header lengths and IPv4/v6 option overhead */ | 166 | /* Account for header lengths and IPv4/v6 option overhead */ |
169 | cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + | 167 | cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + |
170 | sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); | 168 | sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); |
171 | 169 | ||
172 | /* | 170 | /* |
173 | * Leave enough headroom for common DCCP header options. | 171 | * FIXME: this should come from the CCID infrastructure, where, say, |
174 | * This only considers options which may appear on DCCP-Data packets, as | 172 | * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now let's |
175 | * per table 3 in RFC 4340, 5.8. When running out of space for other | 173 | * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED |
176 | * options (eg. Ack Vector which can take up to 255 bytes), it is better | 174 | * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to |
177 | * to schedule a separate Ack. Thus we leave headroom for the following: | 175 | * make it a multiple of 4 |
178 | * - 1 byte for Slow Receiver (11.6) | ||
179 | * - 6 bytes for Timestamp (13.1) | ||
180 | * - 10 bytes for Timestamp Echo (13.3) | ||
181 | * - 8 bytes for NDP count (7.7, when activated) | ||
182 | * - 6 bytes for Data Checksum (9.3) | ||
183 | * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) | ||
184 | */ | 176 | */ |
185 | cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + | 177 | |
186 | (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); | 178 | cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; |
187 | 179 | ||
188 | /* And store cached results */ | 180 | /* And store cached results */ |
189 | icsk->icsk_pmtu_cookie = pmtu; | 181 | icsk->icsk_pmtu_cookie = pmtu; |
@@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk) | |||
208 | } | 200 | } |
209 | 201 | ||
210 | /** | 202 | /** |
211 | * dccp_wait_for_ccid - Await CCID send permission | 203 | * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet |
212 | * @sk: socket to wait for | 204 | * @sk: socket to wait for |
213 | * @delay: timeout in jiffies | 205 | * @skb: current skb to pass on for waiting |
214 | * This is used by CCIDs which need to delay the send time in process context. | 206 | * @delay: sleep timeout in milliseconds (> 0) |
207 | * This function is called by default when the socket is closed, and | ||
208 | * when a non-zero linger time is set on the socket. For consistency | ||
215 | */ | 209 | */ |
216 | static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) | 210 | static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) |
217 | { | 211 | { |
212 | struct dccp_sock *dp = dccp_sk(sk); | ||
218 | DEFINE_WAIT(wait); | 213 | DEFINE_WAIT(wait); |
219 | long remaining; | 214 | unsigned long jiffdelay; |
220 | 215 | int rc; | |
221 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); | ||
222 | sk->sk_write_pending++; | ||
223 | release_sock(sk); | ||
224 | 216 | ||
225 | remaining = schedule_timeout(delay); | 217 | do { |
226 | 218 | dccp_pr_debug("delayed send by %d msec\n", delay); | |
227 | lock_sock(sk); | 219 | jiffdelay = msecs_to_jiffies(delay); |
228 | sk->sk_write_pending--; | ||
229 | finish_wait(sk->sk_sleep, &wait); | ||
230 | 220 | ||
231 | if (signal_pending(current) || sk->sk_err) | 221 | prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); |
232 | return -1; | ||
233 | return remaining; | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * dccp_xmit_packet - Send data packet under control of CCID | ||
238 | * Transmits next-queued payload and informs CCID to account for the packet. | ||
239 | */ | ||
240 | static void dccp_xmit_packet(struct sock *sk) | ||
241 | { | ||
242 | int err, len; | ||
243 | struct dccp_sock *dp = dccp_sk(sk); | ||
244 | struct sk_buff *skb = dccp_qpolicy_pop(sk); | ||
245 | 222 | ||
246 | if (unlikely(skb == NULL)) | 223 | sk->sk_write_pending++; |
247 | return; | 224 | release_sock(sk); |
248 | len = skb->len; | 225 | schedule_timeout(jiffdelay); |
226 | lock_sock(sk); | ||
227 | sk->sk_write_pending--; | ||
249 | 228 | ||
250 | if (sk->sk_state == DCCP_PARTOPEN) { | 229 | if (sk->sk_err) |
251 | const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; | 230 | goto do_error; |
252 | /* | 231 | if (signal_pending(current)) |
253 | * See 8.1.5 - Handshake Completion. | 232 | goto do_interrupted; |
254 | * | ||
255 | * For robustness we resend Confirm options until the client has | ||
256 | * entered OPEN. During the initial feature negotiation, the MPS | ||
257 | * is smaller than usual, reduced by the Change/Confirm options. | ||
258 | */ | ||
259 | if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { | ||
260 | DCCP_WARN("Payload too large (%d) for featneg.\n", len); | ||
261 | dccp_send_ack(sk); | ||
262 | dccp_feat_list_purge(&dp->dccps_featneg); | ||
263 | } | ||
264 | 233 | ||
265 | inet_csk_schedule_ack(sk); | 234 | rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); |
266 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | 235 | } while ((delay = rc) > 0); |
267 | inet_csk(sk)->icsk_rto, | 236 | out: |
268 | DCCP_RTO_MAX); | 237 | finish_wait(sk->sk_sleep, &wait); |
269 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; | 238 | return rc; |
270 | } else if (dccp_ack_pending(sk)) { | 239 | |
271 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; | 240 | do_error: |
272 | } else { | 241 | rc = -EPIPE; |
273 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; | 242 | goto out; |
274 | } | 243 | do_interrupted: |
275 | 244 | rc = -EINTR; | |
276 | err = dccp_transmit_skb(sk, skb); | 245 | goto out; |
277 | if (err) | ||
278 | dccp_pr_debug("transmit_skb() returned err=%d\n", err); | ||
279 | /* | ||
280 | * Register this one as sent even if an error occurred. To the remote | ||
281 | * end a local packet drop is indistinguishable from network loss, i.e. | ||
282 | * any local drop will eventually be reported via receiver feedback. | ||
283 | */ | ||
284 | ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); | ||
285 | |||
286 | /* | ||
287 | * If the CCID needs to transfer additional header options out-of-band | ||
288 | * (e.g. Ack Vectors or feature-negotiation options), it activates this | ||
289 | * flag to schedule a Sync. The Sync will automatically incorporate all | ||
290 | * currently pending header options, thus clearing the backlog. | ||
291 | */ | ||
292 | if (dp->dccps_sync_scheduled) | ||
293 | dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); | ||
294 | } | 246 | } |
295 | 247 | ||
296 | /** | 248 | void dccp_write_xmit(struct sock *sk, int block) |
297 | * dccp_flush_write_queue - Drain queue at end of connection | ||
298 | * Since dccp_sendmsg queues packets without waiting for them to be sent, it may | ||
299 | * happen that the TX queue is not empty at the end of a connection. We give the | ||
300 | * HC-sender CCID a grace period of up to @time_budget jiffies. If this function | ||
301 | * returns with a non-empty write queue, it will be purged later. | ||
302 | */ | ||
303 | void dccp_flush_write_queue(struct sock *sk, long *time_budget) | ||
304 | { | 249 | { |
305 | struct dccp_sock *dp = dccp_sk(sk); | 250 | struct dccp_sock *dp = dccp_sk(sk); |
306 | struct sk_buff *skb; | 251 | struct sk_buff *skb; |
307 | long delay, rc; | ||
308 | |||
309 | while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { | ||
310 | rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); | ||
311 | 252 | ||
312 | switch (ccid_packet_dequeue_eval(rc)) { | 253 | while ((skb = skb_peek(&sk->sk_write_queue))) { |
313 | case CCID_PACKET_WILL_DEQUEUE_LATER: | 254 | int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); |
314 | /* | 255 | |
315 | * If the CCID determines when to send, the next sending | 256 | if (err > 0) { |
316 | * time is unknown or the CCID may not even send again | 257 | if (!block) { |
317 | * (e.g. remote host crashes or lost Ack packets). | 258 | sk_reset_timer(sk, &dp->dccps_xmit_timer, |
318 | */ | 259 | msecs_to_jiffies(err)+jiffies); |
319 | DCCP_WARN("CCID did not manage to send all packets\n"); | 260 | break; |
320 | return; | 261 | } else |
321 | case CCID_PACKET_DELAY: | 262 | err = dccp_wait_for_ccid(sk, skb, err); |
322 | delay = msecs_to_jiffies(rc); | 263 | if (err && err != -EINTR) |
323 | if (delay > *time_budget) | 264 | DCCP_BUG("err=%d after dccp_wait_for_ccid", err); |
324 | return; | ||
325 | rc = dccp_wait_for_ccid(sk, delay); | ||
326 | if (rc < 0) | ||
327 | return; | ||
328 | *time_budget -= (delay - rc); | ||
329 | /* check again if we can send now */ | ||
330 | break; | ||
331 | case CCID_PACKET_SEND_AT_ONCE: | ||
332 | dccp_xmit_packet(sk); | ||
333 | break; | ||
334 | case CCID_PACKET_ERR: | ||
335 | skb_dequeue(&sk->sk_write_queue); | ||
336 | kfree_skb(skb); | ||
337 | dccp_pr_debug("packet discarded due to err=%ld\n", rc); | ||
338 | } | 265 | } |
339 | } | ||
340 | } | ||
341 | 266 | ||
342 | void dccp_write_xmit(struct sock *sk) | 267 | skb_dequeue(&sk->sk_write_queue); |
343 | { | 268 | if (err == 0) { |
344 | struct dccp_sock *dp = dccp_sk(sk); | 269 | struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); |
345 | struct sk_buff *skb; | 270 | const int len = skb->len; |
346 | 271 | ||
347 | while ((skb = dccp_qpolicy_top(sk))) { | 272 | if (sk->sk_state == DCCP_PARTOPEN) { |
348 | int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); | 273 | /* See 8.1.5. Handshake Completion */ |
349 | 274 | inet_csk_schedule_ack(sk); | |
350 | switch (ccid_packet_dequeue_eval(rc)) { | 275 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
351 | case CCID_PACKET_WILL_DEQUEUE_LATER: | 276 | inet_csk(sk)->icsk_rto, |
352 | return; | 277 | DCCP_RTO_MAX); |
353 | case CCID_PACKET_DELAY: | 278 | dcb->dccpd_type = DCCP_PKT_DATAACK; |
354 | sk_reset_timer(sk, &dp->dccps_xmit_timer, | 279 | } else if (dccp_ack_pending(sk)) |
355 | jiffies + msecs_to_jiffies(rc)); | 280 | dcb->dccpd_type = DCCP_PKT_DATAACK; |
356 | return; | 281 | else |
357 | case CCID_PACKET_SEND_AT_ONCE: | 282 | dcb->dccpd_type = DCCP_PKT_DATA; |
358 | dccp_xmit_packet(sk); | 283 | |
359 | break; | 284 | err = dccp_transmit_skb(sk, skb); |
360 | case CCID_PACKET_ERR: | 285 | ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); |
361 | dccp_qpolicy_drop(sk, skb); | 286 | if (err) |
362 | dccp_pr_debug("packet discarded due to err=%d\n", rc); | 287 | DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", |
288 | err); | ||
289 | } else { | ||
290 | dccp_pr_debug("packet discarded due to err=%d\n", err); | ||
291 | kfree_skb(skb); | ||
363 | } | 292 | } |
364 | } | 293 | } |
365 | } | 294 | } |
@@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, | |||
410 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; | 339 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; |
411 | DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; | 340 | DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; |
412 | 341 | ||
413 | /* Resolve feature dependencies resulting from choice of CCID */ | 342 | if (dccp_insert_options_rsk(dreq, skb)) { |
414 | if (dccp_feat_server_ccid_dependencies(dreq)) | 343 | kfree_skb(skb); |
415 | goto response_failed; | 344 | return NULL; |
416 | 345 | } | |
417 | if (dccp_insert_options_rsk(dreq, skb)) | ||
418 | goto response_failed; | ||
419 | 346 | ||
420 | /* Build and checksum header */ | 347 | /* Build and checksum header */ |
421 | dh = dccp_zeroed_hdr(skb, dccp_header_size); | 348 | dh = dccp_zeroed_hdr(skb, dccp_header_size); |
@@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, | |||
436 | inet_rsk(req)->acked = 1; | 363 | inet_rsk(req)->acked = 1; |
437 | DCCP_INC_STATS(DCCP_MIB_OUTSEGS); | 364 | DCCP_INC_STATS(DCCP_MIB_OUTSEGS); |
438 | return skb; | 365 | return skb; |
439 | response_failed: | ||
440 | kfree_skb(skb); | ||
441 | return NULL; | ||
442 | } | 366 | } |
443 | 367 | ||
444 | EXPORT_SYMBOL_GPL(dccp_make_response); | 368 | EXPORT_SYMBOL_GPL(dccp_make_response); |
@@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) | |||
523 | /* | 447 | /* |
524 | * Do all connect socket setups that can be done AF independent. | 448 | * Do all connect socket setups that can be done AF independent. |
525 | */ | 449 | */ |
526 | int dccp_connect(struct sock *sk) | 450 | static inline void dccp_connect_init(struct sock *sk) |
527 | { | 451 | { |
528 | struct sk_buff *skb; | ||
529 | struct dccp_sock *dp = dccp_sk(sk); | 452 | struct dccp_sock *dp = dccp_sk(sk); |
530 | struct dst_entry *dst = __sk_dst_get(sk); | 453 | struct dst_entry *dst = __sk_dst_get(sk); |
531 | struct inet_connection_sock *icsk = inet_csk(sk); | 454 | struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk) | |||
535 | 458 | ||
536 | dccp_sync_mss(sk, dst_mtu(dst)); | 459 | dccp_sync_mss(sk, dst_mtu(dst)); |
537 | 460 | ||
538 | /* do not connect if feature negotiation setup fails */ | ||
539 | if (dccp_feat_finalise_settings(dccp_sk(sk))) | ||
540 | return -EPROTO; | ||
541 | |||
542 | /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ | 461 | /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ |
543 | dp->dccps_gar = dp->dccps_iss; | 462 | dp->dccps_gar = dp->dccps_iss; |
544 | 463 | ||
464 | icsk->icsk_retransmits = 0; | ||
465 | } | ||
466 | |||
467 | int dccp_connect(struct sock *sk) | ||
468 | { | ||
469 | struct sk_buff *skb; | ||
470 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
471 | |||
472 | dccp_connect_init(sk); | ||
473 | |||
545 | skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); | 474 | skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); |
546 | if (unlikely(skb == NULL)) | 475 | if (unlikely(skb == NULL)) |
547 | return -ENOBUFS; | 476 | return -ENOBUFS; |
@@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk) | |||
551 | 480 | ||
552 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; | 481 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; |
553 | 482 | ||
554 | dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); | 483 | dccp_skb_entail(sk, skb); |
484 | dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); | ||
555 | DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); | 485 | DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); |
556 | 486 | ||
557 | /* Timer for repeating the REQUEST until an answer. */ | 487 | /* Timer for repeating the REQUEST until an answer. */ |
558 | icsk->icsk_retransmits = 0; | ||
559 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 488 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
560 | icsk->icsk_rto, DCCP_RTO_MAX); | 489 | icsk->icsk_rto, DCCP_RTO_MAX); |
561 | return 0; | 490 | return 0; |
@@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, | |||
642 | DCCP_SKB_CB(skb)->dccpd_type = pkt_type; | 571 | DCCP_SKB_CB(skb)->dccpd_type = pkt_type; |
643 | DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; | 572 | DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; |
644 | 573 | ||
645 | /* | ||
646 | * Clear the flag in case the Sync was scheduled for out-of-band data, | ||
647 | * such as carrying a long Ack Vector. | ||
648 | */ | ||
649 | dccp_sk(sk)->dccps_sync_scheduled = 0; | ||
650 | |||
651 | dccp_transmit_skb(sk, skb); | 574 | dccp_transmit_skb(sk, skb); |
652 | } | 575 | } |
653 | 576 | ||
@@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active) | |||
676 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; | 599 | DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; |
677 | 600 | ||
678 | if (active) { | 601 | if (active) { |
679 | skb = dccp_skb_entail(sk, skb); | 602 | dccp_write_xmit(sk, 1); |
603 | dccp_skb_entail(sk, skb); | ||
604 | dccp_transmit_skb(sk, skb_clone(skb, prio)); | ||
680 | /* | 605 | /* |
681 | * Retransmission timer for active-close: RFC 4340, 8.3 requires | 606 | * Retransmission timer for active-close: RFC 4340, 8.3 requires |
682 | * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ | 607 | * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ |
@@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active) | |||
689 | */ | 614 | */ |
690 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 615 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
691 | DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); | 616 | DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); |
692 | } | 617 | } else |
693 | dccp_transmit_skb(sk, skb); | 618 | dccp_transmit_skb(sk, skb); |
694 | } | 619 | } |
diff --git a/net/dccp/probe.c b/net/dccp/probe.c index eaa59d82ab0f..81368a7f5379 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c | |||
@@ -46,54 +46,75 @@ static struct { | |||
46 | struct kfifo *fifo; | 46 | struct kfifo *fifo; |
47 | spinlock_t lock; | 47 | spinlock_t lock; |
48 | wait_queue_head_t wait; | 48 | wait_queue_head_t wait; |
49 | ktime_t start; | 49 | struct timespec tstart; |
50 | } dccpw; | 50 | } dccpw; |
51 | 51 | ||
52 | static void jdccp_write_xmit(struct sock *sk) | 52 | static void printl(const char *fmt, ...) |
53 | { | 53 | { |
54 | const struct inet_sock *inet = inet_sk(sk); | 54 | va_list args; |
55 | struct ccid3_hc_tx_sock *hctx = NULL; | 55 | int len; |
56 | struct timespec tv; | 56 | struct timespec now; |
57 | char buf[256]; | 57 | char tbuf[256]; |
58 | int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); | ||
59 | 58 | ||
60 | if (ccid == DCCPC_CCID3) | 59 | va_start(args, fmt); |
61 | hctx = ccid3_hc_tx_sk(sk); | 60 | getnstimeofday(&now); |
62 | 61 | ||
63 | if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { | 62 | now = timespec_sub(now, dccpw.tstart); |
64 | 63 | ||
65 | tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); | 64 | len = sprintf(tbuf, "%lu.%06lu ", |
66 | len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", | 65 | (unsigned long) now.tv_sec, |
67 | (unsigned long)tv.tv_sec, | 66 | (unsigned long) now.tv_nsec / NSEC_PER_USEC); |
68 | (unsigned long)tv.tv_nsec, | 67 | len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); |
69 | NIPQUAD(inet->saddr), ntohs(inet->sport), | 68 | va_end(args); |
70 | NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); | ||
71 | 69 | ||
70 | kfifo_put(dccpw.fifo, tbuf, len); | ||
71 | wake_up(&dccpw.wait); | ||
72 | } | ||
73 | |||
74 | static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, | ||
75 | struct msghdr *msg, size_t size) | ||
76 | { | ||
77 | const struct dccp_minisock *dmsk = dccp_msk(sk); | ||
78 | const struct inet_sock *inet = inet_sk(sk); | ||
79 | const struct ccid3_hc_tx_sock *hctx; | ||
80 | |||
81 | if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) | ||
82 | hctx = ccid3_hc_tx_sk(sk); | ||
83 | else | ||
84 | hctx = NULL; | ||
85 | |||
86 | if (port == 0 || ntohs(inet->dport) == port || | ||
87 | ntohs(inet->sport) == port) { | ||
72 | if (hctx) | 88 | if (hctx) |
73 | len += sprintf(buf + len, " %d %d %d %u %u %u %d", | 89 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " |
74 | hctx->s, hctx->rtt, hctx->p, hctx->x_calc, | 90 | "%llu %llu %d\n", |
75 | (unsigned)(hctx->x_recv >> 6), | 91 | NIPQUAD(inet->saddr), ntohs(inet->sport), |
76 | (unsigned)(hctx->x >> 6), hctx->t_ipi); | 92 | NIPQUAD(inet->daddr), ntohs(inet->dport), size, |
77 | 93 | hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, | |
78 | len += sprintf(buf + len, "\n"); | 94 | hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, |
79 | kfifo_put(dccpw.fifo, buf, len); | 95 | hctx->ccid3hctx_x_recv >> 6, |
80 | wake_up(&dccpw.wait); | 96 | hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); |
97 | else | ||
98 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", | ||
99 | NIPQUAD(inet->saddr), ntohs(inet->sport), | ||
100 | NIPQUAD(inet->daddr), ntohs(inet->dport), size); | ||
81 | } | 101 | } |
82 | 102 | ||
83 | jprobe_return(); | 103 | jprobe_return(); |
104 | return 0; | ||
84 | } | 105 | } |
85 | 106 | ||
86 | static struct jprobe dccp_send_probe = { | 107 | static struct jprobe dccp_send_probe = { |
87 | .kp = { | 108 | .kp = { |
88 | .symbol_name = "dccp_write_xmit", | 109 | .symbol_name = "dccp_sendmsg", |
89 | }, | 110 | }, |
90 | .entry = jdccp_write_xmit, | 111 | .entry = jdccp_sendmsg, |
91 | }; | 112 | }; |
92 | 113 | ||
93 | static int dccpprobe_open(struct inode *inode, struct file *file) | 114 | static int dccpprobe_open(struct inode *inode, struct file *file) |
94 | { | 115 | { |
95 | kfifo_reset(dccpw.fifo); | 116 | kfifo_reset(dccpw.fifo); |
96 | dccpw.start = ktime_get(); | 117 | getnstimeofday(&dccpw.tstart); |
97 | return 0; | 118 | return 0; |
98 | } | 119 | } |
99 | 120 | ||
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ecf3be961e11..d0bd34819761 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state) | |||
67 | case DCCP_OPEN: | 67 | case DCCP_OPEN: |
68 | if (oldstate != DCCP_OPEN) | 68 | if (oldstate != DCCP_OPEN) |
69 | DCCP_INC_STATS(DCCP_MIB_CURRESTAB); | 69 | DCCP_INC_STATS(DCCP_MIB_CURRESTAB); |
70 | /* Client retransmits all Confirm options until entering OPEN */ | ||
71 | if (oldstate == DCCP_PARTOPEN) | ||
72 | dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); | ||
73 | break; | 70 | break; |
74 | 71 | ||
75 | case DCCP_CLOSED: | 72 | case DCCP_CLOSED: |
@@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name); | |||
178 | int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) | 175 | int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) |
179 | { | 176 | { |
180 | struct dccp_sock *dp = dccp_sk(sk); | 177 | struct dccp_sock *dp = dccp_sk(sk); |
178 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
181 | struct inet_connection_sock *icsk = inet_csk(sk); | 179 | struct inet_connection_sock *icsk = inet_csk(sk); |
182 | 180 | ||
181 | dccp_minisock_init(&dp->dccps_minisock); | ||
182 | |||
183 | icsk->icsk_rto = DCCP_TIMEOUT_INIT; | 183 | icsk->icsk_rto = DCCP_TIMEOUT_INIT; |
184 | icsk->icsk_syn_retries = sysctl_dccp_request_retries; | 184 | icsk->icsk_syn_retries = sysctl_dccp_request_retries; |
185 | sk->sk_state = DCCP_CLOSED; | 185 | sk->sk_state = DCCP_CLOSED; |
186 | sk->sk_write_space = dccp_write_space; | 186 | sk->sk_write_space = dccp_write_space; |
187 | icsk->icsk_sync_mss = dccp_sync_mss; | 187 | icsk->icsk_sync_mss = dccp_sync_mss; |
188 | dp->dccps_mss_cache = TCP_MIN_RCVMSS; | 188 | dp->dccps_mss_cache = 536; |
189 | dp->dccps_rate_last = jiffies; | 189 | dp->dccps_rate_last = jiffies; |
190 | dp->dccps_role = DCCP_ROLE_UNDEFINED; | 190 | dp->dccps_role = DCCP_ROLE_UNDEFINED; |
191 | dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; | 191 | dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; |
192 | dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; | 192 | dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; |
193 | 193 | ||
194 | dccp_init_xmit_timers(sk); | 194 | dccp_init_xmit_timers(sk); |
195 | 195 | ||
196 | INIT_LIST_HEAD(&dp->dccps_featneg); | 196 | /* |
197 | /* control socket doesn't need feat nego */ | 197 | * FIXME: We're hardcoding the CCID, and doing this at this point makes |
198 | if (likely(ctl_sock_initialized)) | 198 | * the listening (master) sock get CCID control blocks, which is not |
199 | return dccp_feat_init(sk); | 199 | * necessary, but for now, to not mess with the test userspace apps, |
200 | * lets leave it here, later the real solution is to do this in a | ||
201 | * setsockopt(CCIDs-I-want/accept). -acme | ||
202 | */ | ||
203 | if (likely(ctl_sock_initialized)) { | ||
204 | int rc = dccp_feat_init(dmsk); | ||
205 | |||
206 | if (rc) | ||
207 | return rc; | ||
208 | |||
209 | if (dmsk->dccpms_send_ack_vector) { | ||
210 | dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); | ||
211 | if (dp->dccps_hc_rx_ackvec == NULL) | ||
212 | return -ENOMEM; | ||
213 | } | ||
214 | dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, | ||
215 | sk, GFP_KERNEL); | ||
216 | dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, | ||
217 | sk, GFP_KERNEL); | ||
218 | if (unlikely(dp->dccps_hc_rx_ccid == NULL || | ||
219 | dp->dccps_hc_tx_ccid == NULL)) { | ||
220 | ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); | ||
221 | ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); | ||
222 | if (dmsk->dccpms_send_ack_vector) { | ||
223 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | ||
224 | dp->dccps_hc_rx_ackvec = NULL; | ||
225 | } | ||
226 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | ||
227 | return -ENOMEM; | ||
228 | } | ||
229 | } else { | ||
230 | /* control socket doesn't need feat nego */ | ||
231 | INIT_LIST_HEAD(&dmsk->dccpms_pending); | ||
232 | INIT_LIST_HEAD(&dmsk->dccpms_conf); | ||
233 | } | ||
234 | |||
200 | return 0; | 235 | return 0; |
201 | } | 236 | } |
202 | 237 | ||
@@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); | |||
205 | void dccp_destroy_sock(struct sock *sk) | 240 | void dccp_destroy_sock(struct sock *sk) |
206 | { | 241 | { |
207 | struct dccp_sock *dp = dccp_sk(sk); | 242 | struct dccp_sock *dp = dccp_sk(sk); |
243 | struct dccp_minisock *dmsk = dccp_msk(sk); | ||
208 | 244 | ||
209 | /* | 245 | /* |
210 | * DCCP doesn't use sk_write_queue, just sk_send_head | 246 | * DCCP doesn't use sk_write_queue, just sk_send_head |
@@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk) | |||
222 | kfree(dp->dccps_service_list); | 258 | kfree(dp->dccps_service_list); |
223 | dp->dccps_service_list = NULL; | 259 | dp->dccps_service_list = NULL; |
224 | 260 | ||
225 | if (dp->dccps_hc_rx_ackvec != NULL) { | 261 | if (dmsk->dccpms_send_ack_vector) { |
226 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); | 262 | dccp_ackvec_free(dp->dccps_hc_rx_ackvec); |
227 | dp->dccps_hc_rx_ackvec = NULL; | 263 | dp->dccps_hc_rx_ackvec = NULL; |
228 | } | 264 | } |
@@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk) | |||
231 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; | 267 | dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; |
232 | 268 | ||
233 | /* clean up feature negotiation state */ | 269 | /* clean up feature negotiation state */ |
234 | dccp_feat_list_purge(&dp->dccps_featneg); | 270 | dccp_feat_clean(dmsk); |
235 | } | 271 | } |
236 | 272 | ||
237 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); | 273 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); |
@@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) | |||
241 | struct dccp_sock *dp = dccp_sk(sk); | 277 | struct dccp_sock *dp = dccp_sk(sk); |
242 | 278 | ||
243 | dp->dccps_role = DCCP_ROLE_LISTEN; | 279 | dp->dccps_role = DCCP_ROLE_LISTEN; |
244 | /* do not start to listen if feature negotiation setup fails */ | ||
245 | if (dccp_feat_finalise_settings(dp)) | ||
246 | return -EPROTO; | ||
247 | return inet_csk_listen_start(sk, backlog); | 280 | return inet_csk_listen_start(sk, backlog); |
248 | } | 281 | } |
249 | 282 | ||
@@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, | |||
433 | return 0; | 466 | return 0; |
434 | } | 467 | } |
435 | 468 | ||
436 | static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) | 469 | /* byte 1 is feature. the rest is the preference list */ |
470 | static int dccp_setsockopt_change(struct sock *sk, int type, | ||
471 | struct dccp_so_feat __user *optval) | ||
437 | { | 472 | { |
438 | u8 *list, len; | 473 | struct dccp_so_feat opt; |
439 | int i, rc; | 474 | u8 *val; |
475 | int rc; | ||
440 | 476 | ||
441 | if (cscov < 0 || cscov > 15) | 477 | if (copy_from_user(&opt, optval, sizeof(opt))) |
442 | return -EINVAL; | 478 | return -EFAULT; |
443 | /* | 479 | /* |
444 | * Populate a list of permissible values, in the range cscov...15. This | 480 | * rfc4340: 6.1. Change Options |
445 | * is necessary since feature negotiation of single values only works if | ||
446 | * both sides incidentally choose the same value. Since the list starts | ||
447 | * lowest-value first, negotiation will pick the smallest shared value. | ||
448 | */ | 481 | */ |
449 | if (cscov == 0) | 482 | if (opt.dccpsf_len < 1) |
450 | return 0; | ||
451 | len = 16 - cscov; | ||
452 | |||
453 | list = kmalloc(len, GFP_KERNEL); | ||
454 | if (list == NULL) | ||
455 | return -ENOBUFS; | ||
456 | |||
457 | for (i = 0; i < len; i++) | ||
458 | list[i] = cscov++; | ||
459 | |||
460 | rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); | ||
461 | |||
462 | if (rc == 0) { | ||
463 | if (rx) | ||
464 | dccp_sk(sk)->dccps_pcrlen = cscov; | ||
465 | else | ||
466 | dccp_sk(sk)->dccps_pcslen = cscov; | ||
467 | } | ||
468 | kfree(list); | ||
469 | return rc; | ||
470 | } | ||
471 | |||
472 | static int dccp_setsockopt_ccid(struct sock *sk, int type, | ||
473 | char __user *optval, int optlen) | ||
474 | { | ||
475 | u8 *val; | ||
476 | int rc = 0; | ||
477 | |||
478 | if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) | ||
479 | return -EINVAL; | 483 | return -EINVAL; |
480 | 484 | ||
481 | val = kmalloc(optlen, GFP_KERNEL); | 485 | val = kmalloc(opt.dccpsf_len, GFP_KERNEL); |
482 | if (val == NULL) | 486 | if (!val) |
483 | return -ENOMEM; | 487 | return -ENOMEM; |
484 | 488 | ||
485 | if (copy_from_user(val, optval, optlen)) { | 489 | if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { |
486 | kfree(val); | 490 | rc = -EFAULT; |
487 | return -EFAULT; | 491 | goto out_free_val; |
488 | } | 492 | } |
489 | 493 | ||
490 | lock_sock(sk); | 494 | rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, |
491 | if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) | 495 | val, opt.dccpsf_len, GFP_KERNEL); |
492 | rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); | 496 | if (rc) |
497 | goto out_free_val; | ||
493 | 498 | ||
494 | if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) | 499 | out: |
495 | rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); | 500 | return rc; |
496 | release_sock(sk); | ||
497 | 501 | ||
502 | out_free_val: | ||
498 | kfree(val); | 503 | kfree(val); |
499 | return rc; | 504 | goto out; |
500 | } | 505 | } |
501 | 506 | ||
502 | static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | 507 | static int do_dccp_setsockopt(struct sock *sk, int level, int optname, |
@@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | |||
505 | struct dccp_sock *dp = dccp_sk(sk); | 510 | struct dccp_sock *dp = dccp_sk(sk); |
506 | int val, err = 0; | 511 | int val, err = 0; |
507 | 512 | ||
508 | switch (optname) { | 513 | if (optlen < sizeof(int)) |
509 | case DCCP_SOCKOPT_PACKET_SIZE: | ||
510 | DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); | ||
511 | return 0; | ||
512 | case DCCP_SOCKOPT_CHANGE_L: | ||
513 | case DCCP_SOCKOPT_CHANGE_R: | ||
514 | DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); | ||
515 | return 0; | ||
516 | case DCCP_SOCKOPT_CCID: | ||
517 | case DCCP_SOCKOPT_RX_CCID: | ||
518 | case DCCP_SOCKOPT_TX_CCID: | ||
519 | return dccp_setsockopt_ccid(sk, optname, optval, optlen); | ||
520 | } | ||
521 | |||
522 | if (optlen < (int)sizeof(int)) | ||
523 | return -EINVAL; | 514 | return -EINVAL; |
524 | 515 | ||
525 | if (get_user(val, (int __user *)optval)) | 516 | if (get_user(val, (int __user *)optval)) |
@@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, | |||
530 | 521 | ||
531 | lock_sock(sk); | 522 | lock_sock(sk); |
532 | switch (optname) { | 523 | switch (optname) { |
524 | case DCCP_SOCKOPT_PACKET_SIZE: | ||
525 | DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); | ||
526 | err = 0; | ||
527 | break; | ||
528 | case DCCP_SOCKOPT_CHANGE_L: | ||
529 | if (optlen != sizeof(struct dccp_so_feat)) | ||
530 | err = -EINVAL; | ||
531 | else | ||
532 | err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, | ||
533 | (struct dccp_so_feat __user *) | ||
534 | optval); | ||
535 | break; | ||
536 | case DCCP_SOCKOPT_CHANGE_R: | ||
537 | if (optlen != sizeof(struct dccp_so_feat)) | ||
538 | err = -EINVAL; | ||
539 | else | ||
540 | err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, | ||
541 | (struct dccp_so_feat __user *) | ||
542 | optval); | ||
543 | break; | ||
533 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: | 544 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: |
534 | if (dp->dccps_role != DCCP_ROLE_SERVER) | 545 | if (dp->dccps_role != DCCP_ROLE_SERVER) |
535 | err = -EOPNOTSUPP; | 546 | err = -EOPNOTSUPP; |
536 | else | 547 | else |
537 | dp->dccps_server_timewait = (val != 0); | 548 | dp->dccps_server_timewait = (val != 0); |
538 | break; | 549 | break; |
539 | case DCCP_SOCKOPT_SEND_CSCOV: | 550 | case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ |
540 | err = dccp_setsockopt_cscov(sk, val, false); | 551 | if (val < 0 || val > 15) |
541 | break; | ||
542 | case DCCP_SOCKOPT_RECV_CSCOV: | ||
543 | err = dccp_setsockopt_cscov(sk, val, true); | ||
544 | break; | ||
545 | case DCCP_SOCKOPT_QPOLICY_ID: | ||
546 | if (sk->sk_state != DCCP_CLOSED) | ||
547 | err = -EISCONN; | ||
548 | else if (val < 0 || val >= DCCPQ_POLICY_MAX) | ||
549 | err = -EINVAL; | 552 | err = -EINVAL; |
550 | else | 553 | else |
551 | dp->dccps_qpolicy = val; | 554 | dp->dccps_pcslen = val; |
552 | break; | 555 | break; |
553 | case DCCP_SOCKOPT_QPOLICY_TXQLEN: | 556 | case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ |
554 | if (val < 0) | 557 | if (val < 0 || val > 15) |
555 | err = -EINVAL; | 558 | err = -EINVAL; |
556 | else | 559 | else { |
557 | dp->dccps_tx_qlen = val; | 560 | dp->dccps_pcrlen = val; |
561 | /* FIXME: add feature negotiation, | ||
562 | * ChangeL(MinimumChecksumCoverage, val) */ | ||
563 | } | ||
558 | break; | 564 | break; |
559 | default: | 565 | default: |
560 | err = -ENOPROTOOPT; | 566 | err = -ENOPROTOOPT; |
561 | break; | 567 | break; |
562 | } | 568 | } |
563 | release_sock(sk); | ||
564 | 569 | ||
570 | release_sock(sk); | ||
565 | return err; | 571 | return err; |
566 | } | 572 | } |
567 | 573 | ||
@@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
642 | case DCCP_SOCKOPT_GET_CUR_MPS: | 648 | case DCCP_SOCKOPT_GET_CUR_MPS: |
643 | val = dp->dccps_mss_cache; | 649 | val = dp->dccps_mss_cache; |
644 | break; | 650 | break; |
645 | case DCCP_SOCKOPT_AVAILABLE_CCIDS: | ||
646 | return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); | ||
647 | case DCCP_SOCKOPT_TX_CCID: | ||
648 | val = ccid_get_current_tx_ccid(dp); | ||
649 | if (val < 0) | ||
650 | return -ENOPROTOOPT; | ||
651 | break; | ||
652 | case DCCP_SOCKOPT_RX_CCID: | ||
653 | val = ccid_get_current_rx_ccid(dp); | ||
654 | if (val < 0) | ||
655 | return -ENOPROTOOPT; | ||
656 | break; | ||
657 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: | 651 | case DCCP_SOCKOPT_SERVER_TIMEWAIT: |
658 | val = dp->dccps_server_timewait; | 652 | val = dp->dccps_server_timewait; |
659 | break; | 653 | break; |
@@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
663 | case DCCP_SOCKOPT_RECV_CSCOV: | 657 | case DCCP_SOCKOPT_RECV_CSCOV: |
664 | val = dp->dccps_pcrlen; | 658 | val = dp->dccps_pcrlen; |
665 | break; | 659 | break; |
666 | case DCCP_SOCKOPT_QPOLICY_ID: | ||
667 | val = dp->dccps_qpolicy; | ||
668 | break; | ||
669 | case DCCP_SOCKOPT_QPOLICY_TXQLEN: | ||
670 | val = dp->dccps_tx_qlen; | ||
671 | break; | ||
672 | case 128 ... 191: | 660 | case 128 ... 191: |
673 | return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, | 661 | return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, |
674 | len, (u32 __user *)optval, optlen); | 662 | len, (u32 __user *)optval, optlen); |
@@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, | |||
711 | EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); | 699 | EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); |
712 | #endif | 700 | #endif |
713 | 701 | ||
714 | static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) | ||
715 | { | ||
716 | struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); | ||
717 | |||
718 | /* | ||
719 | * Assign an (opaque) qpolicy priority value to skb->priority. | ||
720 | * | ||
721 | * We are overloading this skb field for use with the qpolicy subystem. | ||
722 | * The skb->priority is normally used for the SO_PRIORITY option, which | ||
723 | * is initialised from sk_priority. Since the assignment of sk_priority | ||
724 | * to skb->priority happens later (on layer 3), we overload this field | ||
725 | * for use with queueing priorities as long as the skb is on layer 4. | ||
726 | * The default priority value (if nothing is set) is 0. | ||
727 | */ | ||
728 | skb->priority = 0; | ||
729 | |||
730 | for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { | ||
731 | |||
732 | if (!CMSG_OK(msg, cmsg)) | ||
733 | return -EINVAL; | ||
734 | |||
735 | if (cmsg->cmsg_level != SOL_DCCP) | ||
736 | continue; | ||
737 | |||
738 | if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && | ||
739 | !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) | ||
740 | return -EINVAL; | ||
741 | |||
742 | switch (cmsg->cmsg_type) { | ||
743 | case DCCP_SCM_PRIORITY: | ||
744 | if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) | ||
745 | return -EINVAL; | ||
746 | skb->priority = *(__u32 *)CMSG_DATA(cmsg); | ||
747 | break; | ||
748 | default: | ||
749 | return -EINVAL; | ||
750 | } | ||
751 | } | ||
752 | return 0; | ||
753 | } | ||
754 | |||
755 | int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 702 | int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
756 | size_t len) | 703 | size_t len) |
757 | { | 704 | { |
@@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
767 | 714 | ||
768 | lock_sock(sk); | 715 | lock_sock(sk); |
769 | 716 | ||
770 | if (dccp_qpolicy_full(sk)) { | 717 | if (sysctl_dccp_tx_qlen && |
718 | (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { | ||
771 | rc = -EAGAIN; | 719 | rc = -EAGAIN; |
772 | goto out_release; | 720 | goto out_release; |
773 | } | 721 | } |
@@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
795 | if (rc != 0) | 743 | if (rc != 0) |
796 | goto out_discard; | 744 | goto out_discard; |
797 | 745 | ||
798 | rc = dccp_msghdr_parse(msg, skb); | 746 | skb_queue_tail(&sk->sk_write_queue, skb); |
799 | if (rc != 0) | 747 | dccp_write_xmit(sk,0); |
800 | goto out_discard; | ||
801 | |||
802 | dccp_qpolicy_push(sk, skb); | ||
803 | dccp_write_xmit(sk); | ||
804 | out_release: | 748 | out_release: |
805 | release_sock(sk); | 749 | release_sock(sk); |
806 | return rc ? : len; | 750 | return rc ? : len; |
@@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout) | |||
1023 | /* Check zero linger _after_ checking for unread data. */ | 967 | /* Check zero linger _after_ checking for unread data. */ |
1024 | sk->sk_prot->disconnect(sk, 0); | 968 | sk->sk_prot->disconnect(sk, 0); |
1025 | } else if (sk->sk_state != DCCP_CLOSED) { | 969 | } else if (sk->sk_state != DCCP_CLOSED) { |
1026 | /* | ||
1027 | * Normal connection termination. May need to wait if there are | ||
1028 | * still packets in the TX queue that are delayed by the CCID. | ||
1029 | */ | ||
1030 | dccp_flush_write_queue(sk, &timeout); | ||
1031 | dccp_terminate_connection(sk); | 970 | dccp_terminate_connection(sk); |
1032 | } | 971 | } |
1033 | 972 | ||
1034 | /* | ||
1035 | * Flush write queue. This may be necessary in several cases: | ||
1036 | * - we have been closed by the peer but still have application data; | ||
1037 | * - abortive termination (unread data or zero linger time), | ||
1038 | * - normal termination but queue could not be flushed within time limit | ||
1039 | */ | ||
1040 | __skb_queue_purge(&sk->sk_write_queue); | ||
1041 | |||
1042 | sk_stream_wait_close(sk, timeout); | 973 | sk_stream_wait_close(sk, timeout); |
1043 | 974 | ||
1044 | adjudge_to_death: | 975 | adjudge_to_death: |
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 27383f88c75f..000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null | |||
@@ -1,137 +0,0 @@ | |||
1 | /* | ||
2 | * net/dccp/qpolicy.c | ||
3 | * | ||
4 | * Policy-based packet dequeueing interface for DCCP. | ||
5 | * | ||
6 | * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License v2 | ||
10 | * as published by the Free Software Foundation. | ||
11 | */ | ||
12 | #include "dccp.h" | ||
13 | |||
14 | /* | ||
15 | * Simple Dequeueing Policy: | ||
16 | * If tx_qlen is different from 0, enqueue up to tx_qlen elements. | ||
17 | */ | ||
18 | static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) | ||
19 | { | ||
20 | skb_queue_tail(&sk->sk_write_queue, skb); | ||
21 | } | ||
22 | |||
23 | static bool qpolicy_simple_full(struct sock *sk) | ||
24 | { | ||
25 | return dccp_sk(sk)->dccps_tx_qlen && | ||
26 | sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; | ||
27 | } | ||
28 | |||
29 | static struct sk_buff *qpolicy_simple_top(struct sock *sk) | ||
30 | { | ||
31 | return skb_peek(&sk->sk_write_queue); | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * Priority-based Dequeueing Policy: | ||
36 | * If tx_qlen is different from 0 and the queue has reached its upper bound | ||
37 | * of tx_qlen elements, replace older packets lowest-priority-first. | ||
38 | */ | ||
39 | static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) | ||
40 | { | ||
41 | struct sk_buff *skb, *best = NULL; | ||
42 | |||
43 | skb_queue_walk(&sk->sk_write_queue, skb) | ||
44 | if (best == NULL || skb->priority > best->priority) | ||
45 | best = skb; | ||
46 | return best; | ||
47 | } | ||
48 | |||
49 | static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) | ||
50 | { | ||
51 | struct sk_buff *skb, *worst = NULL; | ||
52 | |||
53 | skb_queue_walk(&sk->sk_write_queue, skb) | ||
54 | if (worst == NULL || skb->priority < worst->priority) | ||
55 | worst = skb; | ||
56 | return worst; | ||
57 | } | ||
58 | |||
59 | static bool qpolicy_prio_full(struct sock *sk) | ||
60 | { | ||
61 | if (qpolicy_simple_full(sk)) | ||
62 | dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); | ||
63 | return false; | ||
64 | } | ||
65 | |||
66 | /** | ||
67 | * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface | ||
68 | * @push: add a new @skb to the write queue | ||
69 | * @full: indicates that no more packets will be admitted | ||
70 | * @top: peeks at whatever the queueing policy defines as its `top' | ||
71 | */ | ||
72 | static struct dccp_qpolicy_operations { | ||
73 | void (*push) (struct sock *sk, struct sk_buff *skb); | ||
74 | bool (*full) (struct sock *sk); | ||
75 | struct sk_buff* (*top) (struct sock *sk); | ||
76 | __be32 params; | ||
77 | |||
78 | } qpol_table[DCCPQ_POLICY_MAX] = { | ||
79 | [DCCPQ_POLICY_SIMPLE] = { | ||
80 | .push = qpolicy_simple_push, | ||
81 | .full = qpolicy_simple_full, | ||
82 | .top = qpolicy_simple_top, | ||
83 | .params = 0, | ||
84 | }, | ||
85 | [DCCPQ_POLICY_PRIO] = { | ||
86 | .push = qpolicy_simple_push, | ||
87 | .full = qpolicy_prio_full, | ||
88 | .top = qpolicy_prio_best_skb, | ||
89 | .params = DCCP_SCM_PRIORITY, | ||
90 | }, | ||
91 | }; | ||
92 | |||
93 | /* | ||
94 | * Externally visible interface | ||
95 | */ | ||
96 | void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) | ||
97 | { | ||
98 | qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); | ||
99 | } | ||
100 | |||
101 | bool dccp_qpolicy_full(struct sock *sk) | ||
102 | { | ||
103 | return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); | ||
104 | } | ||
105 | |||
106 | void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) | ||
107 | { | ||
108 | if (skb != NULL) { | ||
109 | skb_unlink(skb, &sk->sk_write_queue); | ||
110 | kfree_skb(skb); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | struct sk_buff *dccp_qpolicy_top(struct sock *sk) | ||
115 | { | ||
116 | return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); | ||
117 | } | ||
118 | |||
119 | struct sk_buff *dccp_qpolicy_pop(struct sock *sk) | ||
120 | { | ||
121 | struct sk_buff *skb = dccp_qpolicy_top(sk); | ||
122 | |||
123 | /* Clear any skb fields that we used internally */ | ||
124 | skb->priority = 0; | ||
125 | |||
126 | if (skb) | ||
127 | skb_unlink(skb, &sk->sk_write_queue); | ||
128 | return skb; | ||
129 | } | ||
130 | |||
131 | bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) | ||
132 | { | ||
133 | /* check if exactly one bit is set */ | ||
134 | if (!param || (param & (param - 1))) | ||
135 | return false; | ||
136 | return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; | ||
137 | } | ||
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index a5a1856234e7..21295993fdb8 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c | |||
@@ -18,72 +18,76 @@ | |||
18 | #error This file should not be compiled without CONFIG_SYSCTL defined | 18 | #error This file should not be compiled without CONFIG_SYSCTL defined |
19 | #endif | 19 | #endif |
20 | 20 | ||
21 | /* Boundary values */ | ||
22 | static int zero = 0, | ||
23 | u8_max = 0xFF; | ||
24 | static unsigned long seqw_min = 32; | ||
25 | |||
26 | static struct ctl_table dccp_default_table[] = { | 21 | static struct ctl_table dccp_default_table[] = { |
27 | { | 22 | { |
28 | .procname = "seq_window", | 23 | .procname = "seq_window", |
29 | .data = &sysctl_dccp_sequence_window, | 24 | .data = &sysctl_dccp_feat_sequence_window, |
30 | .maxlen = sizeof(sysctl_dccp_sequence_window), | 25 | .maxlen = sizeof(sysctl_dccp_feat_sequence_window), |
31 | .mode = 0644, | 26 | .mode = 0644, |
32 | .proc_handler = proc_doulongvec_minmax, | 27 | .proc_handler = proc_dointvec, |
33 | .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ | ||
34 | }, | 28 | }, |
35 | { | 29 | { |
36 | .procname = "rx_ccid", | 30 | .procname = "rx_ccid", |
37 | .data = &sysctl_dccp_rx_ccid, | 31 | .data = &sysctl_dccp_feat_rx_ccid, |
38 | .maxlen = sizeof(sysctl_dccp_rx_ccid), | 32 | .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), |
39 | .mode = 0644, | 33 | .mode = 0644, |
40 | .proc_handler = proc_dointvec_minmax, | 34 | .proc_handler = proc_dointvec, |
41 | .extra1 = &zero, | ||
42 | .extra2 = &u8_max, /* RFC 4340, 10. */ | ||
43 | }, | 35 | }, |
44 | { | 36 | { |
45 | .procname = "tx_ccid", | 37 | .procname = "tx_ccid", |
46 | .data = &sysctl_dccp_tx_ccid, | 38 | .data = &sysctl_dccp_feat_tx_ccid, |
47 | .maxlen = sizeof(sysctl_dccp_tx_ccid), | 39 | .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), |
40 | .mode = 0644, | ||
41 | .proc_handler = proc_dointvec, | ||
42 | }, | ||
43 | { | ||
44 | .procname = "ack_ratio", | ||
45 | .data = &sysctl_dccp_feat_ack_ratio, | ||
46 | .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), | ||
47 | .mode = 0644, | ||
48 | .proc_handler = proc_dointvec, | ||
49 | }, | ||
50 | { | ||
51 | .procname = "send_ackvec", | ||
52 | .data = &sysctl_dccp_feat_send_ack_vector, | ||
53 | .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), | ||
54 | .mode = 0644, | ||
55 | .proc_handler = proc_dointvec, | ||
56 | }, | ||
57 | { | ||
58 | .procname = "send_ndp", | ||
59 | .data = &sysctl_dccp_feat_send_ndp_count, | ||
60 | .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), | ||
48 | .mode = 0644, | 61 | .mode = 0644, |
49 | .proc_handler = proc_dointvec_minmax, | 62 | .proc_handler = proc_dointvec, |
50 | .extra1 = &zero, | ||
51 | .extra2 = &u8_max, /* RFC 4340, 10. */ | ||
52 | }, | 63 | }, |
53 | { | 64 | { |
54 | .procname = "request_retries", | 65 | .procname = "request_retries", |
55 | .data = &sysctl_dccp_request_retries, | 66 | .data = &sysctl_dccp_request_retries, |
56 | .maxlen = sizeof(sysctl_dccp_request_retries), | 67 | .maxlen = sizeof(sysctl_dccp_request_retries), |
57 | .mode = 0644, | 68 | .mode = 0644, |
58 | .proc_handler = proc_dointvec_minmax, | 69 | .proc_handler = proc_dointvec, |
59 | .extra1 = &zero, | ||
60 | .extra2 = &u8_max, | ||
61 | }, | 70 | }, |
62 | { | 71 | { |
63 | .procname = "retries1", | 72 | .procname = "retries1", |
64 | .data = &sysctl_dccp_retries1, | 73 | .data = &sysctl_dccp_retries1, |
65 | .maxlen = sizeof(sysctl_dccp_retries1), | 74 | .maxlen = sizeof(sysctl_dccp_retries1), |
66 | .mode = 0644, | 75 | .mode = 0644, |
67 | .proc_handler = proc_dointvec_minmax, | 76 | .proc_handler = proc_dointvec, |
68 | .extra1 = &zero, | ||
69 | .extra2 = &u8_max, | ||
70 | }, | 77 | }, |
71 | { | 78 | { |
72 | .procname = "retries2", | 79 | .procname = "retries2", |
73 | .data = &sysctl_dccp_retries2, | 80 | .data = &sysctl_dccp_retries2, |
74 | .maxlen = sizeof(sysctl_dccp_retries2), | 81 | .maxlen = sizeof(sysctl_dccp_retries2), |
75 | .mode = 0644, | 82 | .mode = 0644, |
76 | .proc_handler = proc_dointvec_minmax, | 83 | .proc_handler = proc_dointvec, |
77 | .extra1 = &zero, | ||
78 | .extra2 = &u8_max, | ||
79 | }, | 84 | }, |
80 | { | 85 | { |
81 | .procname = "tx_qlen", | 86 | .procname = "tx_qlen", |
82 | .data = &sysctl_dccp_tx_qlen, | 87 | .data = &sysctl_dccp_tx_qlen, |
83 | .maxlen = sizeof(sysctl_dccp_tx_qlen), | 88 | .maxlen = sizeof(sysctl_dccp_tx_qlen), |
84 | .mode = 0644, | 89 | .mode = 0644, |
85 | .proc_handler = proc_dointvec_minmax, | 90 | .proc_handler = proc_dointvec, |
86 | .extra1 = &zero, | ||
87 | }, | 91 | }, |
88 | { | 92 | { |
89 | .procname = "sync_ratelimit", | 93 | .procname = "sync_ratelimit", |
diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16359e29e7f5..54b3c7e9e016 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c | |||
@@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk) | |||
87 | { | 87 | { |
88 | struct inet_connection_sock *icsk = inet_csk(sk); | 88 | struct inet_connection_sock *icsk = inet_csk(sk); |
89 | 89 | ||
90 | /* retransmit timer is used for feature negotiation throughout | ||
91 | * connection. In this case, no packet is re-transmitted, but rather an | ||
92 | * ack is generated and pending changes are placed into its options. | ||
93 | */ | ||
94 | if (sk->sk_send_head == NULL) { | ||
95 | dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); | ||
96 | if (sk->sk_state == DCCP_OPEN) | ||
97 | dccp_send_ack(sk); | ||
98 | goto backoff; | ||
99 | } | ||
100 | |||
90 | /* | 101 | /* |
91 | * More than 4MSL (8 minutes) has passed, a RESET(aborted) was | 102 | * More than 4MSL (8 minutes) has passed, a RESET(aborted) was |
92 | * sent, no need to retransmit, this sock is dead. | 103 | * sent, no need to retransmit, this sock is dead. |
@@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk) | |||
115 | return; | 126 | return; |
116 | } | 127 | } |
117 | 128 | ||
129 | backoff: | ||
118 | icsk->icsk_backoff++; | 130 | icsk->icsk_backoff++; |
119 | 131 | ||
120 | icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); | 132 | icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); |
@@ -237,35 +249,32 @@ out: | |||
237 | sock_put(sk); | 249 | sock_put(sk); |
238 | } | 250 | } |
239 | 251 | ||
240 | /** | 252 | /* Transmit-delay timer: used by the CCIDs to delay actual send time */ |
241 | * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface | 253 | static void dccp_write_xmit_timer(unsigned long data) |
242 | * See the comments above %ccid_dequeueing_decision for supported modes. | ||
243 | */ | ||
244 | static void dccp_write_xmitlet(unsigned long data) | ||
245 | { | 254 | { |
246 | struct sock *sk = (struct sock *)data; | 255 | struct sock *sk = (struct sock *)data; |
256 | struct dccp_sock *dp = dccp_sk(sk); | ||
247 | 257 | ||
248 | bh_lock_sock(sk); | 258 | bh_lock_sock(sk); |
249 | if (sock_owned_by_user(sk)) | 259 | if (sock_owned_by_user(sk)) |
250 | sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); | 260 | sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); |
251 | else | 261 | else |
252 | dccp_write_xmit(sk); | 262 | dccp_write_xmit(sk, 0); |
253 | bh_unlock_sock(sk); | 263 | bh_unlock_sock(sk); |
264 | sock_put(sk); | ||
254 | } | 265 | } |
255 | 266 | ||
256 | static void dccp_write_xmit_timer(unsigned long data) | 267 | static void dccp_init_write_xmit_timer(struct sock *sk) |
257 | { | 268 | { |
258 | dccp_write_xmitlet(data); | 269 | struct dccp_sock *dp = dccp_sk(sk); |
259 | sock_put((struct sock *)data); | 270 | |
271 | setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, | ||
272 | (unsigned long)sk); | ||
260 | } | 273 | } |
261 | 274 | ||
262 | void dccp_init_xmit_timers(struct sock *sk) | 275 | void dccp_init_xmit_timers(struct sock *sk) |
263 | { | 276 | { |
264 | struct dccp_sock *dp = dccp_sk(sk); | 277 | dccp_init_write_xmit_timer(sk); |
265 | |||
266 | tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); | ||
267 | setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, | ||
268 | (unsigned long)sk); | ||
269 | inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, | 278 | inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, |
270 | &dccp_keepalive_timer); | 279 | &dccp_keepalive_timer); |
271 | } | 280 | } |
@@ -281,7 +290,8 @@ u32 dccp_timestamp(void) | |||
281 | { | 290 | { |
282 | s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); | 291 | s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); |
283 | 292 | ||
284 | return div_u64(delta, DCCP_TIME_RESOLUTION); | 293 | do_div(delta, 10); |
294 | return delta; | ||
285 | } | 295 | } |
286 | EXPORT_SYMBOL_GPL(dccp_timestamp); | 296 | EXPORT_SYMBOL_GPL(dccp_timestamp); |
287 | 297 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9da9f19ece8a..f79a51607292 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk) | |||
811 | } | 811 | } |
812 | } | 812 | } |
813 | 813 | ||
814 | /* Numbers are taken from RFC3390. | ||
815 | * | ||
816 | * John Heffner states: | ||
817 | * | ||
818 | * The RFC specifies a window of no more than 4380 bytes | ||
819 | * unless 2*MSS > 4380. Reading the pseudocode in the RFC | ||
820 | * is a bit misleading because they use a clamp at 4380 bytes | ||
821 | * rather than use a multiplier in the relevant range. | ||
822 | */ | ||
814 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | 823 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) |
815 | { | 824 | { |
816 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 825 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
817 | 826 | ||
818 | if (!cwnd) | 827 | if (!cwnd) { |
819 | cwnd = rfc3390_bytes_to_packets(tp->mss_cache); | 828 | if (tp->mss_cache > 1460) |
829 | cwnd = 2; | ||
830 | else | ||
831 | cwnd = (tp->mss_cache > 1095) ? 3 : 4; | ||
832 | } | ||
820 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 833 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
821 | } | 834 | } |
822 | 835 | ||