diff options
author | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-04 11:41:17 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-04 11:41:17 -0500 |
commit | 12472b4165a21f5415ff4f60f8f46311a6b7b9ac (patch) | |
tree | 8e54c608abb016a4e02a359bc508f575e2f1d7fa /net | |
parent | a79f43a54b7df7b12bc1c0fbf4ba76520ba40354 (diff) | |
parent | ef9467f8f0803881d6b20ad6f0f770fc39bcc2c2 (diff) |
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
* master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6:
[SUNHME]: Fix for sunhme failures on x86
[XFRM] xfrm_user: Better validation of user templates.
[DCCP] tfrc: Binary search for reverse TFRC lookup
[DCCP] ccid3: Deprecate TFRC_SMALLEST_P
[DCCP] tfrc: Identify TFRC table limits and simplify code
[DCCP] tfrc: Add protection against invalid parameters to TFRC routines
[DCCP] tfrc: Fix small error in reverse lookup of p for given f(p)
[DCCP] tfrc: Document boundaries and limits of the TFRC lookup table
[DCCP] ccid3: Fix warning message about illegal ACK
[DCCP] ccid3: Fix bug in calculation of send rate
[DCCP]: Fix BUG in retransmission delay calculation
[DCCP]: Use higher RTO default for CCID3
Diffstat (limited to 'net')
-rw-r--r-- | net/dccp/ccids/Kconfig | 33 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.c | 119 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.h | 2 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc_equation.c | 222 | ||||
-rw-r--r-- | net/xfrm/xfrm_user.c | 50 |
5 files changed, 279 insertions, 147 deletions
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index dac89166eb18..80f469887691 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig | |||
@@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG | |||
89 | parameter to 0 or 1. | 89 | parameter to 0 or 1. |
90 | 90 | ||
91 | If in doubt, say N. | 91 | If in doubt, say N. |
92 | |||
93 | config IP_DCCP_CCID3_RTO | ||
94 | int "Use higher bound for nofeedback timer" | ||
95 | default 100 | ||
96 | depends on IP_DCCP_CCID3 && EXPERIMENTAL | ||
97 | ---help--- | ||
98 | Use higher lower bound for nofeedback timer expiration. | ||
99 | |||
100 | The TFRC nofeedback timer normally expires after the maximum of 4 | ||
101 | RTTs and twice the current send interval (RFC 3448, 4.3). On LANs | ||
102 | with a small RTT this can mean a high processing load and reduced | ||
103 | performance, since then the nofeedback timer is triggered very | ||
104 | frequently. | ||
105 | |||
106 | This option enables to set a higher lower bound for the nofeedback | ||
107 | value. Values in units of milliseconds can be set here. | ||
108 | |||
109 | A value of 0 disables this feature by enforcing the value specified | ||
110 | in RFC 3448. The following values have been suggested as bounds for | ||
111 | experimental use: | ||
112 | * 16-20ms to match the typical multimedia inter-frame interval | ||
113 | * 100ms as a reasonable compromise [default] | ||
114 | * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) | ||
115 | |||
116 | The default of 100ms is a compromise between a large value for | ||
117 | efficient DCCP implementations, and a small value to avoid disrupting | ||
118 | the network in times of congestion. | ||
119 | |||
120 | The purpose of the nofeedback timer is to slow DCCP down when there | ||
121 | is serious network congestion: experimenting with larger values should | ||
122 | therefore not be performed on WANs. | ||
123 | |||
124 | |||
92 | endmenu | 125 | endmenu |
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 70ebe705eb75..cf8c07b2704f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c | |||
@@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) | |||
121 | /* | 121 | /* |
122 | * Update X by | 122 | * Update X by |
123 | * If (p > 0) | 123 | * If (p > 0) |
124 | * x_calc = calcX(s, R, p); | 124 | * X_calc = calcX(s, R, p); |
125 | * X = max(min(X_calc, 2 * X_recv), s / t_mbi); | 125 | * X = max(min(X_calc, 2 * X_recv), s / t_mbi); |
126 | * Else | 126 | * Else |
127 | * If (now - tld >= R) | 127 | * If (now - tld >= R) |
128 | * X = max(min(2 * X, 2 * X_recv), s / R); | 128 | * X = max(min(2 * X, 2 * X_recv), s / R); |
129 | * tld = now; | 129 | * tld = now; |
130 | * | ||
131 | * If X has changed, we also update the scheduled send time t_now, | ||
132 | * the inter-packet interval t_ipi, and the delta value. | ||
130 | */ | 133 | */ |
131 | static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) | 134 | static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) |
132 | 135 | ||
@@ -134,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) | |||
134 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); | 137 | struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); |
135 | const __u32 old_x = hctx->ccid3hctx_x; | 138 | const __u32 old_x = hctx->ccid3hctx_x; |
136 | 139 | ||
137 | /* To avoid large error in calcX */ | 140 | if (hctx->ccid3hctx_p > 0) { |
138 | if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { | ||
139 | hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, | 141 | hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, |
140 | hctx->ccid3hctx_rtt, | 142 | hctx->ccid3hctx_rtt, |
141 | hctx->ccid3hctx_p); | 143 | hctx->ccid3hctx_p); |
@@ -223,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
223 | ccid3_tx_state_name(hctx->ccid3hctx_state)); | 225 | ccid3_tx_state_name(hctx->ccid3hctx_state)); |
224 | /* Halve sending rate */ | 226 | /* Halve sending rate */ |
225 | 227 | ||
226 | /* If (X_calc > 2 * X_recv) | 228 | /* If (p == 0 || X_calc > 2 * X_recv) |
227 | * X_recv = max(X_recv / 2, s / (2 * t_mbi)); | 229 | * X_recv = max(X_recv / 2, s / (2 * t_mbi)); |
228 | * Else | 230 | * Else |
229 | * X_recv = X_calc / 4; | 231 | * X_recv = X_calc / 4; |
230 | */ | 232 | */ |
231 | BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && | 233 | BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); |
232 | hctx->ccid3hctx_x_calc == 0); | ||
233 | 234 | ||
234 | /* check also if p is zero -> x_calc is infinity? */ | 235 | if (hctx->ccid3hctx_p == 0 || |
235 | if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || | ||
236 | hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) | 236 | hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) |
237 | hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, | 237 | hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, |
238 | hctx->ccid3hctx_s / (2 * TFRC_T_MBI)); | 238 | hctx->ccid3hctx_s / (2 * TFRC_T_MBI)); |
@@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) | |||
245 | } | 245 | } |
246 | /* | 246 | /* |
247 | * Schedule no feedback timer to expire in | 247 | * Schedule no feedback timer to expire in |
248 | * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) | 248 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) |
249 | * See comments in packet_recv() regarding the value of t_RTO. | ||
249 | */ | 250 | */ |
250 | t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); | 251 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); |
251 | break; | 252 | break; |
252 | case TFRC_SSTATE_NO_SENT: | 253 | case TFRC_SSTATE_NO_SENT: |
253 | DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk); | 254 | DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk); |
@@ -338,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) | |||
338 | * else | 339 | * else |
339 | * // send the packet in (t_nom - t_now) milliseconds. | 340 | * // send the packet in (t_nom - t_now) milliseconds. |
340 | */ | 341 | */ |
341 | if (delay >= hctx->ccid3hctx_delta) | 342 | if (delay - (long)hctx->ccid3hctx_delta >= 0) |
342 | return delay / 1000L; | 343 | return delay / 1000L; |
343 | break; | 344 | break; |
344 | case TFRC_SSTATE_TERM: | 345 | case TFRC_SSTATE_TERM: |
@@ -412,10 +413,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
412 | struct dccp_tx_hist_entry *packet; | 413 | struct dccp_tx_hist_entry *packet; |
413 | struct timeval now; | 414 | struct timeval now; |
414 | unsigned long t_nfb; | 415 | unsigned long t_nfb; |
415 | u32 t_elapsed; | ||
416 | u32 pinv; | 416 | u32 pinv; |
417 | u32 x_recv; | 417 | long r_sample, t_elapsed; |
418 | u32 r_sample; | ||
419 | 418 | ||
420 | BUG_ON(hctx == NULL); | 419 | BUG_ON(hctx == NULL); |
421 | 420 | ||
@@ -426,31 +425,44 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
426 | 425 | ||
427 | opt_recv = &hctx->ccid3hctx_options_received; | 426 | opt_recv = &hctx->ccid3hctx_options_received; |
428 | 427 | ||
429 | t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; | ||
430 | x_recv = opt_recv->ccid3or_receive_rate; | ||
431 | pinv = opt_recv->ccid3or_loss_event_rate; | ||
432 | |||
433 | switch (hctx->ccid3hctx_state) { | 428 | switch (hctx->ccid3hctx_state) { |
434 | case TFRC_SSTATE_NO_FBACK: | 429 | case TFRC_SSTATE_NO_FBACK: |
435 | case TFRC_SSTATE_FBACK: | 430 | case TFRC_SSTATE_FBACK: |
436 | /* Calculate new round trip sample by | 431 | /* get packet from history to look up t_recvdata */ |
437 | * R_sample = (now - t_recvdata) - t_delay */ | ||
438 | /* get t_recvdata from history */ | ||
439 | packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, | 432 | packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, |
440 | DCCP_SKB_CB(skb)->dccpd_ack_seq); | 433 | DCCP_SKB_CB(skb)->dccpd_ack_seq); |
441 | if (unlikely(packet == NULL)) { | 434 | if (unlikely(packet == NULL)) { |
442 | DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist " | 435 | DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist " |
443 | "in history!\n", dccp_role(sk), sk, | 436 | "in history!\n", dccp_role(sk), sk, |
444 | (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, | 437 | (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, |
445 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); | 438 | dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); |
446 | return; | 439 | return; |
447 | } | 440 | } |
448 | 441 | ||
449 | /* Update RTT */ | 442 | /* Update receive rate */ |
443 | hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; | ||
444 | |||
445 | /* Update loss event rate */ | ||
446 | pinv = opt_recv->ccid3or_loss_event_rate; | ||
447 | if (pinv == ~0U || pinv == 0) | ||
448 | hctx->ccid3hctx_p = 0; | ||
449 | else | ||
450 | hctx->ccid3hctx_p = 1000000 / pinv; | ||
451 | |||
450 | dccp_timestamp(sk, &now); | 452 | dccp_timestamp(sk, &now); |
451 | r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); | 453 | |
452 | if (unlikely(r_sample <= t_elapsed)) | 454 | /* |
453 | DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n", | 455 | * Calculate new round trip sample as per [RFC 3448, 4.3] by |
456 | * R_sample = (now - t_recvdata) - t_elapsed | ||
457 | */ | ||
458 | r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); | ||
459 | t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; | ||
460 | |||
461 | if (unlikely(r_sample <= 0)) { | ||
462 | DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample); | ||
463 | r_sample = 0; | ||
464 | } else if (unlikely(r_sample <= t_elapsed)) | ||
465 | DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n", | ||
454 | r_sample, t_elapsed); | 466 | r_sample, t_elapsed); |
455 | else | 467 | else |
456 | r_sample -= t_elapsed; | 468 | r_sample -= t_elapsed; |
@@ -473,31 +485,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
473 | hctx->ccid3hctx_t_ld = now; | 485 | hctx->ccid3hctx_t_ld = now; |
474 | 486 | ||
475 | ccid3_update_send_time(hctx); | 487 | ccid3_update_send_time(hctx); |
476 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); | ||
477 | } else { | ||
478 | hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + | ||
479 | r_sample / 10; | ||
480 | ccid3_hc_tx_update_x(sk, &now); | ||
481 | } | ||
482 | 488 | ||
483 | ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " | 489 | ccid3_pr_debug("%s(%p), s=%u, w_init=%u, " |
484 | "r_sample=%us\n", dccp_role(sk), sk, | 490 | "R_sample=%ldus, X=%u\n", dccp_role(sk), |
485 | hctx->ccid3hctx_rtt, r_sample); | 491 | sk, hctx->ccid3hctx_s, w_init, r_sample, |
492 | hctx->ccid3hctx_x); | ||
486 | 493 | ||
487 | /* Update receive rate */ | 494 | ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); |
488 | hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ | 495 | } else { |
496 | hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt + | ||
497 | (u32)r_sample ) / 10; | ||
489 | 498 | ||
490 | /* Update loss event rate */ | 499 | ccid3_hc_tx_update_x(sk, &now); |
491 | if (pinv == ~0 || pinv == 0) | ||
492 | hctx->ccid3hctx_p = 0; | ||
493 | else { | ||
494 | hctx->ccid3hctx_p = 1000000 / pinv; | ||
495 | 500 | ||
496 | if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { | 501 | ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, " |
497 | hctx->ccid3hctx_p = TFRC_SMALLEST_P; | 502 | "p=%u, X_calc=%u, X=%u\n", dccp_role(sk), |
498 | ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", | 503 | sk, hctx->ccid3hctx_rtt, r_sample, |
499 | dccp_role(sk), sk); | 504 | hctx->ccid3hctx_s, hctx->ccid3hctx_p, |
500 | } | 505 | hctx->ccid3hctx_x_calc, |
506 | hctx->ccid3hctx_x); | ||
501 | } | 507 | } |
502 | 508 | ||
503 | /* unschedule no feedback timer */ | 509 | /* unschedule no feedback timer */ |
@@ -512,16 +518,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
512 | */ | 518 | */ |
513 | sk->sk_write_space(sk); | 519 | sk->sk_write_space(sk); |
514 | 520 | ||
515 | /* Update timeout interval. We use the alternative variant of | 521 | /* |
516 | * [RFC 3448, 3.1] which sets the upper bound of t_rto to one | 522 | * Update timeout interval for the nofeedback timer. |
517 | * second, as it is suggested for TCP (see RFC 2988, 2.4). */ | 523 | * We use a configuration option to increase the lower bound. |
524 | * This can help avoid triggering the nofeedback timer too often | ||
525 | * ('spinning') on LANs with small RTTs. | ||
526 | */ | ||
518 | hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, | 527 | hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, |
519 | USEC_PER_SEC ); | 528 | CONFIG_IP_DCCP_CCID3_RTO * |
529 | (USEC_PER_SEC/1000) ); | ||
520 | /* | 530 | /* |
521 | * Schedule no feedback timer to expire in | 531 | * Schedule no feedback timer to expire in |
522 | * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) | 532 | * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) |
523 | */ | 533 | */ |
524 | t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); | 534 | t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); |
525 | 535 | ||
526 | ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " | 536 | ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " |
527 | "expire in %lu jiffies (%luus)\n", | 537 | "expire in %lu jiffies (%luus)\n", |
@@ -535,7 +545,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) | |||
535 | hctx->ccid3hctx_idle = 1; | 545 | hctx->ccid3hctx_idle = 1; |
536 | break; | 546 | break; |
537 | case TFRC_SSTATE_NO_SENT: | 547 | case TFRC_SSTATE_NO_SENT: |
538 | DCCP_WARN("Illegal ACK received - no packet has been sent\n"); | 548 | if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT) |
549 | DCCP_WARN("Illegal ACK received - no packet sent\n"); | ||
539 | /* fall through */ | 550 | /* fall through */ |
540 | case TFRC_SSTATE_TERM: /* ignore feedback when closing */ | 551 | case TFRC_SSTATE_TERM: /* ignore feedback when closing */ |
541 | break; | 552 | break; |
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 27cb20ae1da8..07596d704ef9 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h | |||
@@ -51,8 +51,6 @@ | |||
51 | /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ | 51 | /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ |
52 | #define TFRC_T_MBI 64 | 52 | #define TFRC_T_MBI 64 |
53 | 53 | ||
54 | #define TFRC_SMALLEST_P 40 | ||
55 | |||
56 | enum ccid3_options { | 54 | enum ccid3_options { |
57 | TFRC_OPT_LOSS_EVENT_RATE = 192, | 55 | TFRC_OPT_LOSS_EVENT_RATE = 192, |
58 | TFRC_OPT_LOSS_INTERVALS = 193, | 56 | TFRC_OPT_LOSS_INTERVALS = 193, |
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 2601012383fb..ddac2c511e2f 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c | |||
@@ -18,10 +18,79 @@ | |||
18 | #include "tfrc.h" | 18 | #include "tfrc.h" |
19 | 19 | ||
20 | #define TFRC_CALC_X_ARRSIZE 500 | 20 | #define TFRC_CALC_X_ARRSIZE 500 |
21 | #define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ | ||
22 | #define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) | ||
21 | 23 | ||
22 | #define TFRC_CALC_X_SPLIT 50000 | 24 | /* |
23 | /* equivalent to 0.05 */ | 25 | TFRC TCP Reno Throughput Equation Lookup Table for f(p) |
24 | 26 | ||
27 | The following two-column lookup table implements a part of the TCP throughput | ||
28 | equation from [RFC 3448, sec. 3.1]: | ||
29 | |||
30 | s | ||
31 | X_calc = -------------------------------------------------------------- | ||
32 | R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) | ||
33 | |||
34 | Where: | ||
35 | X is the transmit rate in bytes/second | ||
36 | s is the packet size in bytes | ||
37 | R is the round trip time in seconds | ||
38 | p is the loss event rate, between 0 and 1.0, of the number of loss | ||
39 | events as a fraction of the number of packets transmitted | ||
40 | t_RTO is the TCP retransmission timeout value in seconds | ||
41 | b is the number of packets acknowledged by a single TCP ACK | ||
42 | |||
43 | We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: | ||
44 | |||
45 | s | ||
46 | X_calc = ------------------------------------------------------- | ||
47 | R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) | ||
48 | |||
49 | which we can break down into: | ||
50 | |||
51 | s | ||
52 | X_calc = --------- | ||
53 | R * f(p) | ||
54 | |||
55 | where f(p) is given for 0 < p <= 1 by: | ||
56 | |||
57 | f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) | ||
58 | |||
59 | Since this is kernel code, floating-point arithmetic is avoided in favour of | ||
60 | integer arithmetic. This means that nearly all fractional parameters are | ||
61 | scaled by 1000000: | ||
62 | * the parameters p and R | ||
63 | * the return result f(p) | ||
64 | The lookup table therefore actually tabulates the following function g(q): | ||
65 | |||
66 | g(q) = 1000000 * f(q/1000000) | ||
67 | |||
68 | Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer | ||
69 | granularity for the practically more relevant case of small values of p (up to | ||
70 | 5%), the second column is used; the first one ranges up to 100%. This split | ||
71 | corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also | ||
72 | determines the smallest resolution possible with this lookup table: | ||
73 | |||
74 | TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE | ||
75 | |||
76 | The entire table is generated by: | ||
77 | for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { | ||
78 | lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); | ||
79 | lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); | ||
80 | } | ||
81 | |||
82 | With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, | ||
83 | lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) | ||
84 | lookup[M][0] = g(1000000) = 1000000 * f(100%) | ||
85 | lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) | ||
86 | lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) | ||
87 | |||
88 | In summary, the two columns represent f(p) for the following ranges: | ||
89 | * The first column is for 0.002 <= p <= 1.0 | ||
90 | * The second column is for 0.0001 <= p <= 0.05 | ||
91 | Where the columns overlap, the second (finer-grained) is given preference, | ||
92 | i.e. the first column is used only for p >= 0.05. | ||
93 | */ | ||
25 | static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { | 94 | static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { |
26 | { 37172, 8172 }, | 95 | { 37172, 8172 }, |
27 | { 53499, 11567 }, | 96 | { 53499, 11567 }, |
@@ -525,85 +594,69 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { | |||
525 | { 243315981, 271305 } | 594 | { 243315981, 271305 } |
526 | }; | 595 | }; |
527 | 596 | ||
528 | /* Calculate the send rate as per section 3.1 of RFC3448 | 597 | /* return largest index i such that fval <= lookup[i][small] */ |
529 | 598 | static inline u32 tfrc_binsearch(u32 fval, u8 small) | |
530 | Returns send rate in bytes per second | 599 | { |
531 | 600 | u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; | |
532 | Integer maths and lookups are used as not allowed floating point in kernel | 601 | |
533 | 602 | while (low < high) { | |
534 | The function for Xcalc as per section 3.1 of RFC3448 is: | 603 | try = (low + high) / 2; |
535 | 604 | if (fval <= tfrc_calc_x_lookup[try][small]) | |
536 | X = s | 605 | high = try; |
537 | ------------------------------------------------------------- | 606 | else |
538 | R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) | 607 | low = try + 1; |
539 | 608 | } | |
540 | where | 609 | return high; |
541 | X is the trasmit rate in bytes/second | 610 | } |
542 | s is the packet size in bytes | ||
543 | R is the round trip time in seconds | ||
544 | p is the loss event rate, between 0 and 1.0, of the number of loss events | ||
545 | as a fraction of the number of packets transmitted | ||
546 | t_RTO is the TCP retransmission timeout value in seconds | ||
547 | b is the number of packets acknowledged by a single TCP acknowledgement | ||
548 | |||
549 | we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: | ||
550 | |||
551 | X = s | ||
552 | ----------------------------------------------------------------------- | ||
553 | R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) | ||
554 | |||
555 | |||
556 | which we can break down into: | ||
557 | |||
558 | X = s | ||
559 | -------- | ||
560 | R * f(p) | ||
561 | |||
562 | where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) | ||
563 | |||
564 | Function parameters: | ||
565 | s - bytes | ||
566 | R - RTT in usecs | ||
567 | p - loss rate (decimal fraction multiplied by 1,000,000) | ||
568 | |||
569 | Returns Xcalc in bytes per second | ||
570 | |||
571 | DON'T alter this code unless you run test cases against it as the code | ||
572 | has been manipulated to stop underflow/overlow. | ||
573 | 611 | ||
574 | */ | 612 | /** |
613 | * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 | ||
614 | * | ||
615 | * @s: packet size in bytes | ||
616 | * @R: RTT scaled by 1000000 (i.e., microseconds) | ||
617 | * @p: loss ratio estimate scaled by 1000000 | ||
618 | * Returns X_calc in bytes per second (not scaled). | ||
619 | * | ||
620 | * Note: DO NOT alter this code unless you run test cases against it, | ||
621 | * as the code has been optimized to stop underflow/overflow. | ||
622 | */ | ||
575 | u32 tfrc_calc_x(u16 s, u32 R, u32 p) | 623 | u32 tfrc_calc_x(u16 s, u32 R, u32 p) |
576 | { | 624 | { |
577 | int index; | 625 | int index; |
578 | u32 f; | 626 | u32 f; |
579 | u64 tmp1, tmp2; | 627 | u64 tmp1, tmp2; |
580 | 628 | ||
581 | if (p < TFRC_CALC_X_SPLIT) | 629 | /* check against invalid parameters and divide-by-zero */ |
582 | index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; | 630 | BUG_ON(p > 1000000); /* p must not exceed 100% */ |
583 | else | 631 | BUG_ON(p == 0); /* f(0) = 0, divide by zero */ |
584 | index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; | 632 | if (R == 0) { /* possible divide by zero */ |
633 | DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); | ||
634 | return ~0U; | ||
635 | } | ||
585 | 636 | ||
586 | if (index < 0) | 637 | if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ |
587 | /* p should be 0 unless there is a bug in my code */ | 638 | if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ |
588 | index = 0; | 639 | DCCP_WARN("Value of p (%d) below resolution. " |
640 | "Substituting %d\n", p, TFRC_SMALLEST_P); | ||
641 | index = 0; | ||
642 | } else /* 0.0001 <= p <= 0.05 */ | ||
643 | index = p/TFRC_SMALLEST_P - 1; | ||
589 | 644 | ||
590 | if (R == 0) { | 645 | f = tfrc_calc_x_lookup[index][1]; |
591 | DCCP_WARN("RTT==0, setting to 1\n"); | ||
592 | R = 1; /* RTT can't be zero or else divide by zero */ | ||
593 | } | ||
594 | 646 | ||
595 | BUG_ON(index >= TFRC_CALC_X_ARRSIZE); | 647 | } else { /* 0.05 < p <= 1.00 */ |
648 | index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; | ||
596 | 649 | ||
597 | if (p >= TFRC_CALC_X_SPLIT) | ||
598 | f = tfrc_calc_x_lookup[index][0]; | 650 | f = tfrc_calc_x_lookup[index][0]; |
599 | else | 651 | } |
600 | f = tfrc_calc_x_lookup[index][1]; | ||
601 | 652 | ||
653 | /* The following computes X = s/(R*f(p)) in bytes per second. Since f(p) | ||
654 | * and R are both scaled by 1000000, we need to multiply by 1000000^2. | ||
655 | * ==> DO NOT alter this unless you test against overflow on 32 bit */ | ||
602 | tmp1 = ((u64)s * 100000000); | 656 | tmp1 = ((u64)s * 100000000); |
603 | tmp2 = ((u64)R * (u64)f); | 657 | tmp2 = ((u64)R * (u64)f); |
604 | do_div(tmp2, 10000); | 658 | do_div(tmp2, 10000); |
605 | do_div(tmp1, tmp2); | 659 | do_div(tmp1, tmp2); |
606 | /* Don't alter above math unless you test due to overflow on 32 bit */ | ||
607 | 660 | ||
608 | return (u32)tmp1; | 661 | return (u32)tmp1; |
609 | } | 662 | } |
@@ -611,33 +664,36 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) | |||
611 | EXPORT_SYMBOL_GPL(tfrc_calc_x); | 664 | EXPORT_SYMBOL_GPL(tfrc_calc_x); |
612 | 665 | ||
613 | /* | 666 | /* |
614 | * args: fvalue - function value to match | 667 | * tfrc_calc_x_reverse_lookup - try to find p given f(p) |
615 | * returns: p closest to that value | ||
616 | * | 668 | * |
617 | * both fvalue and p are multiplied by 1,000,000 to use ints | 669 | * @fvalue: function value to match, scaled by 1000000 |
670 | * Returns closest match for p, also scaled by 1000000 | ||
618 | */ | 671 | */ |
619 | u32 tfrc_calc_x_reverse_lookup(u32 fvalue) | 672 | u32 tfrc_calc_x_reverse_lookup(u32 fvalue) |
620 | { | 673 | { |
621 | int ctr = 0; | 674 | int index; |
622 | int small; | ||
623 | 675 | ||
624 | if (fvalue < tfrc_calc_x_lookup[0][1]) | 676 | if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ |
625 | return 0; | 677 | return 0; |
626 | 678 | ||
627 | if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) | 679 | /* Error cases. */ |
628 | small = 1; | 680 | if (fvalue < tfrc_calc_x_lookup[0][1]) { |
629 | else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) | 681 | DCCP_WARN("fvalue %d smaller than resolution\n", fvalue); |
682 | return tfrc_calc_x_lookup[0][1]; | ||
683 | } | ||
684 | if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { | ||
685 | DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue); | ||
630 | return 1000000; | 686 | return 1000000; |
631 | else | 687 | } |
632 | small = 0; | ||
633 | |||
634 | while (fvalue > tfrc_calc_x_lookup[ctr][small]) | ||
635 | ctr++; | ||
636 | 688 | ||
637 | if (small) | 689 | if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { |
638 | return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; | 690 | index = tfrc_binsearch(fvalue, 1); |
639 | else | 691 | return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; |
640 | return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; | 692 | } |
693 | |||
694 | /* else ... it must be in the coarse-grained column */ | ||
695 | index = tfrc_binsearch(fvalue, 0); | ||
696 | return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; | ||
641 | } | 697 | } |
642 | 698 | ||
643 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); | 699 | EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); |
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 6f97665983d2..311205ffa775 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c | |||
@@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, | |||
858 | int i; | 858 | int i; |
859 | 859 | ||
860 | xp->xfrm_nr = nr; | 860 | xp->xfrm_nr = nr; |
861 | xp->family = ut->family; | ||
862 | for (i = 0; i < nr; i++, ut++) { | 861 | for (i = 0; i < nr; i++, ut++) { |
863 | struct xfrm_tmpl *t = &xp->xfrm_vec[i]; | 862 | struct xfrm_tmpl *t = &xp->xfrm_vec[i]; |
864 | 863 | ||
@@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, | |||
876 | } | 875 | } |
877 | } | 876 | } |
878 | 877 | ||
878 | static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) | ||
879 | { | ||
880 | int i; | ||
881 | |||
882 | if (nr > XFRM_MAX_DEPTH) | ||
883 | return -EINVAL; | ||
884 | |||
885 | for (i = 0; i < nr; i++) { | ||
886 | /* We never validated the ut->family value, so many | ||
887 | * applications simply leave it at zero. The check was | ||
888 | * never made and ut->family was ignored because all | ||
889 | * templates could be assumed to have the same family as | ||
890 | * the policy itself. Now that we will have ipv4-in-ipv6 | ||
891 | * and ipv6-in-ipv4 tunnels, this is no longer true. | ||
892 | */ | ||
893 | if (!ut[i].family) | ||
894 | ut[i].family = family; | ||
895 | |||
896 | switch (ut[i].family) { | ||
897 | case AF_INET: | ||
898 | break; | ||
899 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
900 | case AF_INET6: | ||
901 | break; | ||
902 | #endif | ||
903 | default: | ||
904 | return -EINVAL; | ||
905 | }; | ||
906 | } | ||
907 | |||
908 | return 0; | ||
909 | } | ||
910 | |||
879 | static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) | 911 | static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) |
880 | { | 912 | { |
881 | struct rtattr *rt = xfrma[XFRMA_TMPL-1]; | 913 | struct rtattr *rt = xfrma[XFRMA_TMPL-1]; |
882 | struct xfrm_user_tmpl *utmpl; | ||
883 | int nr; | ||
884 | 914 | ||
885 | if (!rt) { | 915 | if (!rt) { |
886 | pol->xfrm_nr = 0; | 916 | pol->xfrm_nr = 0; |
887 | } else { | 917 | } else { |
888 | nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); | 918 | struct xfrm_user_tmpl *utmpl = RTA_DATA(rt); |
919 | int nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); | ||
920 | int err; | ||
889 | 921 | ||
890 | if (nr > XFRM_MAX_DEPTH) | 922 | err = validate_tmpl(nr, utmpl, pol->family); |
891 | return -EINVAL; | 923 | if (err) |
924 | return err; | ||
892 | 925 | ||
893 | copy_templates(pol, RTA_DATA(rt), nr); | 926 | copy_templates(pol, RTA_DATA(rt), nr); |
894 | } | 927 | } |
@@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf | |||
1530 | } | 1563 | } |
1531 | 1564 | ||
1532 | /* build an XP */ | 1565 | /* build an XP */ |
1533 | xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); if (!xp) { | 1566 | xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); |
1567 | if (!xp) { | ||
1534 | kfree(x); | 1568 | kfree(x); |
1535 | return err; | 1569 | return err; |
1536 | } | 1570 | } |
@@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, | |||
1979 | return NULL; | 2013 | return NULL; |
1980 | 2014 | ||
1981 | nr = ((len - sizeof(*p)) / sizeof(*ut)); | 2015 | nr = ((len - sizeof(*p)) / sizeof(*ut)); |
1982 | if (nr > XFRM_MAX_DEPTH) | 2016 | if (validate_tmpl(nr, ut, p->sel.family)) |
1983 | return NULL; | 2017 | return NULL; |
1984 | 2018 | ||
1985 | if (p->dir > XFRM_POLICY_OUT) | 2019 | if (p->dir > XFRM_POLICY_OUT) |