aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.osdl.org>2006-12-04 11:41:17 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-04 11:41:17 -0500
commit12472b4165a21f5415ff4f60f8f46311a6b7b9ac (patch)
tree8e54c608abb016a4e02a359bc508f575e2f1d7fa /net
parenta79f43a54b7df7b12bc1c0fbf4ba76520ba40354 (diff)
parentef9467f8f0803881d6b20ad6f0f770fc39bcc2c2 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
* master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6: [SUNHME]: Fix for sunhme failures on x86 [XFRM] xfrm_user: Better validation of user templates. [DCCP] tfrc: Binary search for reverse TFRC lookup [DCCP] ccid3: Deprecate TFRC_SMALLEST_P [DCCP] tfrc: Identify TFRC table limits and simplify code [DCCP] tfrc: Add protection against invalid parameters to TFRC routines [DCCP] tfrc: Fix small error in reverse lookup of p for given f(p) [DCCP] tfrc: Document boundaries and limits of the TFRC lookup table [DCCP] ccid3: Fix warning message about illegal ACK [DCCP] ccid3: Fix bug in calculation of send rate [DCCP]: Fix BUG in retransmission delay calculation [DCCP]: Use higher RTO default for CCID3
Diffstat (limited to 'net')
-rw-r--r--net/dccp/ccids/Kconfig33
-rw-r--r--net/dccp/ccids/ccid3.c119
-rw-r--r--net/dccp/ccids/ccid3.h2
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c222
-rw-r--r--net/xfrm/xfrm_user.c50
5 files changed, 279 insertions, 147 deletions
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index dac89166eb18..80f469887691 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG
89 parameter to 0 or 1. 89 parameter to 0 or 1.
90 90
91 If in doubt, say N. 91 If in doubt, say N.
92
93config IP_DCCP_CCID3_RTO
94 int "Use higher bound for nofeedback timer"
95 default 100
96 depends on IP_DCCP_CCID3 && EXPERIMENTAL
97 ---help---
98 Use higher lower bound for nofeedback timer expiration.
99
100 The TFRC nofeedback timer normally expires after the maximum of 4
101 RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
102 with a small RTT this can mean a high processing load and reduced
103 performance, since then the nofeedback timer is triggered very
104 frequently.
105
106 This option enables to set a higher lower bound for the nofeedback
107 value. Values in units of milliseconds can be set here.
108
109 A value of 0 disables this feature by enforcing the value specified
110 in RFC 3448. The following values have been suggested as bounds for
111 experimental use:
112 * 16-20ms to match the typical multimedia inter-frame interval
113 * 100ms as a reasonable compromise [default]
114 * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
115
116 The default of 100ms is a compromise between a large value for
117 efficient DCCP implementations, and a small value to avoid disrupting
118 the network in times of congestion.
119
120 The purpose of the nofeedback timer is to slow DCCP down when there
121 is serious network congestion: experimenting with larger values should
122 therefore not be performed on WANs.
123
124
92endmenu 125endmenu
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 70ebe705eb75..cf8c07b2704f 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx)
121/* 121/*
122 * Update X by 122 * Update X by
123 * If (p > 0) 123 * If (p > 0)
124 * x_calc = calcX(s, R, p); 124 * X_calc = calcX(s, R, p);
125 * X = max(min(X_calc, 2 * X_recv), s / t_mbi); 125 * X = max(min(X_calc, 2 * X_recv), s / t_mbi);
126 * Else 126 * Else
127 * If (now - tld >= R) 127 * If (now - tld >= R)
128 * X = max(min(2 * X, 2 * X_recv), s / R); 128 * X = max(min(2 * X, 2 * X_recv), s / R);
129 * tld = now; 129 * tld = now;
130 *
131 * If X has changed, we also update the scheduled send time t_now,
132 * the inter-packet interval t_ipi, and the delta value.
130 */ 133 */
131static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) 134static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
132 135
@@ -134,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
134 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); 137 struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
135 const __u32 old_x = hctx->ccid3hctx_x; 138 const __u32 old_x = hctx->ccid3hctx_x;
136 139
137 /* To avoid large error in calcX */ 140 if (hctx->ccid3hctx_p > 0) {
138 if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
139 hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, 141 hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
140 hctx->ccid3hctx_rtt, 142 hctx->ccid3hctx_rtt,
141 hctx->ccid3hctx_p); 143 hctx->ccid3hctx_p);
@@ -223,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
223 ccid3_tx_state_name(hctx->ccid3hctx_state)); 225 ccid3_tx_state_name(hctx->ccid3hctx_state));
224 /* Halve sending rate */ 226 /* Halve sending rate */
225 227
226 /* If (X_calc > 2 * X_recv) 228 /* If (p == 0 || X_calc > 2 * X_recv)
227 * X_recv = max(X_recv / 2, s / (2 * t_mbi)); 229 * X_recv = max(X_recv / 2, s / (2 * t_mbi));
228 * Else 230 * Else
229 * X_recv = X_calc / 4; 231 * X_recv = X_calc / 4;
230 */ 232 */
231 BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && 233 BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
232 hctx->ccid3hctx_x_calc == 0);
233 234
234 /* check also if p is zero -> x_calc is infinity? */ 235 if (hctx->ccid3hctx_p == 0 ||
235 if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
236 hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) 236 hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
237 hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, 237 hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
238 hctx->ccid3hctx_s / (2 * TFRC_T_MBI)); 238 hctx->ccid3hctx_s / (2 * TFRC_T_MBI));
@@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
245 } 245 }
246 /* 246 /*
247 * Schedule no feedback timer to expire in 247 * Schedule no feedback timer to expire in
248 * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) 248 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
249 * See comments in packet_recv() regarding the value of t_RTO.
249 */ 250 */
250 t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); 251 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
251 break; 252 break;
252 case TFRC_SSTATE_NO_SENT: 253 case TFRC_SSTATE_NO_SENT:
253 DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk); 254 DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk);
@@ -338,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
338 * else 339 * else
339 * // send the packet in (t_nom - t_now) milliseconds. 340 * // send the packet in (t_nom - t_now) milliseconds.
340 */ 341 */
341 if (delay >= hctx->ccid3hctx_delta) 342 if (delay - (long)hctx->ccid3hctx_delta >= 0)
342 return delay / 1000L; 343 return delay / 1000L;
343 break; 344 break;
344 case TFRC_SSTATE_TERM: 345 case TFRC_SSTATE_TERM:
@@ -412,10 +413,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
412 struct dccp_tx_hist_entry *packet; 413 struct dccp_tx_hist_entry *packet;
413 struct timeval now; 414 struct timeval now;
414 unsigned long t_nfb; 415 unsigned long t_nfb;
415 u32 t_elapsed;
416 u32 pinv; 416 u32 pinv;
417 u32 x_recv; 417 long r_sample, t_elapsed;
418 u32 r_sample;
419 418
420 BUG_ON(hctx == NULL); 419 BUG_ON(hctx == NULL);
421 420
@@ -426,31 +425,44 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
426 425
427 opt_recv = &hctx->ccid3hctx_options_received; 426 opt_recv = &hctx->ccid3hctx_options_received;
428 427
429 t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
430 x_recv = opt_recv->ccid3or_receive_rate;
431 pinv = opt_recv->ccid3or_loss_event_rate;
432
433 switch (hctx->ccid3hctx_state) { 428 switch (hctx->ccid3hctx_state) {
434 case TFRC_SSTATE_NO_FBACK: 429 case TFRC_SSTATE_NO_FBACK:
435 case TFRC_SSTATE_FBACK: 430 case TFRC_SSTATE_FBACK:
436 /* Calculate new round trip sample by 431 /* get packet from history to look up t_recvdata */
437 * R_sample = (now - t_recvdata) - t_delay */
438 /* get t_recvdata from history */
439 packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, 432 packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
440 DCCP_SKB_CB(skb)->dccpd_ack_seq); 433 DCCP_SKB_CB(skb)->dccpd_ack_seq);
441 if (unlikely(packet == NULL)) { 434 if (unlikely(packet == NULL)) {
442 DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist " 435 DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist "
443 "in history!\n", dccp_role(sk), sk, 436 "in history!\n", dccp_role(sk), sk,
444 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, 437 (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
445 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); 438 dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
446 return; 439 return;
447 } 440 }
448 441
449 /* Update RTT */ 442 /* Update receive rate */
443 hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
444
445 /* Update loss event rate */
446 pinv = opt_recv->ccid3or_loss_event_rate;
447 if (pinv == ~0U || pinv == 0)
448 hctx->ccid3hctx_p = 0;
449 else
450 hctx->ccid3hctx_p = 1000000 / pinv;
451
450 dccp_timestamp(sk, &now); 452 dccp_timestamp(sk, &now);
451 r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); 453
452 if (unlikely(r_sample <= t_elapsed)) 454 /*
453 DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n", 455 * Calculate new round trip sample as per [RFC 3448, 4.3] by
456 * R_sample = (now - t_recvdata) - t_elapsed
457 */
458 r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
459 t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
460
461 if (unlikely(r_sample <= 0)) {
462 DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample);
463 r_sample = 0;
464 } else if (unlikely(r_sample <= t_elapsed))
465 DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n",
454 r_sample, t_elapsed); 466 r_sample, t_elapsed);
455 else 467 else
456 r_sample -= t_elapsed; 468 r_sample -= t_elapsed;
@@ -473,31 +485,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
473 hctx->ccid3hctx_t_ld = now; 485 hctx->ccid3hctx_t_ld = now;
474 486
475 ccid3_update_send_time(hctx); 487 ccid3_update_send_time(hctx);
476 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
477 } else {
478 hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
479 r_sample / 10;
480 ccid3_hc_tx_update_x(sk, &now);
481 }
482 488
483 ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " 489 ccid3_pr_debug("%s(%p), s=%u, w_init=%u, "
484 "r_sample=%us\n", dccp_role(sk), sk, 490 "R_sample=%ldus, X=%u\n", dccp_role(sk),
485 hctx->ccid3hctx_rtt, r_sample); 491 sk, hctx->ccid3hctx_s, w_init, r_sample,
492 hctx->ccid3hctx_x);
486 493
487 /* Update receive rate */ 494 ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
488 hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ 495 } else {
496 hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt +
497 (u32)r_sample ) / 10;
489 498
490 /* Update loss event rate */ 499 ccid3_hc_tx_update_x(sk, &now);
491 if (pinv == ~0 || pinv == 0)
492 hctx->ccid3hctx_p = 0;
493 else {
494 hctx->ccid3hctx_p = 1000000 / pinv;
495 500
496 if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { 501 ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, "
497 hctx->ccid3hctx_p = TFRC_SMALLEST_P; 502 "p=%u, X_calc=%u, X=%u\n", dccp_role(sk),
498 ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", 503 sk, hctx->ccid3hctx_rtt, r_sample,
499 dccp_role(sk), sk); 504 hctx->ccid3hctx_s, hctx->ccid3hctx_p,
500 } 505 hctx->ccid3hctx_x_calc,
506 hctx->ccid3hctx_x);
501 } 507 }
502 508
503 /* unschedule no feedback timer */ 509 /* unschedule no feedback timer */
@@ -512,16 +518,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
512 */ 518 */
513 sk->sk_write_space(sk); 519 sk->sk_write_space(sk);
514 520
515 /* Update timeout interval. We use the alternative variant of 521 /*
516 * [RFC 3448, 3.1] which sets the upper bound of t_rto to one 522 * Update timeout interval for the nofeedback timer.
517 * second, as it is suggested for TCP (see RFC 2988, 2.4). */ 523 * We use a configuration option to increase the lower bound.
524 * This can help avoid triggering the nofeedback timer too often
525 * ('spinning') on LANs with small RTTs.
526 */
518 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, 527 hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
519 USEC_PER_SEC ); 528 CONFIG_IP_DCCP_CCID3_RTO *
529 (USEC_PER_SEC/1000) );
520 /* 530 /*
521 * Schedule no feedback timer to expire in 531 * Schedule no feedback timer to expire in
522 * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) 532 * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
523 */ 533 */
524 t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); 534 t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
525 535
526 ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " 536 ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
527 "expire in %lu jiffies (%luus)\n", 537 "expire in %lu jiffies (%luus)\n",
@@ -535,7 +545,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
535 hctx->ccid3hctx_idle = 1; 545 hctx->ccid3hctx_idle = 1;
536 break; 546 break;
537 case TFRC_SSTATE_NO_SENT: 547 case TFRC_SSTATE_NO_SENT:
538 DCCP_WARN("Illegal ACK received - no packet has been sent\n"); 548 if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT)
549 DCCP_WARN("Illegal ACK received - no packet sent\n");
539 /* fall through */ 550 /* fall through */
540 case TFRC_SSTATE_TERM: /* ignore feedback when closing */ 551 case TFRC_SSTATE_TERM: /* ignore feedback when closing */
541 break; 552 break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 27cb20ae1da8..07596d704ef9 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -51,8 +51,6 @@
51/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ 51/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
52#define TFRC_T_MBI 64 52#define TFRC_T_MBI 64
53 53
54#define TFRC_SMALLEST_P 40
55
56enum ccid3_options { 54enum ccid3_options {
57 TFRC_OPT_LOSS_EVENT_RATE = 192, 55 TFRC_OPT_LOSS_EVENT_RATE = 192,
58 TFRC_OPT_LOSS_INTERVALS = 193, 56 TFRC_OPT_LOSS_INTERVALS = 193,
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 2601012383fb..ddac2c511e2f 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -18,10 +18,79 @@
18#include "tfrc.h" 18#include "tfrc.h"
19 19
20#define TFRC_CALC_X_ARRSIZE 500 20#define TFRC_CALC_X_ARRSIZE 500
21#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
22#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
21 23
22#define TFRC_CALC_X_SPLIT 50000 24/*
23/* equivalent to 0.05 */ 25 TFRC TCP Reno Throughput Equation Lookup Table for f(p)
24 26
27 The following two-column lookup table implements a part of the TCP throughput
28 equation from [RFC 3448, sec. 3.1]:
29
30 s
31 X_calc = --------------------------------------------------------------
32 R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
33
34 Where:
35 X is the transmit rate in bytes/second
36 s is the packet size in bytes
37 R is the round trip time in seconds
38 p is the loss event rate, between 0 and 1.0, of the number of loss
39 events as a fraction of the number of packets transmitted
40 t_RTO is the TCP retransmission timeout value in seconds
41 b is the number of packets acknowledged by a single TCP ACK
42
43 We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
44
45 s
46 X_calc = -------------------------------------------------------
47 R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
48
49 which we can break down into:
50
51 s
52 X_calc = ---------
53 R * f(p)
54
55 where f(p) is given for 0 < p <= 1 by:
56
57 f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
58
59 Since this is kernel code, floating-point arithmetic is avoided in favour of
60 integer arithmetic. This means that nearly all fractional parameters are
61 scaled by 1000000:
62 * the parameters p and R
63 * the return result f(p)
64 The lookup table therefore actually tabulates the following function g(q):
65
66 g(q) = 1000000 * f(q/1000000)
67
68 Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
69 granularity for the practically more relevant case of small values of p (up to
70 5%), the second column is used; the first one ranges up to 100%. This split
71 corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
72 determines the smallest resolution possible with this lookup table:
73
74 TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
75
76 The entire table is generated by:
77 for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
78 lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
79 lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
80 }
81
82 With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
83 lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
84 lookup[M][0] = g(1000000) = 1000000 * f(100%)
85 lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
86 lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
87
88 In summary, the two columns represent f(p) for the following ranges:
89 * The first column is for 0.002 <= p <= 1.0
90 * The second column is for 0.0001 <= p <= 0.05
91 Where the columns overlap, the second (finer-grained) is given preference,
92 i.e. the first column is used only for p >= 0.05.
93 */
25static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { 94static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
26 { 37172, 8172 }, 95 { 37172, 8172 },
27 { 53499, 11567 }, 96 { 53499, 11567 },
@@ -525,85 +594,69 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
525 { 243315981, 271305 } 594 { 243315981, 271305 }
526}; 595};
527 596
528/* Calculate the send rate as per section 3.1 of RFC3448 597/* return largest index i such that fval <= lookup[i][small] */
529 598static inline u32 tfrc_binsearch(u32 fval, u8 small)
530Returns send rate in bytes per second 599{
531 600 u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
532Integer maths and lookups are used as not allowed floating point in kernel 601
533 602 while (low < high) {
534The function for Xcalc as per section 3.1 of RFC3448 is: 603 try = (low + high) / 2;
535 604 if (fval <= tfrc_calc_x_lookup[try][small])
536X = s 605 high = try;
537 ------------------------------------------------------------- 606 else
538 R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) 607 low = try + 1;
539 608 }
540where 609 return high;
541X is the trasmit rate in bytes/second 610}
542s is the packet size in bytes
543R is the round trip time in seconds
544p is the loss event rate, between 0 and 1.0, of the number of loss events
545 as a fraction of the number of packets transmitted
546t_RTO is the TCP retransmission timeout value in seconds
547b is the number of packets acknowledged by a single TCP acknowledgement
548
549we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
550
551X = s
552 -----------------------------------------------------------------------
553 R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
554
555
556which we can break down into:
557
558X = s
559 --------
560 R * f(p)
561
562where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
563
564Function parameters:
565s - bytes
566R - RTT in usecs
567p - loss rate (decimal fraction multiplied by 1,000,000)
568
569Returns Xcalc in bytes per second
570
571DON'T alter this code unless you run test cases against it as the code
572has been manipulated to stop underflow/overlow.
573 611
574*/ 612/**
613 * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
614 *
615 * @s: packet size in bytes
616 * @R: RTT scaled by 1000000 (i.e., microseconds)
617 * @p: loss ratio estimate scaled by 1000000
618 * Returns X_calc in bytes per second (not scaled).
619 *
620 * Note: DO NOT alter this code unless you run test cases against it,
621 * as the code has been optimized to stop underflow/overflow.
622 */
575u32 tfrc_calc_x(u16 s, u32 R, u32 p) 623u32 tfrc_calc_x(u16 s, u32 R, u32 p)
576{ 624{
577 int index; 625 int index;
578 u32 f; 626 u32 f;
579 u64 tmp1, tmp2; 627 u64 tmp1, tmp2;
580 628
581 if (p < TFRC_CALC_X_SPLIT) 629 /* check against invalid parameters and divide-by-zero */
582 index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; 630 BUG_ON(p > 1000000); /* p must not exceed 100% */
583 else 631 BUG_ON(p == 0); /* f(0) = 0, divide by zero */
584 index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; 632 if (R == 0) { /* possible divide by zero */
633 DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
634 return ~0U;
635 }
585 636
586 if (index < 0) 637 if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
587 /* p should be 0 unless there is a bug in my code */ 638 if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
588 index = 0; 639 DCCP_WARN("Value of p (%d) below resolution. "
640 "Substituting %d\n", p, TFRC_SMALLEST_P);
641 index = 0;
642 } else /* 0.0001 <= p <= 0.05 */
643 index = p/TFRC_SMALLEST_P - 1;
589 644
590 if (R == 0) { 645 f = tfrc_calc_x_lookup[index][1];
591 DCCP_WARN("RTT==0, setting to 1\n");
592 R = 1; /* RTT can't be zero or else divide by zero */
593 }
594 646
595 BUG_ON(index >= TFRC_CALC_X_ARRSIZE); 647 } else { /* 0.05 < p <= 1.00 */
648 index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
596 649
597 if (p >= TFRC_CALC_X_SPLIT)
598 f = tfrc_calc_x_lookup[index][0]; 650 f = tfrc_calc_x_lookup[index][0];
599 else 651 }
600 f = tfrc_calc_x_lookup[index][1];
601 652
653 /* The following computes X = s/(R*f(p)) in bytes per second. Since f(p)
654 * and R are both scaled by 1000000, we need to multiply by 1000000^2.
655 * ==> DO NOT alter this unless you test against overflow on 32 bit */
602 tmp1 = ((u64)s * 100000000); 656 tmp1 = ((u64)s * 100000000);
603 tmp2 = ((u64)R * (u64)f); 657 tmp2 = ((u64)R * (u64)f);
604 do_div(tmp2, 10000); 658 do_div(tmp2, 10000);
605 do_div(tmp1, tmp2); 659 do_div(tmp1, tmp2);
606 /* Don't alter above math unless you test due to overflow on 32 bit */
607 660
608 return (u32)tmp1; 661 return (u32)tmp1;
609} 662}
@@ -611,33 +664,36 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
611EXPORT_SYMBOL_GPL(tfrc_calc_x); 664EXPORT_SYMBOL_GPL(tfrc_calc_x);
612 665
613/* 666/*
614 * args: fvalue - function value to match 667 * tfrc_calc_x_reverse_lookup - try to find p given f(p)
615 * returns: p closest to that value
616 * 668 *
617 * both fvalue and p are multiplied by 1,000,000 to use ints 669 * @fvalue: function value to match, scaled by 1000000
670 * Returns closest match for p, also scaled by 1000000
618 */ 671 */
619u32 tfrc_calc_x_reverse_lookup(u32 fvalue) 672u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
620{ 673{
621 int ctr = 0; 674 int index;
622 int small;
623 675
624 if (fvalue < tfrc_calc_x_lookup[0][1]) 676 if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
625 return 0; 677 return 0;
626 678
627 if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) 679 /* Error cases. */
628 small = 1; 680 if (fvalue < tfrc_calc_x_lookup[0][1]) {
629 else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) 681 DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
682 return tfrc_calc_x_lookup[0][1];
683 }
684 if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
685 DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
630 return 1000000; 686 return 1000000;
631 else 687 }
632 small = 0;
633
634 while (fvalue > tfrc_calc_x_lookup[ctr][small])
635 ctr++;
636 688
637 if (small) 689 if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
638 return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; 690 index = tfrc_binsearch(fvalue, 1);
639 else 691 return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
640 return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; 692 }
693
694 /* else ... it must be in the coarse-grained column */
695 index = tfrc_binsearch(fvalue, 0);
696 return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
641} 697}
642 698
643EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); 699EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 6f97665983d2..311205ffa775 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
858 int i; 858 int i;
859 859
860 xp->xfrm_nr = nr; 860 xp->xfrm_nr = nr;
861 xp->family = ut->family;
862 for (i = 0; i < nr; i++, ut++) { 861 for (i = 0; i < nr; i++, ut++) {
863 struct xfrm_tmpl *t = &xp->xfrm_vec[i]; 862 struct xfrm_tmpl *t = &xp->xfrm_vec[i];
864 863
@@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
876 } 875 }
877} 876}
878 877
878static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
879{
880 int i;
881
882 if (nr > XFRM_MAX_DEPTH)
883 return -EINVAL;
884
885 for (i = 0; i < nr; i++) {
886 /* We never validated the ut->family value, so many
887 * applications simply leave it at zero. The check was
888 * never made and ut->family was ignored because all
889 * templates could be assumed to have the same family as
890 * the policy itself. Now that we will have ipv4-in-ipv6
891 * and ipv6-in-ipv4 tunnels, this is no longer true.
892 */
893 if (!ut[i].family)
894 ut[i].family = family;
895
896 switch (ut[i].family) {
897 case AF_INET:
898 break;
899#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
900 case AF_INET6:
901 break;
902#endif
903 default:
904 return -EINVAL;
905 };
906 }
907
908 return 0;
909}
910
879static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) 911static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma)
880{ 912{
881 struct rtattr *rt = xfrma[XFRMA_TMPL-1]; 913 struct rtattr *rt = xfrma[XFRMA_TMPL-1];
882 struct xfrm_user_tmpl *utmpl;
883 int nr;
884 914
885 if (!rt) { 915 if (!rt) {
886 pol->xfrm_nr = 0; 916 pol->xfrm_nr = 0;
887 } else { 917 } else {
888 nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); 918 struct xfrm_user_tmpl *utmpl = RTA_DATA(rt);
919 int nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl);
920 int err;
889 921
890 if (nr > XFRM_MAX_DEPTH) 922 err = validate_tmpl(nr, utmpl, pol->family);
891 return -EINVAL; 923 if (err)
924 return err;
892 925
893 copy_templates(pol, RTA_DATA(rt), nr); 926 copy_templates(pol, RTA_DATA(rt), nr);
894 } 927 }
@@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf
1530 } 1563 }
1531 1564
1532 /* build an XP */ 1565 /* build an XP */
1533 xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); if (!xp) { 1566 xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err);
1567 if (!xp) {
1534 kfree(x); 1568 kfree(x);
1535 return err; 1569 return err;
1536 } 1570 }
@@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
1979 return NULL; 2013 return NULL;
1980 2014
1981 nr = ((len - sizeof(*p)) / sizeof(*ut)); 2015 nr = ((len - sizeof(*p)) / sizeof(*ut));
1982 if (nr > XFRM_MAX_DEPTH) 2016 if (validate_tmpl(nr, ut, p->sel.family))
1983 return NULL; 2017 return NULL;
1984 2018
1985 if (p->dir > XFRM_POLICY_OUT) 2019 if (p->dir > XFRM_POLICY_OUT)