diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 90 | ||||
-rw-r--r-- | net/ipv4/Makefile | 10 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 2 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 114 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 33 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 331 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 237 | ||||
-rw-r--r-- | net/ipv4/tcp_diag.c | 34 | ||||
-rw-r--r-- | net/ipv4/tcp_highspeed.c | 181 | ||||
-rw-r--r-- | net/ipv4/tcp_htcp.c | 289 | ||||
-rw-r--r-- | net/ipv4/tcp_hybla.c | 187 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 737 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
-rw-r--r-- | net/ipv4/tcp_scalable.c | 68 | ||||
-rw-r--r-- | net/ipv4/tcp_vegas.c | 411 | ||||
-rw-r--r-- | net/ipv4/tcp_westwood.c | 259 |
18 files changed, 2215 insertions, 798 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c349..690e88ba2484 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -433,5 +433,95 @@ config IP_TCPDIAG | |||
433 | config IP_TCPDIAG_IPV6 | 433 | config IP_TCPDIAG_IPV6 |
434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) | 434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) |
435 | 435 | ||
436 | # TCP Reno is builtin (required as fallback) | ||
437 | menu "TCP congestion control" | ||
438 | depends on INET | ||
439 | |||
440 | config TCP_CONG_BIC | ||
441 | tristate "Binary Increase Congestion (BIC) control" | ||
442 | depends on INET | ||
443 | default y | ||
444 | ---help--- | ||
445 | BIC-TCP is a sender-side only change that ensures a linear RTT | ||
446 | fairness under large windows while offering both scalability and | ||
447 | bounded TCP-friendliness. The protocol combines two schemes | ||
448 | called additive increase and binary search increase. When the | ||
449 | congestion window is large, additive increase with a large | ||
450 | increment ensures linear RTT fairness as well as good | ||
451 | scalability. Under small congestion windows, binary search | ||
452 | increase provides TCP friendliness. | ||
453 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ | ||
454 | |||
455 | config TCP_CONG_WESTWOOD | ||
456 | tristate "TCP Westwood+" | ||
457 | depends on INET | ||
458 | default m | ||
459 | ---help--- | ||
460 | TCP Westwood+ is a sender-side only modification of the TCP Reno | ||
461 | protocol stack that optimizes the performance of TCP congestion | ||
462 | control. It is based on end-to-end bandwidth estimation to set | ||
463 | congestion window and slow start threshold after a congestion | ||
464 | episode. Using this estimation, TCP Westwood+ adaptively sets a | ||
465 | slow start threshold and a congestion window which takes into | ||
466 | account the bandwidth used at the time congestion is experienced. | ||
467 | TCP Westwood+ significantly increases fairness wrt TCP Reno in | ||
468 | wired networks and throughput over wireless links. | ||
469 | |||
470 | config TCP_CONG_HTCP | ||
471 | tristate "H-TCP" | ||
472 | depends on INET | ||
473 | default m | ||
474 | ---help--- | ||
475 | H-TCP is a sender-side only modification of the TCP Reno | ||
476 | protocol stack that optimizes the performance of TCP | ||
477 | congestion control for high speed network links. It uses a | ||
478 | modeswitch to change the alpha and beta parameters of TCP Reno | ||
479 | based on network conditions and in a way so as to be fair with | ||
480 | other Reno and H-TCP flows. | ||
481 | |||
482 | config TCP_CONG_HSTCP | ||
483 | tristate "High Speed TCP" | ||
484 | depends on INET && EXPERIMENTAL | ||
485 | default n | ||
486 | ---help--- | ||
487 | Sally Floyd's High Speed TCP (RFC 3649) congestion control. | ||
488 | A modification to TCP's congestion control mechanism for use | ||
489 | with large congestion windows. A table indicates how much to | ||
490 | increase the congestion window by when an ACK is received. | ||
491 | For more detail see http://www.icir.org/floyd/hstcp.html | ||
492 | |||
493 | config TCP_CONG_HYBLA | ||
494 | tristate "TCP-Hybla congestion control algorithm" | ||
495 | depends on INET && EXPERIMENTAL | ||
496 | default n | ||
497 | ---help--- | ||
498 | TCP-Hybla is a sender-side only change that eliminates penalization of | ||
499 | long-RTT, large-bandwidth connections, like when satellite legs are | ||
500 | involved, especially when sharing a common bottleneck with normal | ||
501 | terrestrial connections. | ||
502 | |||
503 | config TCP_CONG_VEGAS | ||
504 | tristate "TCP Vegas" | ||
505 | depends on INET && EXPERIMENTAL | ||
506 | default n | ||
507 | ---help--- | ||
508 | TCP Vegas is a sender-side only change to TCP that anticipates | ||
509 | the onset of congestion by estimating the bandwidth. TCP Vegas | ||
510 | adjusts the sending rate by modifying the congestion | ||
511 | window. TCP Vegas should provide less packet loss, but it is | ||
512 | not as aggressive as TCP Reno. | ||
513 | |||
514 | config TCP_CONG_SCALABLE | ||
515 | tristate "Scalable TCP" | ||
516 | depends on INET && EXPERIMENTAL | ||
517 | default n | ||
518 | ---help--- | ||
519 | Scalable TCP is a sender-side only change to TCP which uses a | ||
520 | MIMD congestion control algorithm which has some nice scaling | ||
521 | properties, though is known to have fairness issues. | ||
522 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | ||
523 | |||
524 | endmenu | ||
525 | |||
436 | source "net/ipv4/ipvs/Kconfig" | 526 | source "net/ipv4/ipvs/Kconfig" |
437 | 527 | ||
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1add..5718cdb3a61e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -5,7 +5,8 @@ | |||
5 | obj-y := utils.o route.o inetpeer.o protocol.o \ | 5 | obj-y := utils.o route.o inetpeer.o protocol.o \ |
6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ | 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ |
7 | ip_output.o ip_sockglue.o \ | 7 | ip_output.o ip_sockglue.o \ |
8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ | 8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
9 | tcp_minisocks.o tcp_cong.o \ | ||
9 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ | 10 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ |
10 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o | 11 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o |
11 | 12 | ||
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/ | |||
30 | obj-$(CONFIG_IP_VS) += ipvs/ | 31 | obj-$(CONFIG_IP_VS) += ipvs/ |
31 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o | 32 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o |
32 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 33 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
34 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | ||
35 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | ||
36 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | ||
37 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | ||
38 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o | ||
39 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o | ||
40 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | ||
33 | 41 | ||
34 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 42 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
35 | xfrm4_output.o | 43 | xfrm4_output.o |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 650dcb12d9a1..d8a10e3dd77d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev, | |||
1471 | * by sysctl and we wouldn't want anyone to change it under our feet | 1471 | * by sysctl and we wouldn't want anyone to change it under our feet |
1472 | * (see SIOCSIFNAME). | 1472 | * (see SIOCSIFNAME). |
1473 | */ | 1473 | */ |
1474 | dev_name = net_sysctl_strdup(dev_name); | 1474 | dev_name = kstrdup(dev_name, GFP_KERNEL); |
1475 | if (!dev_name) | 1475 | if (!dev_name) |
1476 | goto free; | 1476 | goto free; |
1477 | 1477 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0b..e32894532416 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, | |||
118 | return 1; | 118 | return 1; |
119 | } | 119 | } |
120 | 120 | ||
121 | static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, | ||
122 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
123 | { | ||
124 | char val[TCP_CA_NAME_MAX]; | ||
125 | ctl_table tbl = { | ||
126 | .data = val, | ||
127 | .maxlen = TCP_CA_NAME_MAX, | ||
128 | }; | ||
129 | int ret; | ||
130 | |||
131 | tcp_get_default_congestion_control(val); | ||
132 | |||
133 | ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); | ||
134 | if (write && ret == 0) | ||
135 | ret = tcp_set_default_congestion_control(val); | ||
136 | return ret; | ||
137 | } | ||
138 | |||
139 | int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, | ||
140 | void __user *oldval, size_t __user *oldlenp, | ||
141 | void __user *newval, size_t newlen, | ||
142 | void **context) | ||
143 | { | ||
144 | char val[TCP_CA_NAME_MAX]; | ||
145 | ctl_table tbl = { | ||
146 | .data = val, | ||
147 | .maxlen = TCP_CA_NAME_MAX, | ||
148 | }; | ||
149 | int ret; | ||
150 | |||
151 | tcp_get_default_congestion_control(val); | ||
152 | ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, | ||
153 | context); | ||
154 | if (ret == 0 && newval && newlen) | ||
155 | ret = tcp_set_default_congestion_control(val); | ||
156 | return ret; | ||
157 | } | ||
158 | |||
159 | |||
121 | ctl_table ipv4_table[] = { | 160 | ctl_table ipv4_table[] = { |
122 | { | 161 | { |
123 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, | 162 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, |
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { | |||
612 | .proc_handler = &proc_dointvec, | 651 | .proc_handler = &proc_dointvec, |
613 | }, | 652 | }, |
614 | { | 653 | { |
615 | .ctl_name = NET_TCP_WESTWOOD, | ||
616 | .procname = "tcp_westwood", | ||
617 | .data = &sysctl_tcp_westwood, | ||
618 | .maxlen = sizeof(int), | ||
619 | .mode = 0644, | ||
620 | .proc_handler = &proc_dointvec, | ||
621 | }, | ||
622 | { | ||
623 | .ctl_name = NET_TCP_VEGAS, | ||
624 | .procname = "tcp_vegas_cong_avoid", | ||
625 | .data = &sysctl_tcp_vegas_cong_avoid, | ||
626 | .maxlen = sizeof(int), | ||
627 | .mode = 0644, | ||
628 | .proc_handler = &proc_dointvec, | ||
629 | }, | ||
630 | { | ||
631 | .ctl_name = NET_TCP_VEGAS_ALPHA, | ||
632 | .procname = "tcp_vegas_alpha", | ||
633 | .data = &sysctl_tcp_vegas_alpha, | ||
634 | .maxlen = sizeof(int), | ||
635 | .mode = 0644, | ||
636 | .proc_handler = &proc_dointvec, | ||
637 | }, | ||
638 | { | ||
639 | .ctl_name = NET_TCP_VEGAS_BETA, | ||
640 | .procname = "tcp_vegas_beta", | ||
641 | .data = &sysctl_tcp_vegas_beta, | ||
642 | .maxlen = sizeof(int), | ||
643 | .mode = 0644, | ||
644 | .proc_handler = &proc_dointvec, | ||
645 | }, | ||
646 | { | ||
647 | .ctl_name = NET_TCP_VEGAS_GAMMA, | ||
648 | .procname = "tcp_vegas_gamma", | ||
649 | .data = &sysctl_tcp_vegas_gamma, | ||
650 | .maxlen = sizeof(int), | ||
651 | .mode = 0644, | ||
652 | .proc_handler = &proc_dointvec, | ||
653 | }, | ||
654 | { | ||
655 | .ctl_name = NET_TCP_BIC, | ||
656 | .procname = "tcp_bic", | ||
657 | .data = &sysctl_tcp_bic, | ||
658 | .maxlen = sizeof(int), | ||
659 | .mode = 0644, | ||
660 | .proc_handler = &proc_dointvec, | ||
661 | }, | ||
662 | { | ||
663 | .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, | ||
664 | .procname = "tcp_bic_fast_convergence", | ||
665 | .data = &sysctl_tcp_bic_fast_convergence, | ||
666 | .maxlen = sizeof(int), | ||
667 | .mode = 0644, | ||
668 | .proc_handler = &proc_dointvec, | ||
669 | }, | ||
670 | { | ||
671 | .ctl_name = NET_TCP_BIC_LOW_WINDOW, | ||
672 | .procname = "tcp_bic_low_window", | ||
673 | .data = &sysctl_tcp_bic_low_window, | ||
674 | .maxlen = sizeof(int), | ||
675 | .mode = 0644, | ||
676 | .proc_handler = &proc_dointvec, | ||
677 | }, | ||
678 | { | ||
679 | .ctl_name = NET_TCP_MODERATE_RCVBUF, | 654 | .ctl_name = NET_TCP_MODERATE_RCVBUF, |
680 | .procname = "tcp_moderate_rcvbuf", | 655 | .procname = "tcp_moderate_rcvbuf", |
681 | .data = &sysctl_tcp_moderate_rcvbuf, | 656 | .data = &sysctl_tcp_moderate_rcvbuf, |
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { | |||
692 | .proc_handler = &proc_dointvec, | 667 | .proc_handler = &proc_dointvec, |
693 | }, | 668 | }, |
694 | { | 669 | { |
695 | .ctl_name = NET_TCP_BIC_BETA, | 670 | .ctl_name = NET_TCP_CONG_CONTROL, |
696 | .procname = "tcp_bic_beta", | 671 | .procname = "tcp_congestion_control", |
697 | .data = &sysctl_tcp_bic_beta, | ||
698 | .maxlen = sizeof(int), | ||
699 | .mode = 0644, | 672 | .mode = 0644, |
700 | .proc_handler = &proc_dointvec, | 673 | .maxlen = TCP_CA_NAME_MAX, |
674 | .proc_handler = &proc_tcp_congestion_control, | ||
675 | .strategy = &sysctl_tcp_congestion_control, | ||
701 | }, | 676 | }, |
677 | |||
702 | { .ctl_name = 0 } | 678 | { .ctl_name = 0 } |
703 | }; | 679 | }; |
704 | 680 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd36..882436da9a3a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1927 | return tp->af_specific->setsockopt(sk, level, optname, | 1927 | return tp->af_specific->setsockopt(sk, level, optname, |
1928 | optval, optlen); | 1928 | optval, optlen); |
1929 | 1929 | ||
1930 | /* This is a string value; all the others are ints */ | ||
1931 | if (optname == TCP_CONGESTION) { | ||
1932 | char name[TCP_CA_NAME_MAX]; | ||
1933 | |||
1934 | if (optlen < 1) | ||
1935 | return -EINVAL; | ||
1936 | |||
1937 | val = strncpy_from_user(name, optval, | ||
1938 | min(TCP_CA_NAME_MAX-1, optlen)); | ||
1939 | if (val < 0) | ||
1940 | return -EFAULT; | ||
1941 | name[val] = 0; | ||
1942 | |||
1943 | lock_sock(sk); | ||
1944 | err = tcp_set_congestion_control(tp, name); | ||
1945 | release_sock(sk); | ||
1946 | return err; | ||
1947 | } | ||
1948 | |||
1930 | if (optlen < sizeof(int)) | 1949 | if (optlen < sizeof(int)) |
1931 | return -EINVAL; | 1950 | return -EINVAL; |
1932 | 1951 | ||
@@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2211 | case TCP_QUICKACK: | 2230 | case TCP_QUICKACK: |
2212 | val = !tp->ack.pingpong; | 2231 | val = !tp->ack.pingpong; |
2213 | break; | 2232 | break; |
2233 | |||
2234 | case TCP_CONGESTION: | ||
2235 | if (get_user(len, optlen)) | ||
2236 | return -EFAULT; | ||
2237 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | ||
2238 | if (put_user(len, optlen)) | ||
2239 | return -EFAULT; | ||
2240 | if (copy_to_user(optval, tp->ca_ops->name, len)) | ||
2241 | return -EFAULT; | ||
2242 | return 0; | ||
2214 | default: | 2243 | default: |
2215 | return -ENOPROTOOPT; | 2244 | return -ENOPROTOOPT; |
2216 | }; | 2245 | }; |
@@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2224 | 2253 | ||
2225 | 2254 | ||
2226 | extern void __skb_cb_too_small_for_tcp(int, int); | 2255 | extern void __skb_cb_too_small_for_tcp(int, int); |
2227 | extern void tcpdiag_init(void); | 2256 | extern struct tcp_congestion_ops tcp_reno; |
2228 | 2257 | ||
2229 | static __initdata unsigned long thash_entries; | 2258 | static __initdata unsigned long thash_entries; |
2230 | static int __init set_thash_entries(char *str) | 2259 | static int __init set_thash_entries(char *str) |
@@ -2333,6 +2362,8 @@ void __init tcp_init(void) | |||
2333 | printk(KERN_INFO "TCP: Hash tables configured " | 2362 | printk(KERN_INFO "TCP: Hash tables configured " |
2334 | "(established %d bind %d)\n", | 2363 | "(established %d bind %d)\n", |
2335 | tcp_ehash_size << 1, tcp_bhash_size); | 2364 | tcp_ehash_size << 1, tcp_bhash_size); |
2365 | |||
2366 | tcp_register_congestion_control(&tcp_reno); | ||
2336 | } | 2367 | } |
2337 | 2368 | ||
2338 | EXPORT_SYMBOL(tcp_accept); | 2369 | EXPORT_SYMBOL(tcp_accept); |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 000000000000..ec38d45d6649 --- /dev/null +++ b/net/ipv4/tcp_bic.c | |||
@@ -0,0 +1,331 @@ | |||
1 | /* | ||
2 | * Binary Increase Congestion control for TCP | ||
3 | * | ||
4 | * This is from the implementation of BICTCP in | ||
5 | * Lisong Xu, Khaled Harfoush, and Injong Rhee. | ||
6 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
7 | * Networks" in INFOCOM 2004 | ||
8 | * Available from: | ||
9 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
10 | * | ||
11 | * Unless BIC is enabled and congestion window is large | ||
12 | * this behaves the same as the original Reno. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <net/tcp.h> | ||
19 | |||
20 | |||
21 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | ||
22 | * max_cwnd = snd_cwnd * beta | ||
23 | */ | ||
24 | #define BICTCP_B 4 /* | ||
25 | * In binary search, | ||
26 | * go to point (max+min)/N | ||
27 | */ | ||
28 | |||
29 | static int fast_convergence = 1; | ||
30 | static int max_increment = 32; | ||
31 | static int low_window = 14; | ||
32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
33 | static int low_utilization_threshold = 153; | ||
34 | static int low_utilization_period = 2; | ||
35 | static int initial_ssthresh = 100; | ||
36 | static int smooth_part = 20; | ||
37 | |||
38 | module_param(fast_convergence, int, 0644); | ||
39 | MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); | ||
40 | module_param(max_increment, int, 0644); | ||
41 | MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); | ||
42 | module_param(low_window, int, 0644); | ||
43 | MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); | ||
44 | module_param(beta, int, 0644); | ||
45 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); | ||
46 | module_param(low_utilization_threshold, int, 0644); | ||
47 | MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); | ||
48 | module_param(low_utilization_period, int, 0644); | ||
49 | MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); | ||
50 | module_param(initial_ssthresh, int, 0644); | ||
51 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | ||
52 | module_param(smooth_part, int, 0644); | ||
53 | MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); | ||
54 | |||
55 | |||
56 | /* BIC TCP Parameters */ | ||
57 | struct bictcp { | ||
58 | u32 cnt; /* increase cwnd by 1 after ACKs */ | ||
59 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | ||
60 | u32 loss_cwnd; /* congestion window at last loss */ | ||
61 | u32 last_cwnd; /* the last snd_cwnd */ | ||
62 | u32 last_time; /* time when updated last_cwnd */ | ||
63 | u32 delay_min; /* min delay */ | ||
64 | u32 delay_max; /* max delay */ | ||
65 | u32 last_delay; | ||
66 | u8 low_utilization;/* 0: high; 1: low */ | ||
67 | u32 low_utilization_start; /* starting time of low utilization detection*/ | ||
68 | u32 epoch_start; /* beginning of an epoch */ | ||
69 | #define ACK_RATIO_SHIFT 4 | ||
70 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | ||
71 | }; | ||
72 | |||
73 | static inline void bictcp_reset(struct bictcp *ca) | ||
74 | { | ||
75 | ca->cnt = 0; | ||
76 | ca->last_max_cwnd = 0; | ||
77 | ca->loss_cwnd = 0; | ||
78 | ca->last_cwnd = 0; | ||
79 | ca->last_time = 0; | ||
80 | ca->delay_min = 0; | ||
81 | ca->delay_max = 0; | ||
82 | ca->last_delay = 0; | ||
83 | ca->low_utilization = 0; | ||
84 | ca->low_utilization_start = 0; | ||
85 | ca->epoch_start = 0; | ||
86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | ||
87 | } | ||
88 | |||
89 | static void bictcp_init(struct tcp_sock *tp) | ||
90 | { | ||
91 | bictcp_reset(tcp_ca(tp)); | ||
92 | if (initial_ssthresh) | ||
93 | tp->snd_ssthresh = initial_ssthresh; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Compute congestion window to use. | ||
98 | */ | ||
99 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | ||
100 | { | ||
101 | if (ca->last_cwnd == cwnd && | ||
102 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) | ||
103 | return; | ||
104 | |||
105 | ca->last_cwnd = cwnd; | ||
106 | ca->last_time = tcp_time_stamp; | ||
107 | |||
108 | if (ca->epoch_start == 0) /* record the beginning of an epoch */ | ||
109 | ca->epoch_start = tcp_time_stamp; | ||
110 | |||
111 | /* start off normal */ | ||
112 | if (cwnd <= low_window) { | ||
113 | ca->cnt = cwnd; | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | /* binary increase */ | ||
118 | if (cwnd < ca->last_max_cwnd) { | ||
119 | __u32 dist = (ca->last_max_cwnd - cwnd) | ||
120 | / BICTCP_B; | ||
121 | |||
122 | if (dist > max_increment) | ||
123 | /* linear increase */ | ||
124 | ca->cnt = cwnd / max_increment; | ||
125 | else if (dist <= 1U) | ||
126 | /* binary search increase */ | ||
127 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
128 | else | ||
129 | /* binary search increase */ | ||
130 | ca->cnt = cwnd / dist; | ||
131 | } else { | ||
132 | /* slow start and linear increase */ | ||
133 | if (cwnd < ca->last_max_cwnd + BICTCP_B) | ||
134 | /* slow start */ | ||
135 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
136 | else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) | ||
137 | /* slow start */ | ||
138 | ca->cnt = (cwnd * (BICTCP_B-1)) | ||
139 | / cwnd-ca->last_max_cwnd; | ||
140 | else | ||
141 | /* linear increase */ | ||
142 | ca->cnt = cwnd / max_increment; | ||
143 | } | ||
144 | |||
145 | /* if in slow start or link utilization is very low */ | ||
146 | if ( ca->loss_cwnd == 0 || | ||
147 | (cwnd > ca->loss_cwnd && ca->low_utilization)) { | ||
148 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ | ||
149 | ca->cnt = 20; | ||
150 | } | ||
151 | |||
152 | ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; | ||
153 | if (ca->cnt == 0) /* cannot be zero */ | ||
154 | ca->cnt = 1; | ||
155 | } | ||
156 | |||
157 | |||
158 | /* Detect low utilization in congestion avoidance */ | ||
159 | static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | ||
160 | { | ||
161 | struct bictcp *ca = tcp_ca(tp); | ||
162 | u32 dist, delay; | ||
163 | |||
164 | /* No time stamp */ | ||
165 | if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || | ||
166 | /* Discard delay samples right after fast recovery */ | ||
167 | tcp_time_stamp < ca->epoch_start + HZ || | ||
168 | /* these delay samples may not be accurate */ | ||
169 | flag == 0) { | ||
170 | ca->last_delay = 0; | ||
171 | goto notlow; | ||
172 | } | ||
173 | |||
174 | delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ | ||
175 | ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | ||
176 | if (delay == 0) /* no previous delay sample */ | ||
177 | goto notlow; | ||
178 | |||
179 | /* first time call or link delay decreases */ | ||
180 | if (ca->delay_min == 0 || ca->delay_min > delay) { | ||
181 | ca->delay_min = ca->delay_max = delay; | ||
182 | goto notlow; | ||
183 | } | ||
184 | |||
185 | if (ca->delay_max < delay) | ||
186 | ca->delay_max = delay; | ||
187 | |||
188 | /* utilization is low, if avg delay < dist*threshold | ||
189 | for checking_period time */ | ||
190 | dist = ca->delay_max - ca->delay_min; | ||
191 | if (dist <= ca->delay_min>>6 || | ||
192 | tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) | ||
193 | goto notlow; | ||
194 | |||
195 | if (ca->low_utilization_start == 0) { | ||
196 | ca->low_utilization = 0; | ||
197 | ca->low_utilization_start = tcp_time_stamp; | ||
198 | } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) | ||
199 | > low_utilization_period*HZ) { | ||
200 | ca->low_utilization = 1; | ||
201 | } | ||
202 | |||
203 | return; | ||
204 | |||
205 | notlow: | ||
206 | ca->low_utilization = 0; | ||
207 | ca->low_utilization_start = 0; | ||
208 | |||
209 | } | ||
210 | |||
211 | static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
212 | u32 seq_rtt, u32 in_flight, int data_acked) | ||
213 | { | ||
214 | struct bictcp *ca = tcp_ca(tp); | ||
215 | |||
216 | bictcp_low_utilization(tp, data_acked); | ||
217 | |||
218 | if (in_flight < tp->snd_cwnd) | ||
219 | return; | ||
220 | |||
221 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
222 | /* In "safe" area, increase. */ | ||
223 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
224 | tp->snd_cwnd++; | ||
225 | } else { | ||
226 | bictcp_update(ca, tp->snd_cwnd); | ||
227 | |||
228 | /* In dangerous area, increase slowly. | ||
229 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
230 | */ | ||
231 | if (tp->snd_cwnd_cnt >= ca->cnt) { | ||
232 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
233 | tp->snd_cwnd++; | ||
234 | tp->snd_cwnd_cnt = 0; | ||
235 | } else | ||
236 | tp->snd_cwnd_cnt++; | ||
237 | } | ||
238 | |||
239 | } | ||
240 | |||
241 | /* | ||
242 | * behave like Reno until low_window is reached, | ||
243 | * then increase congestion window slowly | ||
244 | */ | ||
245 | static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | ||
246 | { | ||
247 | struct bictcp *ca = tcp_ca(tp); | ||
248 | |||
249 | ca->epoch_start = 0; /* end of epoch */ | ||
250 | |||
251 | /* in case of wrong delay_max*/ | ||
252 | if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) | ||
253 | ca->delay_max = ca->delay_min | ||
254 | + ((ca->delay_max - ca->delay_min)* 90) / 100; | ||
255 | |||
256 | /* Wmax and fast convergence */ | ||
257 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) | ||
258 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) | ||
259 | / (2 * BICTCP_BETA_SCALE); | ||
260 | else | ||
261 | ca->last_max_cwnd = tp->snd_cwnd; | ||
262 | |||
263 | ca->loss_cwnd = tp->snd_cwnd; | ||
264 | |||
265 | |||
266 | if (tp->snd_cwnd <= low_window) | ||
267 | return max(tp->snd_cwnd >> 1U, 2U); | ||
268 | else | ||
269 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | ||
270 | } | ||
271 | |||
272 | static u32 bictcp_undo_cwnd(struct tcp_sock *tp) | ||
273 | { | ||
274 | struct bictcp *ca = tcp_ca(tp); | ||
275 | |||
276 | return max(tp->snd_cwnd, ca->last_max_cwnd); | ||
277 | } | ||
278 | |||
279 | static u32 bictcp_min_cwnd(struct tcp_sock *tp) | ||
280 | { | ||
281 | return tp->snd_ssthresh; | ||
282 | } | ||
283 | |||
284 | static void bictcp_state(struct tcp_sock *tp, u8 new_state) | ||
285 | { | ||
286 | if (new_state == TCP_CA_Loss) | ||
287 | bictcp_reset(tcp_ca(tp)); | ||
288 | } | ||
289 | |||
290 | /* Track delayed acknowledgement ratio using sliding window | ||
291 | * ratio = (15*ratio + sample) / 16 | ||
292 | */ | ||
293 | static void bictcp_acked(struct tcp_sock *tp, u32 cnt) | ||
294 | { | ||
295 | if (cnt > 0 && tp->ca_state == TCP_CA_Open) { | ||
296 | struct bictcp *ca = tcp_ca(tp); | ||
297 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
298 | ca->delayed_ack += cnt; | ||
299 | } | ||
300 | } | ||
301 | |||
302 | |||
303 | static struct tcp_congestion_ops bictcp = { | ||
304 | .init = bictcp_init, | ||
305 | .ssthresh = bictcp_recalc_ssthresh, | ||
306 | .cong_avoid = bictcp_cong_avoid, | ||
307 | .set_state = bictcp_state, | ||
308 | .undo_cwnd = bictcp_undo_cwnd, | ||
309 | .min_cwnd = bictcp_min_cwnd, | ||
310 | .pkts_acked = bictcp_acked, | ||
311 | .owner = THIS_MODULE, | ||
312 | .name = "bic", | ||
313 | }; | ||
314 | |||
315 | static int __init bictcp_register(void) | ||
316 | { | ||
317 | BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); | ||
318 | return tcp_register_congestion_control(&bictcp); | ||
319 | } | ||
320 | |||
321 | static void __exit bictcp_unregister(void) | ||
322 | { | ||
323 | tcp_unregister_congestion_control(&bictcp); | ||
324 | } | ||
325 | |||
326 | module_init(bictcp_register); | ||
327 | module_exit(bictcp_unregister); | ||
328 | |||
329 | MODULE_AUTHOR("Stephen Hemminger"); | ||
330 | MODULE_LICENSE("GPL"); | ||
331 | MODULE_DESCRIPTION("BIC TCP"); | ||
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 000000000000..4970d10a7785 --- /dev/null +++ b/net/ipv4/tcp_cong.c | |||
@@ -0,0 +1,237 @@ | |||
1 | /* | ||
2 | * Pluggable TCP congestion control support and newReno | ||
3 | * congestion control. | ||
4 | * Based on ideas from I/O scheduler support and Web100. | ||
5 | * | ||
6 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/list.h> | ||
14 | #include <net/tcp.h> | ||
15 | |||
16 | static DEFINE_SPINLOCK(tcp_cong_list_lock); | ||
17 | static LIST_HEAD(tcp_cong_list); | ||
18 | |||
19 | /* Simple linear search, don't expect many entries! */ | ||
20 | static struct tcp_congestion_ops *tcp_ca_find(const char *name) | ||
21 | { | ||
22 | struct tcp_congestion_ops *e; | ||
23 | |||
24 | list_for_each_entry_rcu(e, &tcp_cong_list, list) { | ||
25 | if (strcmp(e->name, name) == 0) | ||
26 | return e; | ||
27 | } | ||
28 | |||
29 | return NULL; | ||
30 | } | ||
31 | |||
32 | /* | ||
33 | * Attach new congestion control algorthim to the list | ||
34 | * of available options. | ||
35 | */ | ||
36 | int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | ||
37 | { | ||
38 | int ret = 0; | ||
39 | |||
40 | /* all algorithms must implement ssthresh and cong_avoid ops */ | ||
41 | if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { | ||
42 | printk(KERN_ERR "TCP %s does not implement required ops\n", | ||
43 | ca->name); | ||
44 | return -EINVAL; | ||
45 | } | ||
46 | |||
47 | spin_lock(&tcp_cong_list_lock); | ||
48 | if (tcp_ca_find(ca->name)) { | ||
49 | printk(KERN_NOTICE "TCP %s already registered\n", ca->name); | ||
50 | ret = -EEXIST; | ||
51 | } else { | ||
52 | list_add_rcu(&ca->list, &tcp_cong_list); | ||
53 | printk(KERN_INFO "TCP %s registered\n", ca->name); | ||
54 | } | ||
55 | spin_unlock(&tcp_cong_list_lock); | ||
56 | |||
57 | return ret; | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(tcp_register_congestion_control); | ||
60 | |||
61 | /* | ||
62 | * Remove congestion control algorithm, called from | ||
63 | * the module's remove function. Module ref counts are used | ||
64 | * to ensure that this can't be done till all sockets using | ||
65 | * that method are closed. | ||
66 | */ | ||
67 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | ||
68 | { | ||
69 | spin_lock(&tcp_cong_list_lock); | ||
70 | list_del_rcu(&ca->list); | ||
71 | spin_unlock(&tcp_cong_list_lock); | ||
72 | } | ||
73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | ||
74 | |||
75 | /* Assign choice of congestion control. */ | ||
76 | void tcp_init_congestion_control(struct tcp_sock *tp) | ||
77 | { | ||
78 | struct tcp_congestion_ops *ca; | ||
79 | |||
80 | if (tp->ca_ops != &tcp_init_congestion_ops) | ||
81 | return; | ||
82 | |||
83 | rcu_read_lock(); | ||
84 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | ||
85 | if (try_module_get(ca->owner)) { | ||
86 | tp->ca_ops = ca; | ||
87 | break; | ||
88 | } | ||
89 | |||
90 | } | ||
91 | rcu_read_unlock(); | ||
92 | |||
93 | if (tp->ca_ops->init) | ||
94 | tp->ca_ops->init(tp); | ||
95 | } | ||
96 | |||
97 | /* Manage refcounts on socket close. */ | ||
98 | void tcp_cleanup_congestion_control(struct tcp_sock *tp) | ||
99 | { | ||
100 | if (tp->ca_ops->release) | ||
101 | tp->ca_ops->release(tp); | ||
102 | module_put(tp->ca_ops->owner); | ||
103 | } | ||
104 | |||
105 | /* Used by sysctl to change default congestion control */ | ||
106 | int tcp_set_default_congestion_control(const char *name) | ||
107 | { | ||
108 | struct tcp_congestion_ops *ca; | ||
109 | int ret = -ENOENT; | ||
110 | |||
111 | spin_lock(&tcp_cong_list_lock); | ||
112 | ca = tcp_ca_find(name); | ||
113 | #ifdef CONFIG_KMOD | ||
114 | if (!ca) { | ||
115 | spin_unlock(&tcp_cong_list_lock); | ||
116 | |||
117 | request_module("tcp_%s", name); | ||
118 | spin_lock(&tcp_cong_list_lock); | ||
119 | ca = tcp_ca_find(name); | ||
120 | } | ||
121 | #endif | ||
122 | |||
123 | if (ca) { | ||
124 | list_move(&ca->list, &tcp_cong_list); | ||
125 | ret = 0; | ||
126 | } | ||
127 | spin_unlock(&tcp_cong_list_lock); | ||
128 | |||
129 | return ret; | ||
130 | } | ||
131 | |||
132 | /* Get current default congestion control */ | ||
133 | void tcp_get_default_congestion_control(char *name) | ||
134 | { | ||
135 | struct tcp_congestion_ops *ca; | ||
136 | /* We will always have reno... */ | ||
137 | BUG_ON(list_empty(&tcp_cong_list)); | ||
138 | |||
139 | rcu_read_lock(); | ||
140 | ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); | ||
141 | strncpy(name, ca->name, TCP_CA_NAME_MAX); | ||
142 | rcu_read_unlock(); | ||
143 | } | ||
144 | |||
145 | /* Change congestion control for socket */ | ||
146 | int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) | ||
147 | { | ||
148 | struct tcp_congestion_ops *ca; | ||
149 | int err = 0; | ||
150 | |||
151 | rcu_read_lock(); | ||
152 | ca = tcp_ca_find(name); | ||
153 | if (ca == tp->ca_ops) | ||
154 | goto out; | ||
155 | |||
156 | if (!ca) | ||
157 | err = -ENOENT; | ||
158 | |||
159 | else if (!try_module_get(ca->owner)) | ||
160 | err = -EBUSY; | ||
161 | |||
162 | else { | ||
163 | tcp_cleanup_congestion_control(tp); | ||
164 | tp->ca_ops = ca; | ||
165 | if (tp->ca_ops->init) | ||
166 | tp->ca_ops->init(tp); | ||
167 | } | ||
168 | out: | ||
169 | rcu_read_unlock(); | ||
170 | return err; | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * TCP Reno congestion control | ||
175 | * This is special case used for fallback as well. | ||
176 | */ | ||
177 | /* This is Jacobson's slow start and congestion avoidance. | ||
178 | * SIGCOMM '88, p. 328. | ||
179 | */ | ||
180 | void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | ||
181 | int flag) | ||
182 | { | ||
183 | if (in_flight < tp->snd_cwnd) | ||
184 | return; | ||
185 | |||
186 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
187 | /* In "safe" area, increase. */ | ||
188 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
189 | tp->snd_cwnd++; | ||
190 | } else { | ||
191 | /* In dangerous area, increase slowly. | ||
192 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
193 | */ | ||
194 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
195 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
196 | tp->snd_cwnd++; | ||
197 | tp->snd_cwnd_cnt = 0; | ||
198 | } else | ||
199 | tp->snd_cwnd_cnt++; | ||
200 | } | ||
201 | } | ||
202 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | ||
203 | |||
204 | /* Slow start threshold is half the congestion window (min 2) */ | ||
205 | u32 tcp_reno_ssthresh(struct tcp_sock *tp) | ||
206 | { | ||
207 | return max(tp->snd_cwnd >> 1U, 2U); | ||
208 | } | ||
209 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | ||
210 | |||
211 | /* Lower bound on congestion window. */ | ||
212 | u32 tcp_reno_min_cwnd(struct tcp_sock *tp) | ||
213 | { | ||
214 | return tp->snd_ssthresh/2; | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | ||
217 | |||
218 | struct tcp_congestion_ops tcp_reno = { | ||
219 | .name = "reno", | ||
220 | .owner = THIS_MODULE, | ||
221 | .ssthresh = tcp_reno_ssthresh, | ||
222 | .cong_avoid = tcp_reno_cong_avoid, | ||
223 | .min_cwnd = tcp_reno_min_cwnd, | ||
224 | }; | ||
225 | |||
226 | /* Initial congestion control used (until SYN) | ||
227 | * really reno under another name so we can tell difference | ||
228 | * during tcp_set_default_congestion_control | ||
229 | */ | ||
230 | struct tcp_congestion_ops tcp_init_congestion_ops = { | ||
231 | .name = "", | ||
232 | .owner = THIS_MODULE, | ||
233 | .ssthresh = tcp_reno_ssthresh, | ||
234 | .cong_avoid = tcp_reno_cong_avoid, | ||
235 | .min_cwnd = tcp_reno_min_cwnd, | ||
236 | }; | ||
237 | EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); | ||
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc07921..f66945cb158f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
@@ -42,15 +42,8 @@ struct tcpdiag_entry | |||
42 | 42 | ||
43 | static struct sock *tcpnl; | 43 | static struct sock *tcpnl; |
44 | 44 | ||
45 | |||
46 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ | 45 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ |
47 | ({ int rtalen = RTA_LENGTH(attrlen); \ | 46 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) |
48 | struct rtattr *rta; \ | ||
49 | if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ | ||
50 | rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ | ||
51 | rta->rta_type = attrtype; \ | ||
52 | rta->rta_len = rtalen; \ | ||
53 | RTA_DATA(rta); }) | ||
54 | 47 | ||
55 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | 48 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, |
56 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) | 49 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) |
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
61 | struct nlmsghdr *nlh; | 54 | struct nlmsghdr *nlh; |
62 | struct tcp_info *info = NULL; | 55 | struct tcp_info *info = NULL; |
63 | struct tcpdiag_meminfo *minfo = NULL; | 56 | struct tcpdiag_meminfo *minfo = NULL; |
64 | struct tcpvegas_info *vinfo = NULL; | ||
65 | unsigned char *b = skb->tail; | 57 | unsigned char *b = skb->tail; |
66 | 58 | ||
67 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | 59 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); |
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
73 | if (ext & (1<<(TCPDIAG_INFO-1))) | 65 | if (ext & (1<<(TCPDIAG_INFO-1))) |
74 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); | 66 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); |
75 | 67 | ||
76 | if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) | 68 | if (ext & (1<<(TCPDIAG_CONG-1))) { |
77 | && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) | 69 | size_t len = strlen(tp->ca_ops->name); |
78 | vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); | 70 | strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), |
71 | tp->ca_ops->name); | ||
72 | } | ||
79 | } | 73 | } |
80 | r->tcpdiag_family = sk->sk_family; | 74 | r->tcpdiag_family = sk->sk_family; |
81 | r->tcpdiag_state = sk->sk_state; | 75 | r->tcpdiag_state = sk->sk_state; |
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
166 | if (info) | 160 | if (info) |
167 | tcp_get_info(sk, info); | 161 | tcp_get_info(sk, info); |
168 | 162 | ||
169 | if (vinfo) { | 163 | if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) |
170 | if (tcp_is_vegas(tp)) { | 164 | tp->ca_ops->get_info(tp, ext, skb); |
171 | vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; | ||
172 | vinfo->tcpv_rttcnt = tp->vegas.cntRTT; | ||
173 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); | ||
174 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); | ||
175 | } else { | ||
176 | vinfo->tcpv_enabled = 0; | ||
177 | vinfo->tcpv_rttcnt = 0; | ||
178 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); | ||
179 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); | ||
180 | } | ||
181 | } | ||
182 | 165 | ||
183 | nlh->nlmsg_len = skb->tail - b; | 166 | nlh->nlmsg_len = skb->tail - b; |
184 | return skb->len; | 167 | return skb->len; |
185 | 168 | ||
169 | rtattr_failure: | ||
186 | nlmsg_failure: | 170 | nlmsg_failure: |
187 | skb_trim(skb, b - skb->data); | 171 | skb_trim(skb, b - skb->data); |
188 | return -1; | 172 | return -1; |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 000000000000..36c51f8136bf --- /dev/null +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -0,0 +1,181 @@ | |||
1 | /* | ||
2 | * Sally Floyd's High Speed TCP (RFC 3649) congestion control | ||
3 | * | ||
4 | * See http://www.icir.org/floyd/hstcp.html | ||
5 | * | ||
6 | * John Heffner <jheffner@psc.edu> | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <net/tcp.h> | ||
12 | |||
13 | |||
14 | /* From AIMD tables from RFC 3649 appendix B, | ||
15 | * with fixed-point MD scaled <<8. | ||
16 | */ | ||
17 | static const struct hstcp_aimd_val { | ||
18 | unsigned int cwnd; | ||
19 | unsigned int md; | ||
20 | } hstcp_aimd_vals[] = { | ||
21 | { 38, 128, /* 0.50 */ }, | ||
22 | { 118, 112, /* 0.44 */ }, | ||
23 | { 221, 104, /* 0.41 */ }, | ||
24 | { 347, 98, /* 0.38 */ }, | ||
25 | { 495, 93, /* 0.37 */ }, | ||
26 | { 663, 89, /* 0.35 */ }, | ||
27 | { 851, 86, /* 0.34 */ }, | ||
28 | { 1058, 83, /* 0.33 */ }, | ||
29 | { 1284, 81, /* 0.32 */ }, | ||
30 | { 1529, 78, /* 0.31 */ }, | ||
31 | { 1793, 76, /* 0.30 */ }, | ||
32 | { 2076, 74, /* 0.29 */ }, | ||
33 | { 2378, 72, /* 0.28 */ }, | ||
34 | { 2699, 71, /* 0.28 */ }, | ||
35 | { 3039, 69, /* 0.27 */ }, | ||
36 | { 3399, 68, /* 0.27 */ }, | ||
37 | { 3778, 66, /* 0.26 */ }, | ||
38 | { 4177, 65, /* 0.26 */ }, | ||
39 | { 4596, 64, /* 0.25 */ }, | ||
40 | { 5036, 62, /* 0.25 */ }, | ||
41 | { 5497, 61, /* 0.24 */ }, | ||
42 | { 5979, 60, /* 0.24 */ }, | ||
43 | { 6483, 59, /* 0.23 */ }, | ||
44 | { 7009, 58, /* 0.23 */ }, | ||
45 | { 7558, 57, /* 0.22 */ }, | ||
46 | { 8130, 56, /* 0.22 */ }, | ||
47 | { 8726, 55, /* 0.22 */ }, | ||
48 | { 9346, 54, /* 0.21 */ }, | ||
49 | { 9991, 53, /* 0.21 */ }, | ||
50 | { 10661, 52, /* 0.21 */ }, | ||
51 | { 11358, 52, /* 0.20 */ }, | ||
52 | { 12082, 51, /* 0.20 */ }, | ||
53 | { 12834, 50, /* 0.20 */ }, | ||
54 | { 13614, 49, /* 0.19 */ }, | ||
55 | { 14424, 48, /* 0.19 */ }, | ||
56 | { 15265, 48, /* 0.19 */ }, | ||
57 | { 16137, 47, /* 0.19 */ }, | ||
58 | { 17042, 46, /* 0.18 */ }, | ||
59 | { 17981, 45, /* 0.18 */ }, | ||
60 | { 18955, 45, /* 0.18 */ }, | ||
61 | { 19965, 44, /* 0.17 */ }, | ||
62 | { 21013, 43, /* 0.17 */ }, | ||
63 | { 22101, 43, /* 0.17 */ }, | ||
64 | { 23230, 42, /* 0.17 */ }, | ||
65 | { 24402, 41, /* 0.16 */ }, | ||
66 | { 25618, 41, /* 0.16 */ }, | ||
67 | { 26881, 40, /* 0.16 */ }, | ||
68 | { 28193, 39, /* 0.16 */ }, | ||
69 | { 29557, 39, /* 0.15 */ }, | ||
70 | { 30975, 38, /* 0.15 */ }, | ||
71 | { 32450, 38, /* 0.15 */ }, | ||
72 | { 33986, 37, /* 0.15 */ }, | ||
73 | { 35586, 36, /* 0.14 */ }, | ||
74 | { 37253, 36, /* 0.14 */ }, | ||
75 | { 38992, 35, /* 0.14 */ }, | ||
76 | { 40808, 35, /* 0.14 */ }, | ||
77 | { 42707, 34, /* 0.13 */ }, | ||
78 | { 44694, 33, /* 0.13 */ }, | ||
79 | { 46776, 33, /* 0.13 */ }, | ||
80 | { 48961, 32, /* 0.13 */ }, | ||
81 | { 51258, 32, /* 0.13 */ }, | ||
82 | { 53677, 31, /* 0.12 */ }, | ||
83 | { 56230, 30, /* 0.12 */ }, | ||
84 | { 58932, 30, /* 0.12 */ }, | ||
85 | { 61799, 29, /* 0.12 */ }, | ||
86 | { 64851, 28, /* 0.11 */ }, | ||
87 | { 68113, 28, /* 0.11 */ }, | ||
88 | { 71617, 27, /* 0.11 */ }, | ||
89 | { 75401, 26, /* 0.10 */ }, | ||
90 | { 79517, 26, /* 0.10 */ }, | ||
91 | { 84035, 25, /* 0.10 */ }, | ||
92 | { 89053, 24, /* 0.10 */ }, | ||
93 | }; | ||
94 | |||
95 | #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) | ||
96 | |||
97 | struct hstcp { | ||
98 | u32 ai; | ||
99 | }; | ||
100 | |||
101 | static void hstcp_init(struct tcp_sock *tp) | ||
102 | { | ||
103 | struct hstcp *ca = tcp_ca(tp); | ||
104 | |||
105 | ca->ai = 0; | ||
106 | |||
107 | /* Ensure the MD arithmetic works. This is somewhat pedantic, | ||
108 | * since I don't think we will see a cwnd this large. :) */ | ||
109 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | ||
110 | } | ||
111 | |||
112 | static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, | ||
113 | u32 in_flight, int good) | ||
114 | { | ||
115 | struct hstcp *ca = tcp_ca(tp); | ||
116 | |||
117 | if (in_flight < tp->snd_cwnd) | ||
118 | return; | ||
119 | |||
120 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
121 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
122 | tp->snd_cwnd++; | ||
123 | } else { | ||
124 | /* Update AIMD parameters */ | ||
125 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { | ||
126 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | ||
127 | ca->ai < HSTCP_AIMD_MAX) | ||
128 | ca->ai++; | ||
129 | } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { | ||
130 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | ||
131 | ca->ai > 0) | ||
132 | ca->ai--; | ||
133 | } | ||
134 | |||
135 | /* Do additive increase */ | ||
136 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) { | ||
137 | tp->snd_cwnd_cnt += ca->ai; | ||
138 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
139 | tp->snd_cwnd++; | ||
140 | tp->snd_cwnd_cnt -= tp->snd_cwnd; | ||
141 | } | ||
142 | } | ||
143 | } | ||
144 | } | ||
145 | |||
146 | static u32 hstcp_ssthresh(struct tcp_sock *tp) | ||
147 | { | ||
148 | struct hstcp *ca = tcp_ca(tp); | ||
149 | |||
150 | /* Do multiplicative decrease */ | ||
151 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); | ||
152 | } | ||
153 | |||
154 | |||
155 | static struct tcp_congestion_ops tcp_highspeed = { | ||
156 | .init = hstcp_init, | ||
157 | .ssthresh = hstcp_ssthresh, | ||
158 | .cong_avoid = hstcp_cong_avoid, | ||
159 | .min_cwnd = tcp_reno_min_cwnd, | ||
160 | |||
161 | .owner = THIS_MODULE, | ||
162 | .name = "highspeed" | ||
163 | }; | ||
164 | |||
165 | static int __init hstcp_register(void) | ||
166 | { | ||
167 | BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); | ||
168 | return tcp_register_congestion_control(&tcp_highspeed); | ||
169 | } | ||
170 | |||
171 | static void __exit hstcp_unregister(void) | ||
172 | { | ||
173 | tcp_unregister_congestion_control(&tcp_highspeed); | ||
174 | } | ||
175 | |||
176 | module_init(hstcp_register); | ||
177 | module_exit(hstcp_unregister); | ||
178 | |||
179 | MODULE_AUTHOR("John Heffner"); | ||
180 | MODULE_LICENSE("GPL"); | ||
181 | MODULE_DESCRIPTION("High Speed TCP"); | ||
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 000000000000..40168275acf9 --- /dev/null +++ b/net/ipv4/tcp_htcp.c | |||
@@ -0,0 +1,289 @@ | |||
1 | /* | ||
2 | * H-TCP congestion control. The algorithm is detailed in: | ||
3 | * R.N.Shorten, D.J.Leith: | ||
4 | * "H-TCP: TCP for high-speed and long-distance networks" | ||
5 | * Proc. PFLDnet, Argonne, 2004. | ||
6 | * http://www.hamilton.ie/net/htcp3.pdf | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <net/tcp.h> | ||
13 | |||
14 | #define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ | ||
15 | #define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ | ||
16 | #define BETA_MAX 102 /* 0.8 with shift << 7 */ | ||
17 | |||
18 | static int use_rtt_scaling = 1; | ||
19 | module_param(use_rtt_scaling, int, 0644); | ||
20 | MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); | ||
21 | |||
22 | static int use_bandwidth_switch = 1; | ||
23 | module_param(use_bandwidth_switch, int, 0644); | ||
24 | MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); | ||
25 | |||
26 | struct htcp { | ||
27 | u16 alpha; /* Fixed point arith, << 7 */ | ||
28 | u8 beta; /* Fixed point arith, << 7 */ | ||
29 | u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ | ||
30 | u8 ccount; /* Number of RTTs since last congestion event */ | ||
31 | u8 undo_ccount; | ||
32 | u16 packetcount; | ||
33 | u32 minRTT; | ||
34 | u32 maxRTT; | ||
35 | u32 snd_cwnd_cnt2; | ||
36 | |||
37 | u32 undo_maxRTT; | ||
38 | u32 undo_old_maxB; | ||
39 | |||
40 | /* Bandwidth estimation */ | ||
41 | u32 minB; | ||
42 | u32 maxB; | ||
43 | u32 old_maxB; | ||
44 | u32 Bi; | ||
45 | u32 lasttime; | ||
46 | }; | ||
47 | |||
48 | static inline void htcp_reset(struct htcp *ca) | ||
49 | { | ||
50 | ca->undo_ccount = ca->ccount; | ||
51 | ca->undo_maxRTT = ca->maxRTT; | ||
52 | ca->undo_old_maxB = ca->old_maxB; | ||
53 | |||
54 | ca->ccount = 0; | ||
55 | ca->snd_cwnd_cnt2 = 0; | ||
56 | } | ||
57 | |||
58 | static u32 htcp_cwnd_undo(struct tcp_sock *tp) | ||
59 | { | ||
60 | struct htcp *ca = tcp_ca(tp); | ||
61 | ca->ccount = ca->undo_ccount; | ||
62 | ca->maxRTT = ca->undo_maxRTT; | ||
63 | ca->old_maxB = ca->undo_old_maxB; | ||
64 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); | ||
65 | } | ||
66 | |||
67 | static inline void measure_rtt(struct tcp_sock *tp) | ||
68 | { | ||
69 | struct htcp *ca = tcp_ca(tp); | ||
70 | u32 srtt = tp->srtt>>3; | ||
71 | |||
72 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ | ||
73 | if (ca->minRTT > srtt || !ca->minRTT) | ||
74 | ca->minRTT = srtt; | ||
75 | |||
76 | /* max RTT */ | ||
77 | if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { | ||
78 | if (ca->maxRTT < ca->minRTT) | ||
79 | ca->maxRTT = ca->minRTT; | ||
80 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) | ||
81 | ca->maxRTT = srtt; | ||
82 | } | ||
83 | } | ||
84 | |||
85 | static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) | ||
86 | { | ||
87 | struct htcp *ca = tcp_ca(tp); | ||
88 | u32 now = tcp_time_stamp; | ||
89 | |||
90 | /* achieved throughput calculations */ | ||
91 | if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { | ||
92 | ca->packetcount = 0; | ||
93 | ca->lasttime = now; | ||
94 | return; | ||
95 | } | ||
96 | |||
97 | ca->packetcount += pkts_acked; | ||
98 | |||
99 | if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) | ||
100 | && now - ca->lasttime >= ca->minRTT | ||
101 | && ca->minRTT > 0) { | ||
102 | __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); | ||
103 | if (ca->ccount <= 3) { | ||
104 | /* just after backoff */ | ||
105 | ca->minB = ca->maxB = ca->Bi = cur_Bi; | ||
106 | } else { | ||
107 | ca->Bi = (3*ca->Bi + cur_Bi)/4; | ||
108 | if (ca->Bi > ca->maxB) | ||
109 | ca->maxB = ca->Bi; | ||
110 | if (ca->minB > ca->maxB) | ||
111 | ca->minB = ca->maxB; | ||
112 | } | ||
113 | ca->packetcount = 0; | ||
114 | ca->lasttime = now; | ||
115 | } | ||
116 | } | ||
117 | |||
118 | static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) | ||
119 | { | ||
120 | if (use_bandwidth_switch) { | ||
121 | u32 maxB = ca->maxB; | ||
122 | u32 old_maxB = ca->old_maxB; | ||
123 | ca->old_maxB = ca->maxB; | ||
124 | |||
125 | if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { | ||
126 | ca->beta = BETA_MIN; | ||
127 | ca->modeswitch = 0; | ||
128 | return; | ||
129 | } | ||
130 | } | ||
131 | |||
132 | if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { | ||
133 | ca->beta = (minRTT<<7)/maxRTT; | ||
134 | if (ca->beta < BETA_MIN) | ||
135 | ca->beta = BETA_MIN; | ||
136 | else if (ca->beta > BETA_MAX) | ||
137 | ca->beta = BETA_MAX; | ||
138 | } else { | ||
139 | ca->beta = BETA_MIN; | ||
140 | ca->modeswitch = 1; | ||
141 | } | ||
142 | } | ||
143 | |||
144 | static inline void htcp_alpha_update(struct htcp *ca) | ||
145 | { | ||
146 | u32 minRTT = ca->minRTT; | ||
147 | u32 factor = 1; | ||
148 | u32 diff = ca->ccount * minRTT; /* time since last backoff */ | ||
149 | |||
150 | if (diff > HZ) { | ||
151 | diff -= HZ; | ||
152 | factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; | ||
153 | } | ||
154 | |||
155 | if (use_rtt_scaling && minRTT) { | ||
156 | u32 scale = (HZ<<3)/(10*minRTT); | ||
157 | scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ | ||
158 | factor = (factor<<3)/scale; | ||
159 | if (!factor) | ||
160 | factor = 1; | ||
161 | } | ||
162 | |||
163 | ca->alpha = 2*factor*((1<<7)-ca->beta); | ||
164 | if (!ca->alpha) | ||
165 | ca->alpha = ALPHA_BASE; | ||
166 | } | ||
167 | |||
168 | /* After we have the rtt data to calculate beta, we'd still prefer to wait one | ||
169 | * rtt before we adjust our beta to ensure we are working from a consistent | ||
170 | * data. | ||
171 | * | ||
172 | * This function should be called when we hit a congestion event since only at | ||
173 | * that point do we really have a real sense of maxRTT (the queues en route | ||
174 | * were getting just too full now). | ||
175 | */ | ||
176 | static void htcp_param_update(struct tcp_sock *tp) | ||
177 | { | ||
178 | struct htcp *ca = tcp_ca(tp); | ||
179 | u32 minRTT = ca->minRTT; | ||
180 | u32 maxRTT = ca->maxRTT; | ||
181 | |||
182 | htcp_beta_update(ca, minRTT, maxRTT); | ||
183 | htcp_alpha_update(ca); | ||
184 | |||
185 | /* add slowly fading memory for maxRTT to accommodate routing changes etc */ | ||
186 | if (minRTT > 0 && maxRTT > minRTT) | ||
187 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; | ||
188 | } | ||
189 | |||
190 | static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) | ||
191 | { | ||
192 | struct htcp *ca = tcp_ca(tp); | ||
193 | htcp_param_update(tp); | ||
194 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); | ||
195 | } | ||
196 | |||
197 | static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
198 | u32 in_flight, int data_acked) | ||
199 | { | ||
200 | struct htcp *ca = tcp_ca(tp); | ||
201 | |||
202 | if (in_flight < tp->snd_cwnd) | ||
203 | return; | ||
204 | |||
205 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
206 | /* In "safe" area, increase. */ | ||
207 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
208 | tp->snd_cwnd++; | ||
209 | } else { | ||
210 | measure_rtt(tp); | ||
211 | |||
212 | /* keep track of number of round-trip times since last backoff event */ | ||
213 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { | ||
214 | ca->ccount++; | ||
215 | ca->snd_cwnd_cnt2 = 0; | ||
216 | htcp_alpha_update(ca); | ||
217 | } | ||
218 | |||
219 | /* In dangerous area, increase slowly. | ||
220 | * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd | ||
221 | */ | ||
222 | if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { | ||
223 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
224 | tp->snd_cwnd++; | ||
225 | tp->snd_cwnd_cnt = 0; | ||
226 | ca->ccount++; | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | |||
231 | /* Lower bound on congestion window. */ | ||
232 | static u32 htcp_min_cwnd(struct tcp_sock *tp) | ||
233 | { | ||
234 | return tp->snd_ssthresh; | ||
235 | } | ||
236 | |||
237 | |||
238 | static void htcp_init(struct tcp_sock *tp) | ||
239 | { | ||
240 | struct htcp *ca = tcp_ca(tp); | ||
241 | |||
242 | memset(ca, 0, sizeof(struct htcp)); | ||
243 | ca->alpha = ALPHA_BASE; | ||
244 | ca->beta = BETA_MIN; | ||
245 | } | ||
246 | |||
247 | static void htcp_state(struct tcp_sock *tp, u8 new_state) | ||
248 | { | ||
249 | switch (new_state) { | ||
250 | case TCP_CA_CWR: | ||
251 | case TCP_CA_Recovery: | ||
252 | case TCP_CA_Loss: | ||
253 | htcp_reset(tcp_ca(tp)); | ||
254 | break; | ||
255 | } | ||
256 | } | ||
257 | |||
258 | static struct tcp_congestion_ops htcp = { | ||
259 | .init = htcp_init, | ||
260 | .ssthresh = htcp_recalc_ssthresh, | ||
261 | .min_cwnd = htcp_min_cwnd, | ||
262 | .cong_avoid = htcp_cong_avoid, | ||
263 | .set_state = htcp_state, | ||
264 | .undo_cwnd = htcp_cwnd_undo, | ||
265 | .pkts_acked = measure_achieved_throughput, | ||
266 | .owner = THIS_MODULE, | ||
267 | .name = "htcp", | ||
268 | }; | ||
269 | |||
270 | static int __init htcp_register(void) | ||
271 | { | ||
272 | BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); | ||
273 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); | ||
274 | if (!use_bandwidth_switch) | ||
275 | htcp.pkts_acked = NULL; | ||
276 | return tcp_register_congestion_control(&htcp); | ||
277 | } | ||
278 | |||
279 | static void __exit htcp_unregister(void) | ||
280 | { | ||
281 | tcp_unregister_congestion_control(&htcp); | ||
282 | } | ||
283 | |||
284 | module_init(htcp_register); | ||
285 | module_exit(htcp_unregister); | ||
286 | |||
287 | MODULE_AUTHOR("Baruch Even"); | ||
288 | MODULE_LICENSE("GPL"); | ||
289 | MODULE_DESCRIPTION("H-TCP"); | ||
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 000000000000..13a66342c304 --- /dev/null +++ b/net/ipv4/tcp_hybla.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* | ||
2 | * TCP HYBLA | ||
3 | * | ||
4 | * TCP-HYBLA Congestion control algorithm, based on: | ||
5 | * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement | ||
6 | * for Heterogeneous Networks", | ||
7 | * International Journal on satellite Communications, | ||
8 | * September 2004 | ||
9 | * Daniele Lacamera | ||
10 | * root at danielinux.net | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <net/tcp.h> | ||
16 | |||
17 | /* Tcp Hybla structure. */ | ||
18 | struct hybla { | ||
19 | u8 hybla_en; | ||
20 | u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ | ||
21 | u32 rho; /* Rho parameter, integer part */ | ||
22 | u32 rho2; /* Rho * Rho, integer part */ | ||
23 | u32 rho_3ls; /* Rho parameter, <<3 */ | ||
24 | u32 rho2_7ls; /* Rho^2, <<7 */ | ||
25 | u32 minrtt; /* Minimum smoothed round trip time value seen */ | ||
26 | }; | ||
27 | |||
28 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), | ||
29 | expressed in milliseconds (converted with msecs_to_jiffies() on use) */ | ||
30 | static int rtt0 = 25; | ||
31 | module_param(rtt0, int, 0644); | ||
32 | MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); | ||
33 | |||
34 | |||
35 | /* This is called to refresh values for hybla parameters */ | ||
36 | static inline void hybla_recalc_param (struct tcp_sock *tp) | ||
37 | { | ||
38 | struct hybla *ca = tcp_ca(tp); | ||
39 | |||
40 | ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); | ||
41 | ca->rho = ca->rho_3ls >> 3; | ||
42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | ||
43 | ca->rho2 = ca->rho2_7ls >>7; | ||
44 | } | ||
45 | |||
46 | static void hybla_init(struct tcp_sock *tp) | ||
47 | { | ||
48 | struct hybla *ca = tcp_ca(tp); | ||
49 | |||
50 | ca->rho = 0; | ||
51 | ca->rho2 = 0; | ||
52 | ca->rho_3ls = 0; | ||
53 | ca->rho2_7ls = 0; | ||
54 | ca->snd_cwnd_cents = 0; | ||
55 | ca->hybla_en = 1; | ||
56 | tp->snd_cwnd = 2; | ||
57 | tp->snd_cwnd_clamp = 65535; | ||
58 | |||
59 | /* 1st Rho measurement based on initial srtt */ | ||
60 | hybla_recalc_param(tp); | ||
61 | |||
62 | /* set minimum rtt as this is the 1st ever seen */ | ||
63 | ca->minrtt = tp->srtt; | ||
64 | tp->snd_cwnd = ca->rho; | ||
65 | } | ||
66 | |||
67 | static void hybla_state(struct tcp_sock *tp, u8 ca_state) | ||
68 | { | ||
69 | struct hybla *ca = tcp_ca(tp); | ||
70 | |||
71 | ca->hybla_en = (ca_state == TCP_CA_Open); | ||
72 | } | ||
73 | |||
74 | static inline u32 hybla_fraction(u32 odds) | ||
75 | { | ||
76 | static const u32 fractions[] = { | ||
77 | 128, 139, 152, 165, 181, 197, 215, 234, | ||
78 | }; | ||
79 | |||
80 | return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; | ||
81 | } | ||
82 | |||
83 | /* TCP Hybla main routine. | ||
84 | * This is the algorithm behavior: | ||
85 | * o Recalc Hybla parameters if min_rtt has changed | ||
86 | * o Give cwnd a new value based on the model proposed | ||
87 | * o remember increments <1 | ||
88 | */ | ||
89 | static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
90 | u32 in_flight, int flag) | ||
91 | { | ||
92 | struct hybla *ca = tcp_ca(tp); | ||
93 | u32 increment, odd, rho_fractions; | ||
94 | int is_slowstart = 0; | ||
95 | |||
96 | /* Recalculate rho only if this srtt is the lowest */ | ||
97 | if (tp->srtt < ca->minrtt){ | ||
98 | hybla_recalc_param(tp); | ||
99 | ca->minrtt = tp->srtt; | ||
100 | } | ||
101 | |||
102 | if (!ca->hybla_en) | ||
103 | return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); | ||
104 | |||
105 | if (in_flight < tp->snd_cwnd) | ||
106 | return; | ||
107 | |||
108 | if (ca->rho == 0) | ||
109 | hybla_recalc_param(tp); | ||
110 | |||
111 | rho_fractions = ca->rho_3ls - (ca->rho << 3); | ||
112 | |||
113 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
114 | /* | ||
115 | * slow start | ||
116 | * INC = 2^RHO - 1 | ||
117 | * This is done by splitting the rho parameter | ||
118 | * into 2 parts: an integer part and a fraction part. | ||
119 | * Inrement<<7 is estimated by doing: | ||
120 | * [2^(int+fract)]<<7 | ||
121 | * that is equal to: | ||
122 | * (2^int) * [(2^fract) <<7] | ||
123 | * 2^int is straightly computed as 1<<int, | ||
124 | * while we will use hybla_slowstart_fraction_increment() to | ||
125 | * calculate 2^fract in a <<7 value. | ||
126 | */ | ||
127 | is_slowstart = 1; | ||
128 | increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) | ||
129 | - 128; | ||
130 | } else { | ||
131 | /* | ||
132 | * congestion avoidance | ||
133 | * INC = RHO^2 / W | ||
134 | * as long as increment is estimated as (rho<<7)/window | ||
135 | * it already is <<7 and we can easily count its fractions. | ||
136 | */ | ||
137 | increment = ca->rho2_7ls / tp->snd_cwnd; | ||
138 | if (increment < 128) | ||
139 | tp->snd_cwnd_cnt++; | ||
140 | } | ||
141 | |||
142 | odd = increment % 128; | ||
143 | tp->snd_cwnd += increment >> 7; | ||
144 | ca->snd_cwnd_cents += odd; | ||
145 | |||
146 | /* check when fractions goes >=128 and increase cwnd by 1. */ | ||
147 | while(ca->snd_cwnd_cents >= 128) { | ||
148 | tp->snd_cwnd++; | ||
149 | ca->snd_cwnd_cents -= 128; | ||
150 | tp->snd_cwnd_cnt = 0; | ||
151 | } | ||
152 | |||
153 | /* clamp down slowstart cwnd to ssthresh value. */ | ||
154 | if (is_slowstart) | ||
155 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
156 | |||
157 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
158 | } | ||
159 | |||
160 | static struct tcp_congestion_ops tcp_hybla = { | ||
161 | .init = hybla_init, | ||
162 | .ssthresh = tcp_reno_ssthresh, | ||
163 | .min_cwnd = tcp_reno_min_cwnd, | ||
164 | .cong_avoid = hybla_cong_avoid, | ||
165 | .set_state = hybla_state, | ||
166 | |||
167 | .owner = THIS_MODULE, | ||
168 | .name = "hybla" | ||
169 | }; | ||
170 | |||
171 | static int __init hybla_register(void) | ||
172 | { | ||
173 | BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); | ||
174 | return tcp_register_congestion_control(&tcp_hybla); | ||
175 | } | ||
176 | |||
177 | static void __exit hybla_unregister(void) | ||
178 | { | ||
179 | tcp_unregister_congestion_control(&tcp_hybla); | ||
180 | } | ||
181 | |||
182 | module_init(hybla_register); | ||
183 | module_exit(hybla_unregister); | ||
184 | |||
185 | MODULE_AUTHOR("Daniele Lacamera"); | ||
186 | MODULE_LICENSE("GPL"); | ||
187 | MODULE_DESCRIPTION("TCP Hybla"); | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -61,7 +61,6 @@ | |||
61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission | 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission |
62 | * engine. Lots of bugs are found. | 62 | * engine. Lots of bugs are found. |
63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
64 | * Angelo Dell'Aera: TCP Westwood+ support | ||
65 | */ | 64 | */ |
66 | 65 | ||
67 | #include <linux/config.h> | 66 | #include <linux/config.h> |
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; | |||
88 | int sysctl_tcp_max_orphans = NR_FILE; | 87 | int sysctl_tcp_max_orphans = NR_FILE; |
89 | int sysctl_tcp_frto; | 88 | int sysctl_tcp_frto; |
90 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
91 | int sysctl_tcp_westwood; | ||
92 | int sysctl_tcp_vegas_cong_avoid; | ||
93 | 90 | ||
94 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
95 | 92 | ||
96 | /* Default values of the Vegas variables, in fixed-point representation | ||
97 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
98 | */ | ||
99 | #define V_PARAM_SHIFT 1 | ||
100 | int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; | ||
101 | int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; | ||
102 | int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; | ||
103 | int sysctl_tcp_bic = 1; | ||
104 | int sysctl_tcp_bic_fast_convergence = 1; | ||
105 | int sysctl_tcp_bic_low_window = 14; | ||
106 | int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
107 | |||
108 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
109 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
110 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ | 95 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ |
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
333 | tp->snd_cwnd_stamp = tcp_time_stamp; | 318 | tp->snd_cwnd_stamp = tcp_time_stamp; |
334 | } | 319 | } |
335 | 320 | ||
336 | static void init_bictcp(struct tcp_sock *tp) | ||
337 | { | ||
338 | tp->bictcp.cnt = 0; | ||
339 | |||
340 | tp->bictcp.last_max_cwnd = 0; | ||
341 | tp->bictcp.last_cwnd = 0; | ||
342 | tp->bictcp.last_stamp = 0; | ||
343 | } | ||
344 | |||
345 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
346 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
347 | { | 323 | { |
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
558 | tcp_grow_window(sk, tp, skb); | 534 | tcp_grow_window(sk, tp, skb); |
559 | } | 535 | } |
560 | 536 | ||
561 | /* When starting a new connection, pin down the current choice of | ||
562 | * congestion algorithm. | ||
563 | */ | ||
564 | void tcp_ca_init(struct tcp_sock *tp) | ||
565 | { | ||
566 | if (sysctl_tcp_westwood) | ||
567 | tp->adv_cong = TCP_WESTWOOD; | ||
568 | else if (sysctl_tcp_bic) | ||
569 | tp->adv_cong = TCP_BIC; | ||
570 | else if (sysctl_tcp_vegas_cong_avoid) { | ||
571 | tp->adv_cong = TCP_VEGAS; | ||
572 | tp->vegas.baseRTT = 0x7fffffff; | ||
573 | tcp_vegas_enable(tp); | ||
574 | } | ||
575 | } | ||
576 | |||
577 | /* Do RTT sampling needed for Vegas. | ||
578 | * Basically we: | ||
579 | * o min-filter RTT samples from within an RTT to get the current | ||
580 | * propagation delay + queuing delay (we are min-filtering to try to | ||
581 | * avoid the effects of delayed ACKs) | ||
582 | * o min-filter RTT samples from a much longer window (forever for now) | ||
583 | * to find the propagation delay (baseRTT) | ||
584 | */ | ||
585 | static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | ||
586 | { | ||
587 | __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ | ||
588 | |||
589 | /* Filter to find propagation delay: */ | ||
590 | if (vrtt < tp->vegas.baseRTT) | ||
591 | tp->vegas.baseRTT = vrtt; | ||
592 | |||
593 | /* Find the min RTT during the last RTT to find | ||
594 | * the current prop. delay + queuing delay: | ||
595 | */ | ||
596 | tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); | ||
597 | tp->vegas.cntRTT++; | ||
598 | } | ||
599 | |||
600 | /* Called to compute a smoothed rtt estimate. The data fed to this | 537 | /* Called to compute a smoothed rtt estimate. The data fed to this |
601 | * routine either comes from timestamps, or from segments that were | 538 | * routine either comes from timestamps, or from segments that were |
602 | * known _not_ to have been retransmitted [see Karn/Partridge | 539 | * known _not_ to have been retransmitted [see Karn/Partridge |
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | |||
606 | * To save cycles in the RFC 1323 implementation it was better to break | 543 | * To save cycles in the RFC 1323 implementation it was better to break |
607 | * it up into three procedures. -- erics | 544 | * it up into three procedures. -- erics |
608 | */ | 545 | */ |
609 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) |
610 | { | 547 | { |
611 | long m = mrtt; /* RTT */ | 548 | long m = mrtt; /* RTT */ |
612 | 549 | ||
613 | if (tcp_vegas_enabled(tp)) | ||
614 | vegas_rtt_calc(tp, mrtt); | ||
615 | |||
616 | /* The following amusing code comes from Jacobson's | 550 | /* The following amusing code comes from Jacobson's |
617 | * article in SIGCOMM '88. Note that rtt and mdev | 551 | * article in SIGCOMM '88. Note that rtt and mdev |
618 | * are scaled versions of rtt and mean deviation. | 552 | * are scaled versions of rtt and mean deviation. |
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | |||
670 | tp->rtt_seq = tp->snd_nxt; | 604 | tp->rtt_seq = tp->snd_nxt; |
671 | } | 605 | } |
672 | 606 | ||
673 | tcp_westwood_update_rtt(tp, tp->srtt >> 3); | 607 | if (tp->ca_ops->rtt_sample) |
608 | tp->ca_ops->rtt_sample(tp, *usrtt); | ||
674 | } | 609 | } |
675 | 610 | ||
676 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) | |||
1185 | tp->snd_una == tp->high_seq || | 1120 | tp->snd_una == tp->high_seq || |
1186 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
1187 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1188 | if (!tcp_westwood_ssthresh(tp)) | 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1189 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); |
1190 | } | 1125 | } |
1191 | 1126 | ||
1192 | /* Have to clear retransmission markers here to keep the bookkeeping | 1127 | /* Have to clear retransmission markers here to keep the bookkeeping |
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
1252 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); |
1253 | tp->high_seq = tp->frto_highmark; | 1188 | tp->high_seq = tp->frto_highmark; |
1254 | TCP_ECN_queue_cwr(tp); | 1189 | TCP_ECN_queue_cwr(tp); |
1255 | |||
1256 | init_bictcp(tp); | ||
1257 | } | 1190 | } |
1258 | 1191 | ||
1259 | void tcp_clear_retrans(struct tcp_sock *tp) | 1192 | void tcp_clear_retrans(struct tcp_sock *tp) |
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1283 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
1284 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
1285 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1286 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | ||
1287 | } | 1221 | } |
1288 | tp->snd_cwnd = 1; | 1222 | tp->snd_cwnd = 1; |
1289 | tp->snd_cwnd_cnt = 0; | 1223 | tp->snd_cwnd_cnt = 0; |
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
1596 | } | 1530 | } |
1597 | 1531 | ||
1598 | /* Decrease cwnd each second ack. */ | 1532 | /* Decrease cwnd each second ack. */ |
1599 | |||
1600 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) |
1601 | { | 1534 | { |
1602 | int decr = tp->snd_cwnd_cnt + 1; | 1535 | int decr = tp->snd_cwnd_cnt + 1; |
1603 | __u32 limit; | ||
1604 | |||
1605 | /* | ||
1606 | * TCP Westwood | ||
1607 | * Here limit is evaluated as BWestimation*RTTmin (for obtaining it | ||
1608 | * in packets we use mss_cache). If sysctl_tcp_westwood is off | ||
1609 | * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is | ||
1610 | * still used as usual. It prevents other strange cases in which | ||
1611 | * BWE*RTTmin could assume value 0. It should not happen but... | ||
1612 | */ | ||
1613 | |||
1614 | if (!(limit = tcp_westwood_bw_rttmin(tp))) | ||
1615 | limit = tp->snd_ssthresh/2; | ||
1616 | 1536 | ||
1617 | tp->snd_cwnd_cnt = decr&1; | 1537 | tp->snd_cwnd_cnt = decr&1; |
1618 | decr >>= 1; | 1538 | decr >>= 1; |
1619 | 1539 | ||
1620 | if (decr && tp->snd_cwnd > limit) | 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) |
1621 | tp->snd_cwnd -= decr; | 1541 | tp->snd_cwnd -= decr; |
1622 | 1542 | ||
1623 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
1654 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) |
1655 | { | 1575 | { |
1656 | if (tp->prior_ssthresh) { | 1576 | if (tp->prior_ssthresh) { |
1657 | if (tcp_is_bic(tp)) | 1577 | if (tp->ca_ops->undo_cwnd) |
1658 | tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); | 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); |
1659 | else | 1579 | else |
1660 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
1661 | 1581 | ||
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
1767 | 1687 | ||
1768 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) |
1769 | { | 1689 | { |
1770 | if (tcp_westwood_cwnd(tp)) | 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
1771 | tp->snd_ssthresh = tp->snd_cwnd; | ||
1772 | else | ||
1773 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
1774 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | ||
1775 | } | 1693 | } |
1776 | 1694 | ||
1777 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1946 | if (tp->ca_state < TCP_CA_CWR) { | 1864 | if (tp->ca_state < TCP_CA_CWR) { |
1947 | if (!(flag&FLAG_ECE)) | 1865 | if (!(flag&FLAG_ECE)) |
1948 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1949 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1950 | TCP_ECN_queue_cwr(tp); | 1868 | TCP_ECN_queue_cwr(tp); |
1951 | } | 1869 | } |
1952 | 1870 | ||
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1963 | /* Read draft-ietf-tcplw-high-performance before mucking | 1881 | /* Read draft-ietf-tcplw-high-performance before mucking |
1964 | * with this code. (Superceeds RFC1323) | 1882 | * with this code. (Superceeds RFC1323) |
1965 | */ | 1883 | */ |
1966 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) |
1967 | { | 1885 | { |
1968 | __u32 seq_rtt; | 1886 | __u32 seq_rtt; |
1969 | 1887 | ||
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | |||
1983 | * in window is lost... Voila. --ANK (010210) | 1901 | * in window is lost... Voila. --ANK (010210) |
1984 | */ | 1902 | */ |
1985 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
1986 | tcp_rtt_estimator(tp, seq_rtt); | 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
1987 | tcp_set_rto(tp); | 1905 | tcp_set_rto(tp); |
1988 | tp->backoff = 0; | 1906 | tp->backoff = 0; |
1989 | tcp_bound_rto(tp); | 1907 | tcp_bound_rto(tp); |
1990 | } | 1908 | } |
1991 | 1909 | ||
1992 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) |
1993 | { | 1911 | { |
1994 | /* We don't have a timestamp. Can only use | 1912 | /* We don't have a timestamp. Can only use |
1995 | * packets that are not retransmitted to determine | 1913 | * packets that are not retransmitted to determine |
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | |||
2003 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) |
2004 | return; | 1922 | return; |
2005 | 1923 | ||
2006 | tcp_rtt_estimator(tp, seq_rtt); | 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
2007 | tcp_set_rto(tp); | 1925 | tcp_set_rto(tp); |
2008 | tp->backoff = 0; | 1926 | tp->backoff = 0; |
2009 | tcp_bound_rto(tp); | 1927 | tcp_bound_rto(tp); |
2010 | } | 1928 | } |
2011 | 1929 | ||
2012 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, |
2013 | int flag, s32 seq_rtt) | 1931 | int flag, s32 seq_rtt, u32 *usrtt) |
2014 | { | 1932 | { |
2015 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
2016 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
2017 | tcp_ack_saw_tstamp(tp, flag); | 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); |
2018 | else if (seq_rtt >= 0) | 1936 | else if (seq_rtt >= 0) |
2019 | tcp_ack_no_tstamp(tp, seq_rtt, flag); | 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); |
2020 | } | 1938 | } |
2021 | 1939 | ||
2022 | /* | 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, |
2023 | * Compute congestion window to use. | 1941 | u32 in_flight, int good) |
2024 | * | ||
2025 | * This is from the implementation of BICTCP in | ||
2026 | * Lison-Xu, Kahaled Harfoush, and Injog Rhee. | ||
2027 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
2028 | * Networks" in InfoComm 2004 | ||
2029 | * Available from: | ||
2030 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
2031 | * | ||
2032 | * Unless BIC is enabled and congestion window is large | ||
2033 | * this behaves the same as the original Reno. | ||
2034 | */ | ||
2035 | static inline __u32 bictcp_cwnd(struct tcp_sock *tp) | ||
2036 | { | ||
2037 | /* orignal Reno behaviour */ | ||
2038 | if (!tcp_is_bic(tp)) | ||
2039 | return tp->snd_cwnd; | ||
2040 | |||
2041 | if (tp->bictcp.last_cwnd == tp->snd_cwnd && | ||
2042 | (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) | ||
2043 | return tp->bictcp.cnt; | ||
2044 | |||
2045 | tp->bictcp.last_cwnd = tp->snd_cwnd; | ||
2046 | tp->bictcp.last_stamp = tcp_time_stamp; | ||
2047 | |||
2048 | /* start off normal */ | ||
2049 | if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) | ||
2050 | tp->bictcp.cnt = tp->snd_cwnd; | ||
2051 | |||
2052 | /* binary increase */ | ||
2053 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { | ||
2054 | __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) | ||
2055 | / BICTCP_B; | ||
2056 | |||
2057 | if (dist > BICTCP_MAX_INCREMENT) | ||
2058 | /* linear increase */ | ||
2059 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
2060 | else if (dist <= 1U) | ||
2061 | /* binary search increase */ | ||
2062 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
2063 | / BICTCP_B; | ||
2064 | else | ||
2065 | /* binary search increase */ | ||
2066 | tp->bictcp.cnt = tp->snd_cwnd / dist; | ||
2067 | } else { | ||
2068 | /* slow start amd linear increase */ | ||
2069 | if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) | ||
2070 | /* slow start */ | ||
2071 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
2072 | / BICTCP_B; | ||
2073 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd | ||
2074 | + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) | ||
2075 | /* slow start */ | ||
2076 | tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) | ||
2077 | / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); | ||
2078 | else | ||
2079 | /* linear increase */ | ||
2080 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
2081 | } | ||
2082 | return tp->bictcp.cnt; | ||
2083 | } | ||
2084 | |||
2085 | /* This is Jacobson's slow start and congestion avoidance. | ||
2086 | * SIGCOMM '88, p. 328. | ||
2087 | */ | ||
2088 | static inline void reno_cong_avoid(struct tcp_sock *tp) | ||
2089 | { | 1942 | { |
2090 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); |
2091 | /* In "safe" area, increase. */ | ||
2092 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
2093 | tp->snd_cwnd++; | ||
2094 | } else { | ||
2095 | /* In dangerous area, increase slowly. | ||
2096 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
2097 | */ | ||
2098 | if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { | ||
2099 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
2100 | tp->snd_cwnd++; | ||
2101 | tp->snd_cwnd_cnt=0; | ||
2102 | } else | ||
2103 | tp->snd_cwnd_cnt++; | ||
2104 | } | ||
2105 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2106 | } | 1945 | } |
2107 | 1946 | ||
2108 | /* This is based on the congestion detection/avoidance scheme described in | ||
2109 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
2110 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
2111 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
2112 | * October 1995. Available from: | ||
2113 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
2114 | * | ||
2115 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
2116 | * The main aspects that distinguish this implementation from the | ||
2117 | * Arizona Vegas implementation are: | ||
2118 | * o We do not change the loss detection or recovery mechanisms of | ||
2119 | * Linux in any way. Linux already recovers from losses quite well, | ||
2120 | * using fine-grained timers, NewReno, and FACK. | ||
2121 | * o To avoid the performance penalty imposed by increasing cwnd | ||
2122 | * only every-other RTT during slow start, we increase during | ||
2123 | * every RTT during slow start, just like Reno. | ||
2124 | * o Largely to allow continuous cwnd growth during slow start, | ||
2125 | * we use the rate at which ACKs come back as the "actual" | ||
2126 | * rate, rather than the rate at which data is sent. | ||
2127 | * o To speed convergence to the right rate, we set the cwnd | ||
2128 | * to achieve the right ("actual") rate when we exit slow start. | ||
2129 | * o To filter out the noise caused by delayed ACKs, we use the | ||
2130 | * minimum RTT sample observed during the last RTT to calculate | ||
2131 | * the actual rate. | ||
2132 | * o When the sender re-starts from idle, it waits until it has | ||
2133 | * received ACKs for an entire flight of new data before making | ||
2134 | * a cwnd adjustment decision. The original Vegas implementation | ||
2135 | * assumed senders never went idle. | ||
2136 | */ | ||
2137 | static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
2138 | { | ||
2139 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
2140 | * | ||
2141 | * These are so named because they represent the approximate values | ||
2142 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
2143 | * precisely, they represent the amount of data sent during the RTT. | ||
2144 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
2145 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
2146 | * bytes of data have been ACKed during the course of the RTT, giving | ||
2147 | * an "actual" rate of: | ||
2148 | * | ||
2149 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
2150 | * | ||
2151 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
2152 | * because delayed ACKs can cover more than one segment, so they | ||
2153 | * don't line up nicely with the boundaries of RTTs. | ||
2154 | * | ||
2155 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
2156 | * advance of the left edge of our send window, so that the number | ||
2157 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
2158 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
2159 | */ | ||
2160 | |||
2161 | if (after(ack, tp->vegas.beg_snd_nxt)) { | ||
2162 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
2163 | u32 old_wnd, old_snd_cwnd; | ||
2164 | |||
2165 | |||
2166 | /* Here old_wnd is essentially the window of data that was | ||
2167 | * sent during the previous RTT, and has all | ||
2168 | * been acknowledged in the course of the RTT that ended | ||
2169 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
2170 | * is the cwnd during the previous RTT. | ||
2171 | */ | ||
2172 | old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / | ||
2173 | tp->mss_cache_std; | ||
2174 | old_snd_cwnd = tp->vegas.beg_snd_cwnd; | ||
2175 | |||
2176 | /* Save the extent of the current window so we can use this | ||
2177 | * at the end of the next RTT. | ||
2178 | */ | ||
2179 | tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; | ||
2180 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
2181 | tp->vegas.beg_snd_cwnd = tp->snd_cwnd; | ||
2182 | |||
2183 | /* Take into account the current RTT sample too, to | ||
2184 | * decrease the impact of delayed acks. This double counts | ||
2185 | * this sample since we count it for the next window as well, | ||
2186 | * but that's not too awful, since we're taking the min, | ||
2187 | * rather than averaging. | ||
2188 | */ | ||
2189 | vegas_rtt_calc(tp, seq_rtt); | ||
2190 | |||
2191 | /* We do the Vegas calculations only if we got enough RTT | ||
2192 | * samples that we can be reasonably sure that we got | ||
2193 | * at least one RTT sample that wasn't from a delayed ACK. | ||
2194 | * If we only had 2 samples total, | ||
2195 | * then that means we're getting only 1 ACK per RTT, which | ||
2196 | * means they're almost certainly delayed ACKs. | ||
2197 | * If we have 3 samples, we should be OK. | ||
2198 | */ | ||
2199 | |||
2200 | if (tp->vegas.cntRTT <= 2) { | ||
2201 | /* We don't have enough RTT samples to do the Vegas | ||
2202 | * calculation, so we'll behave like Reno. | ||
2203 | */ | ||
2204 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
2205 | tp->snd_cwnd++; | ||
2206 | } else { | ||
2207 | u32 rtt, target_cwnd, diff; | ||
2208 | |||
2209 | /* We have enough RTT samples, so, using the Vegas | ||
2210 | * algorithm, we determine if we should increase or | ||
2211 | * decrease cwnd, and by how much. | ||
2212 | */ | ||
2213 | |||
2214 | /* Pluck out the RTT we are using for the Vegas | ||
2215 | * calculations. This is the min RTT seen during the | ||
2216 | * last RTT. Taking the min filters out the effects | ||
2217 | * of delayed ACKs, at the cost of noticing congestion | ||
2218 | * a bit later. | ||
2219 | */ | ||
2220 | rtt = tp->vegas.minRTT; | ||
2221 | |||
2222 | /* Calculate the cwnd we should have, if we weren't | ||
2223 | * going too fast. | ||
2224 | * | ||
2225 | * This is: | ||
2226 | * (actual rate in segments) * baseRTT | ||
2227 | * We keep it as a fixed point number with | ||
2228 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
2229 | */ | ||
2230 | target_cwnd = ((old_wnd * tp->vegas.baseRTT) | ||
2231 | << V_PARAM_SHIFT) / rtt; | ||
2232 | |||
2233 | /* Calculate the difference between the window we had, | ||
2234 | * and the window we would like to have. This quantity | ||
2235 | * is the "Diff" from the Arizona Vegas papers. | ||
2236 | * | ||
2237 | * Again, this is a fixed point number with | ||
2238 | * V_PARAM_SHIFT bits to the right of the binary | ||
2239 | * point. | ||
2240 | */ | ||
2241 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
2242 | |||
2243 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
2244 | /* Slow start. */ | ||
2245 | if (diff > sysctl_tcp_vegas_gamma) { | ||
2246 | /* Going too fast. Time to slow down | ||
2247 | * and switch to congestion avoidance. | ||
2248 | */ | ||
2249 | tp->snd_ssthresh = 2; | ||
2250 | |||
2251 | /* Set cwnd to match the actual rate | ||
2252 | * exactly: | ||
2253 | * cwnd = (actual rate) * baseRTT | ||
2254 | * Then we add 1 because the integer | ||
2255 | * truncation robs us of full link | ||
2256 | * utilization. | ||
2257 | */ | ||
2258 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
2259 | (target_cwnd >> | ||
2260 | V_PARAM_SHIFT)+1); | ||
2261 | |||
2262 | } | ||
2263 | } else { | ||
2264 | /* Congestion avoidance. */ | ||
2265 | u32 next_snd_cwnd; | ||
2266 | |||
2267 | /* Figure out where we would like cwnd | ||
2268 | * to be. | ||
2269 | */ | ||
2270 | if (diff > sysctl_tcp_vegas_beta) { | ||
2271 | /* The old window was too fast, so | ||
2272 | * we slow down. | ||
2273 | */ | ||
2274 | next_snd_cwnd = old_snd_cwnd - 1; | ||
2275 | } else if (diff < sysctl_tcp_vegas_alpha) { | ||
2276 | /* We don't have enough extra packets | ||
2277 | * in the network, so speed up. | ||
2278 | */ | ||
2279 | next_snd_cwnd = old_snd_cwnd + 1; | ||
2280 | } else { | ||
2281 | /* Sending just as fast as we | ||
2282 | * should be. | ||
2283 | */ | ||
2284 | next_snd_cwnd = old_snd_cwnd; | ||
2285 | } | ||
2286 | |||
2287 | /* Adjust cwnd upward or downward, toward the | ||
2288 | * desired value. | ||
2289 | */ | ||
2290 | if (next_snd_cwnd > tp->snd_cwnd) | ||
2291 | tp->snd_cwnd++; | ||
2292 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
2293 | tp->snd_cwnd--; | ||
2294 | } | ||
2295 | } | ||
2296 | |||
2297 | /* Wipe the slate clean for the next RTT. */ | ||
2298 | tp->vegas.cntRTT = 0; | ||
2299 | tp->vegas.minRTT = 0x7fffffff; | ||
2300 | } | ||
2301 | |||
2302 | /* The following code is executed for every ack we receive, | ||
2303 | * except for conditions checked in should_advance_cwnd() | ||
2304 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
2305 | * we only execute this code if the ack actually acked some | ||
2306 | * data. | ||
2307 | */ | ||
2308 | |||
2309 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
2310 | * (If we are not in slow start then we are in congestion avoidance, | ||
2311 | * and adjust our congestion window only once per RTT. See the code | ||
2312 | * above.) | ||
2313 | */ | ||
2314 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
2315 | tp->snd_cwnd++; | ||
2316 | |||
2317 | /* to keep cwnd from growing without bound */ | ||
2318 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
2319 | |||
2320 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
2321 | * 2 MSS. | ||
2322 | * | ||
2323 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
2324 | */ | ||
2325 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
2326 | |||
2327 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
2328 | } | ||
2329 | |||
2330 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
2331 | { | ||
2332 | if (tcp_vegas_enabled(tp)) | ||
2333 | vegas_cong_avoid(tp, ack, seq_rtt); | ||
2334 | else | ||
2335 | reno_cong_avoid(tp); | ||
2336 | } | ||
2337 | |||
2338 | /* Restart timer after forward progress on connection. | 1947 | /* Restart timer after forward progress on connection. |
2339 | * RFC2988 recommends to restart timer to now+rto. | 1948 | * RFC2988 recommends to restart timer to now+rto. |
2340 | */ | 1949 | */ |
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
2415 | 2024 | ||
2416 | 2025 | ||
2417 | /* Remove acknowledged frames from the retransmission queue. */ | 2026 | /* Remove acknowledged frames from the retransmission queue. */ |
2418 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | 2027 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) |
2419 | { | 2028 | { |
2420 | struct tcp_sock *tp = tcp_sk(sk); | 2029 | struct tcp_sock *tp = tcp_sk(sk); |
2421 | struct sk_buff *skb; | 2030 | struct sk_buff *skb; |
2422 | __u32 now = tcp_time_stamp; | 2031 | __u32 now = tcp_time_stamp; |
2423 | int acked = 0; | 2032 | int acked = 0; |
2424 | __s32 seq_rtt = -1; | 2033 | __s32 seq_rtt = -1; |
2034 | struct timeval usnow; | ||
2035 | u32 pkts_acked = 0; | ||
2036 | |||
2037 | if (seq_usrtt) | ||
2038 | do_gettimeofday(&usnow); | ||
2425 | 2039 | ||
2426 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2040 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
2427 | skb != sk->sk_send_head) { | 2041 | skb != sk->sk_send_head) { |
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2448 | */ | 2062 | */ |
2449 | if (!(scb->flags & TCPCB_FLAG_SYN)) { | 2063 | if (!(scb->flags & TCPCB_FLAG_SYN)) { |
2450 | acked |= FLAG_DATA_ACKED; | 2064 | acked |= FLAG_DATA_ACKED; |
2065 | ++pkts_acked; | ||
2451 | } else { | 2066 | } else { |
2452 | acked |= FLAG_SYN_ACKED; | 2067 | acked |= FLAG_SYN_ACKED; |
2453 | tp->retrans_stamp = 0; | 2068 | tp->retrans_stamp = 0; |
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2461 | seq_rtt = -1; | 2076 | seq_rtt = -1; |
2462 | } else if (seq_rtt < 0) | 2077 | } else if (seq_rtt < 0) |
2463 | seq_rtt = now - scb->when; | 2078 | seq_rtt = now - scb->when; |
2079 | if (seq_usrtt) | ||
2080 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | ||
2081 | + (usnow.tv_usec - skb->stamp.tv_usec); | ||
2082 | |||
2464 | if (sacked & TCPCB_SACKED_ACKED) | 2083 | if (sacked & TCPCB_SACKED_ACKED) |
2465 | tp->sacked_out -= tcp_skb_pcount(skb); | 2084 | tp->sacked_out -= tcp_skb_pcount(skb); |
2466 | if (sacked & TCPCB_LOST) | 2085 | if (sacked & TCPCB_LOST) |
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2479 | } | 2098 | } |
2480 | 2099 | ||
2481 | if (acked&FLAG_ACKED) { | 2100 | if (acked&FLAG_ACKED) { |
2482 | tcp_ack_update_rtt(tp, acked, seq_rtt); | 2101 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); |
2483 | tcp_ack_packets_out(sk, tp); | 2102 | tcp_ack_packets_out(sk, tp); |
2103 | |||
2104 | if (tp->ca_ops->pkts_acked) | ||
2105 | tp->ca_ops->pkts_acked(tp, pkts_acked); | ||
2484 | } | 2106 | } |
2485 | 2107 | ||
2486 | #if FASTRETRANS_DEBUG > 0 | 2108 | #if FASTRETRANS_DEBUG > 0 |
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
2624 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2246 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
2625 | } | 2247 | } |
2626 | 2248 | ||
2627 | /* | ||
2628 | * TCP Westwood+ | ||
2629 | */ | ||
2630 | |||
2631 | /* | ||
2632 | * @init_westwood | ||
2633 | * This function initializes fields used in TCP Westwood+. We can't | ||
2634 | * get no information about RTTmin at this time so we simply set it to | ||
2635 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
2636 | * since in this way we're sure it will be updated in a consistent | ||
2637 | * way as soon as possible. It will reasonably happen within the first | ||
2638 | * RTT period of the connection lifetime. | ||
2639 | */ | ||
2640 | |||
2641 | static void init_westwood(struct sock *sk) | ||
2642 | { | ||
2643 | struct tcp_sock *tp = tcp_sk(sk); | ||
2644 | |||
2645 | tp->westwood.bw_ns_est = 0; | ||
2646 | tp->westwood.bw_est = 0; | ||
2647 | tp->westwood.accounted = 0; | ||
2648 | tp->westwood.cumul_ack = 0; | ||
2649 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
2650 | tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; | ||
2651 | tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; | ||
2652 | tp->westwood.snd_una = tp->snd_una; | ||
2653 | } | ||
2654 | |||
2655 | /* | ||
2656 | * @westwood_do_filter | ||
2657 | * Low-pass filter. Implemented using constant coeffients. | ||
2658 | */ | ||
2659 | |||
2660 | static inline __u32 westwood_do_filter(__u32 a, __u32 b) | ||
2661 | { | ||
2662 | return (((7 * a) + b) >> 3); | ||
2663 | } | ||
2664 | |||
2665 | static void westwood_filter(struct sock *sk, __u32 delta) | ||
2666 | { | ||
2667 | struct tcp_sock *tp = tcp_sk(sk); | ||
2668 | |||
2669 | tp->westwood.bw_ns_est = | ||
2670 | westwood_do_filter(tp->westwood.bw_ns_est, | ||
2671 | tp->westwood.bk / delta); | ||
2672 | tp->westwood.bw_est = | ||
2673 | westwood_do_filter(tp->westwood.bw_est, | ||
2674 | tp->westwood.bw_ns_est); | ||
2675 | } | ||
2676 | |||
2677 | /* | ||
2678 | * @westwood_update_rttmin | ||
2679 | * It is used to update RTTmin. In this case we MUST NOT use | ||
2680 | * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! | ||
2681 | */ | ||
2682 | |||
2683 | static inline __u32 westwood_update_rttmin(const struct sock *sk) | ||
2684 | { | ||
2685 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2686 | __u32 rttmin = tp->westwood.rtt_min; | ||
2687 | |||
2688 | if (tp->westwood.rtt != 0 && | ||
2689 | (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) | ||
2690 | rttmin = tp->westwood.rtt; | ||
2691 | |||
2692 | return rttmin; | ||
2693 | } | ||
2694 | |||
2695 | /* | ||
2696 | * @westwood_acked | ||
2697 | * Evaluate increases for dk. | ||
2698 | */ | ||
2699 | |||
2700 | static inline __u32 westwood_acked(const struct sock *sk) | ||
2701 | { | ||
2702 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2703 | |||
2704 | return tp->snd_una - tp->westwood.snd_una; | ||
2705 | } | ||
2706 | |||
2707 | /* | ||
2708 | * @westwood_new_window | ||
2709 | * It evaluates if we are receiving data inside the same RTT window as | ||
2710 | * when we started. | ||
2711 | * Return value: | ||
2712 | * It returns 0 if we are still evaluating samples in the same RTT | ||
2713 | * window, 1 if the sample has to be considered in the next window. | ||
2714 | */ | ||
2715 | |||
2716 | static int westwood_new_window(const struct sock *sk) | ||
2717 | { | ||
2718 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2719 | __u32 left_bound; | ||
2720 | __u32 rtt; | ||
2721 | int ret = 0; | ||
2722 | |||
2723 | left_bound = tp->westwood.rtt_win_sx; | ||
2724 | rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); | ||
2725 | |||
2726 | /* | ||
2727 | * A RTT-window has passed. Be careful since if RTT is less than | ||
2728 | * 50ms we don't filter but we continue 'building the sample'. | ||
2729 | * This minimum limit was choosen since an estimation on small | ||
2730 | * time intervals is better to avoid... | ||
2731 | * Obvioulsy on a LAN we reasonably will always have | ||
2732 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
2733 | */ | ||
2734 | |||
2735 | if ((left_bound + rtt) < tcp_time_stamp) | ||
2736 | ret = 1; | ||
2737 | |||
2738 | return ret; | ||
2739 | } | ||
2740 | |||
2741 | /* | ||
2742 | * @westwood_update_window | ||
2743 | * It updates RTT evaluation window if it is the right moment to do | ||
2744 | * it. If so it calls filter for evaluating bandwidth. | ||
2745 | */ | ||
2746 | |||
2747 | static void __westwood_update_window(struct sock *sk, __u32 now) | ||
2748 | { | ||
2749 | struct tcp_sock *tp = tcp_sk(sk); | ||
2750 | __u32 delta = now - tp->westwood.rtt_win_sx; | ||
2751 | |||
2752 | if (delta) { | ||
2753 | if (tp->westwood.rtt) | ||
2754 | westwood_filter(sk, delta); | ||
2755 | |||
2756 | tp->westwood.bk = 0; | ||
2757 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
2758 | } | ||
2759 | } | ||
2760 | |||
2761 | |||
2762 | static void westwood_update_window(struct sock *sk, __u32 now) | ||
2763 | { | ||
2764 | if (westwood_new_window(sk)) | ||
2765 | __westwood_update_window(sk, now); | ||
2766 | } | ||
2767 | |||
2768 | /* | ||
2769 | * @__tcp_westwood_fast_bw | ||
2770 | * It is called when we are in fast path. In particular it is called when | ||
2771 | * header prediction is successfull. In such case infact update is | ||
2772 | * straight forward and doesn't need any particular care. | ||
2773 | */ | ||
2774 | |||
2775 | static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
2776 | { | ||
2777 | struct tcp_sock *tp = tcp_sk(sk); | ||
2778 | |||
2779 | westwood_update_window(sk, tcp_time_stamp); | ||
2780 | |||
2781 | tp->westwood.bk += westwood_acked(sk); | ||
2782 | tp->westwood.snd_una = tp->snd_una; | ||
2783 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
2784 | } | ||
2785 | |||
2786 | static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
2787 | { | ||
2788 | if (tcp_is_westwood(tcp_sk(sk))) | ||
2789 | __tcp_westwood_fast_bw(sk, skb); | ||
2790 | } | ||
2791 | |||
2792 | |||
2793 | /* | ||
2794 | * @westwood_dupack_update | ||
2795 | * It updates accounted and cumul_ack when receiving a dupack. | ||
2796 | */ | ||
2797 | |||
2798 | static void westwood_dupack_update(struct sock *sk) | ||
2799 | { | ||
2800 | struct tcp_sock *tp = tcp_sk(sk); | ||
2801 | |||
2802 | tp->westwood.accounted += tp->mss_cache_std; | ||
2803 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
2804 | } | ||
2805 | |||
2806 | static inline int westwood_may_change_cumul(struct tcp_sock *tp) | ||
2807 | { | ||
2808 | return (tp->westwood.cumul_ack > tp->mss_cache_std); | ||
2809 | } | ||
2810 | |||
2811 | static inline void westwood_partial_update(struct tcp_sock *tp) | ||
2812 | { | ||
2813 | tp->westwood.accounted -= tp->westwood.cumul_ack; | ||
2814 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
2815 | } | ||
2816 | |||
2817 | static inline void westwood_complete_update(struct tcp_sock *tp) | ||
2818 | { | ||
2819 | tp->westwood.cumul_ack -= tp->westwood.accounted; | ||
2820 | tp->westwood.accounted = 0; | ||
2821 | } | ||
2822 | |||
2823 | /* | ||
2824 | * @westwood_acked_count | ||
2825 | * This function evaluates cumul_ack for evaluating dk in case of | ||
2826 | * delayed or partial acks. | ||
2827 | */ | ||
2828 | |||
2829 | static inline __u32 westwood_acked_count(struct sock *sk) | ||
2830 | { | ||
2831 | struct tcp_sock *tp = tcp_sk(sk); | ||
2832 | |||
2833 | tp->westwood.cumul_ack = westwood_acked(sk); | ||
2834 | |||
2835 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
2836 | * tp->snd_una. | ||
2837 | */ | ||
2838 | if (!(tp->westwood.cumul_ack)) | ||
2839 | westwood_dupack_update(sk); | ||
2840 | |||
2841 | if (westwood_may_change_cumul(tp)) { | ||
2842 | /* Partial or delayed ack */ | ||
2843 | if (tp->westwood.accounted >= tp->westwood.cumul_ack) | ||
2844 | westwood_partial_update(tp); | ||
2845 | else | ||
2846 | westwood_complete_update(tp); | ||
2847 | } | ||
2848 | |||
2849 | tp->westwood.snd_una = tp->snd_una; | ||
2850 | |||
2851 | return tp->westwood.cumul_ack; | ||
2852 | } | ||
2853 | |||
2854 | |||
2855 | /* | ||
2856 | * @__tcp_westwood_slow_bw | ||
2857 | * It is called when something is going wrong..even if there could | ||
2858 | * be no problems! Infact a simple delayed packet may trigger a | ||
2859 | * dupack. But we need to be careful in such case. | ||
2860 | */ | ||
2861 | |||
2862 | static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
2863 | { | ||
2864 | struct tcp_sock *tp = tcp_sk(sk); | ||
2865 | |||
2866 | westwood_update_window(sk, tcp_time_stamp); | ||
2867 | |||
2868 | tp->westwood.bk += westwood_acked_count(sk); | ||
2869 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
2870 | } | ||
2871 | |||
2872 | static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
2873 | { | ||
2874 | if (tcp_is_westwood(tcp_sk(sk))) | ||
2875 | __tcp_westwood_slow_bw(sk, skb); | ||
2876 | } | ||
2877 | |||
2878 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2249 | /* This routine deals with incoming acks, but not outgoing ones. */ |
2879 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2250 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
2880 | { | 2251 | { |
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2884 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2255 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
2885 | u32 prior_in_flight; | 2256 | u32 prior_in_flight; |
2886 | s32 seq_rtt; | 2257 | s32 seq_rtt; |
2258 | s32 seq_usrtt = 0; | ||
2887 | int prior_packets; | 2259 | int prior_packets; |
2888 | 2260 | ||
2889 | /* If the ack is newer than sent or older than previous acks | 2261 | /* If the ack is newer than sent or older than previous acks |
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2902 | */ | 2274 | */ |
2903 | tcp_update_wl(tp, ack, ack_seq); | 2275 | tcp_update_wl(tp, ack, ack_seq); |
2904 | tp->snd_una = ack; | 2276 | tp->snd_una = ack; |
2905 | tcp_westwood_fast_bw(sk, skb); | ||
2906 | flag |= FLAG_WIN_UPDATE; | 2277 | flag |= FLAG_WIN_UPDATE; |
2907 | 2278 | ||
2279 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | ||
2280 | |||
2908 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2281 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
2909 | } else { | 2282 | } else { |
2910 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 2283 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2920 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2293 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
2921 | flag |= FLAG_ECE; | 2294 | flag |= FLAG_ECE; |
2922 | 2295 | ||
2923 | tcp_westwood_slow_bw(sk,skb); | 2296 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); |
2924 | } | 2297 | } |
2925 | 2298 | ||
2926 | /* We passed data and got it acked, remove any soft error | 2299 | /* We passed data and got it acked, remove any soft error |
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2935 | prior_in_flight = tcp_packets_in_flight(tp); | 2308 | prior_in_flight = tcp_packets_in_flight(tp); |
2936 | 2309 | ||
2937 | /* See if we can take anything off of the retransmit queue. */ | 2310 | /* See if we can take anything off of the retransmit queue. */ |
2938 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); | 2311 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
2312 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
2939 | 2313 | ||
2940 | if (tp->frto_counter) | 2314 | if (tp->frto_counter) |
2941 | tcp_process_frto(sk, prior_snd_una); | 2315 | tcp_process_frto(sk, prior_snd_una); |
2942 | 2316 | ||
2943 | if (tcp_ack_is_dubious(tp, flag)) { | 2317 | if (tcp_ack_is_dubious(tp, flag)) { |
2944 | /* Advanve CWND, if state allows this. */ | 2318 | /* Advanve CWND, if state allows this. */ |
2945 | if ((flag & FLAG_DATA_ACKED) && | 2319 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) |
2946 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && | 2320 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); |
2947 | tcp_may_raise_cwnd(tp, flag)) | ||
2948 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
2949 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2321 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
2950 | } else { | 2322 | } else { |
2951 | if ((flag & FLAG_DATA_ACKED) && | 2323 | if ((flag & FLAG_DATA_ACKED)) |
2952 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) | 2324 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); |
2953 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
2954 | } | 2325 | } |
2955 | 2326 | ||
2956 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2327 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4552 | 3923 | ||
4553 | tcp_init_metrics(sk); | 3924 | tcp_init_metrics(sk); |
4554 | 3925 | ||
3926 | tcp_init_congestion_control(tp); | ||
3927 | |||
4555 | /* Prevent spurious tcp_cwnd_restart() on first data | 3928 | /* Prevent spurious tcp_cwnd_restart() on first data |
4556 | * packet. | 3929 | * packet. |
4557 | */ | 3930 | */ |
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4708 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4081 | if(tp->af_specific->conn_request(sk, skb) < 0) |
4709 | return 1; | 4082 | return 1; |
4710 | 4083 | ||
4711 | init_westwood(sk); | ||
4712 | init_bictcp(tp); | ||
4713 | |||
4714 | /* Now we have several options: In theory there is | 4084 | /* Now we have several options: In theory there is |
4715 | * nothing else in the frame. KA9Q has an option to | 4085 | * nothing else in the frame. KA9Q has an option to |
4716 | * send data with the syn, BSD accepts data with the | 4086 | * send data with the syn, BSD accepts data with the |
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4732 | goto discard; | 4102 | goto discard; |
4733 | 4103 | ||
4734 | case TCP_SYN_SENT: | 4104 | case TCP_SYN_SENT: |
4735 | init_westwood(sk); | ||
4736 | init_bictcp(tp); | ||
4737 | |||
4738 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); | 4105 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); |
4739 | if (queued >= 0) | 4106 | if (queued >= 0) |
4740 | return queued; | 4107 | return queued; |
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4816 | */ | 4183 | */ |
4817 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4184 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
4818 | !tp->srtt) | 4185 | !tp->srtt) |
4819 | tcp_ack_saw_tstamp(tp, 0); | 4186 | tcp_ack_saw_tstamp(tp, 0, 0); |
4820 | 4187 | ||
4821 | if (tp->rx_opt.tstamp_ok) | 4188 | if (tp->rx_opt.tstamp_ok) |
4822 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4189 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4828 | 4195 | ||
4829 | tcp_init_metrics(sk); | 4196 | tcp_init_metrics(sk); |
4830 | 4197 | ||
4198 | tcp_init_congestion_control(tp); | ||
4199 | |||
4831 | /* Prevent spurious tcp_cwnd_restart() on | 4200 | /* Prevent spurious tcp_cwnd_restart() on |
4832 | * first data packet. | 4201 | * first data packet. |
4833 | */ | 4202 | */ |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad19..ebf112347a97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
2048 | tp->mss_cache_std = tp->mss_cache = 536; | 2048 | tp->mss_cache_std = tp->mss_cache = 536; |
2049 | 2049 | ||
2050 | tp->reordering = sysctl_tcp_reordering; | 2050 | tp->reordering = sysctl_tcp_reordering; |
2051 | tp->ca_ops = &tcp_init_congestion_ops; | ||
2051 | 2052 | ||
2052 | sk->sk_state = TCP_CLOSE; | 2053 | sk->sk_state = TCP_CLOSE; |
2053 | 2054 | ||
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
2070 | 2071 | ||
2071 | tcp_clear_xmit_timers(sk); | 2072 | tcp_clear_xmit_timers(sk); |
2072 | 2073 | ||
2074 | tcp_cleanup_congestion_control(tp); | ||
2075 | |||
2073 | /* Cleanup up the write buffer. */ | 2076 | /* Cleanup up the write buffer. */ |
2074 | sk_stream_writequeue_purge(sk); | 2077 | sk_stream_writequeue_purge(sk); |
2075 | 2078 | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f3..f42a284164b7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
774 | newtp->frto_counter = 0; | 774 | newtp->frto_counter = 0; |
775 | newtp->frto_highmark = 0; | 775 | newtp->frto_highmark = 0; |
776 | 776 | ||
777 | newtp->ca_ops = &tcp_reno; | ||
778 | |||
777 | tcp_set_ca_state(newtp, TCP_CA_Open); | 779 | tcp_set_ca_state(newtp, TCP_CA_Open); |
778 | tcp_init_xmit_timers(newsk); | 780 | tcp_init_xmit_timers(newsk); |
779 | skb_queue_head_init(&newtp->out_of_order_queue); | 781 | skb_queue_head_init(&newtp->out_of_order_queue); |
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
842 | if (newtp->ecn_flags&TCP_ECN_OK) | 844 | if (newtp->ecn_flags&TCP_ECN_OK) |
843 | sock_set_flag(newsk, SOCK_NO_LARGESEND); | 845 | sock_set_flag(newsk, SOCK_NO_LARGESEND); |
844 | 846 | ||
845 | tcp_ca_init(newtp); | ||
846 | |||
847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); | 847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); |
848 | } | 848 | } |
849 | return newsk; | 849 | return newsk; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e337..0e17c244875c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | |||
111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); | 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); |
112 | u32 cwnd = tp->snd_cwnd; | 112 | u32 cwnd = tp->snd_cwnd; |
113 | 113 | ||
114 | if (tcp_is_vegas(tp)) | 114 | tcp_ca_event(tp, CA_EVENT_CWND_RESTART); |
115 | tcp_vegas_enable(tp); | ||
116 | 115 | ||
117 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 116 | tp->snd_ssthresh = tcp_current_ssthresh(tp); |
118 | restart_cwnd = min(restart_cwnd, cwnd); | 117 | restart_cwnd = min(restart_cwnd, cwnd); |
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
280 | #define SYSCTL_FLAG_WSCALE 0x2 | 279 | #define SYSCTL_FLAG_WSCALE 0x2 |
281 | #define SYSCTL_FLAG_SACK 0x4 | 280 | #define SYSCTL_FLAG_SACK 0x4 |
282 | 281 | ||
282 | /* If congestion control is doing timestamping */ | ||
283 | if (tp->ca_ops->rtt_sample) | ||
284 | do_gettimeofday(&skb->stamp); | ||
285 | |||
283 | sysctl_flags = 0; | 286 | sysctl_flags = 0; |
284 | if (tcb->flags & TCPCB_FLAG_SYN) { | 287 | if (tcb->flags & TCPCB_FLAG_SYN) { |
285 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; | 288 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; |
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
304 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); | 307 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); |
305 | } | 308 | } |
306 | 309 | ||
307 | /* | 310 | if (tcp_packets_in_flight(tp) == 0) |
308 | * If the connection is idle and we are restarting, | 311 | tcp_ca_event(tp, CA_EVENT_TX_START); |
309 | * then we don't want to do any Vegas calculations | ||
310 | * until we get fresh RTT samples. So when we | ||
311 | * restart, we reset our Vegas state to a clean | ||
312 | * slate. After we get acks for this flight of | ||
313 | * packets, _then_ we can make Vegas calculations | ||
314 | * again. | ||
315 | */ | ||
316 | if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) | ||
317 | tcp_vegas_enable(tp); | ||
318 | 312 | ||
319 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | 313 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); |
320 | skb->h.th = th; | 314 | skb->h.th = th; |
@@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) | |||
521 | * skbs, which it never sent before. --ANK | 515 | * skbs, which it never sent before. --ANK |
522 | */ | 516 | */ |
523 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | 517 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
518 | buff->stamp = skb->stamp; | ||
524 | 519 | ||
525 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { | 520 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { |
526 | tp->lost_out -= tcp_skb_pcount(skb); | 521 | tp->lost_out -= tcp_skb_pcount(skb); |
@@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) | |||
1449 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 1444 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
1450 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1445 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
1451 | tcp_initialize_rcv_mss(sk); | 1446 | tcp_initialize_rcv_mss(sk); |
1452 | tcp_ca_init(tp); | ||
1453 | 1447 | ||
1454 | tcp_select_initial_window(tcp_full_space(sk), | 1448 | tcp_select_initial_window(tcp_full_space(sk), |
1455 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 1449 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
@@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) | |||
1503 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; | 1497 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; |
1504 | tp->snd_nxt = tp->write_seq; | 1498 | tp->snd_nxt = tp->write_seq; |
1505 | tp->pushed_seq = tp->write_seq; | 1499 | tp->pushed_seq = tp->write_seq; |
1506 | tcp_ca_init(tp); | ||
1507 | 1500 | ||
1508 | /* Send it off. */ | 1501 | /* Send it off. */ |
1509 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 1502 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 000000000000..70e108e15c71 --- /dev/null +++ b/net/ipv4/tcp_scalable.c | |||
@@ -0,0 +1,68 @@ | |||
1 | /* Tom Kelly's Scalable TCP | ||
2 | * | ||
3 | * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | ||
4 | * | ||
5 | * John Heffner <jheffner@sc.edu> | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <net/tcp.h> | ||
11 | |||
12 | /* These factors derived from the recommended values in the paper: | ||
13 | * .01 and 7/8. We use 50 instead of 100 to account for | ||
14 | * delayed ack. | ||
15 | */ | ||
16 | #define TCP_SCALABLE_AI_CNT 50U | ||
17 | #define TCP_SCALABLE_MD_SCALE 3 | ||
18 | |||
19 | static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
20 | u32 in_flight, int flag) | ||
21 | { | ||
22 | if (in_flight < tp->snd_cwnd) | ||
23 | return; | ||
24 | |||
25 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
26 | tp->snd_cwnd++; | ||
27 | } else { | ||
28 | tp->snd_cwnd_cnt++; | ||
29 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | ||
30 | tp->snd_cwnd++; | ||
31 | tp->snd_cwnd_cnt = 0; | ||
32 | } | ||
33 | } | ||
34 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
35 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
36 | } | ||
37 | |||
38 | static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) | ||
39 | { | ||
40 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); | ||
41 | } | ||
42 | |||
43 | |||
44 | static struct tcp_congestion_ops tcp_scalable = { | ||
45 | .ssthresh = tcp_scalable_ssthresh, | ||
46 | .cong_avoid = tcp_scalable_cong_avoid, | ||
47 | .min_cwnd = tcp_reno_min_cwnd, | ||
48 | |||
49 | .owner = THIS_MODULE, | ||
50 | .name = "scalable", | ||
51 | }; | ||
52 | |||
53 | static int __init tcp_scalable_register(void) | ||
54 | { | ||
55 | return tcp_register_congestion_control(&tcp_scalable); | ||
56 | } | ||
57 | |||
58 | static void __exit tcp_scalable_unregister(void) | ||
59 | { | ||
60 | tcp_unregister_congestion_control(&tcp_scalable); | ||
61 | } | ||
62 | |||
63 | module_init(tcp_scalable_register); | ||
64 | module_exit(tcp_scalable_unregister); | ||
65 | |||
66 | MODULE_AUTHOR("John Heffner"); | ||
67 | MODULE_LICENSE("GPL"); | ||
68 | MODULE_DESCRIPTION("Scalable TCP"); | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 000000000000..9bd443db5193 --- /dev/null +++ b/net/ipv4/tcp_vegas.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * TCP Vegas congestion control | ||
3 | * | ||
4 | * This is based on the congestion detection/avoidance scheme described in | ||
5 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
6 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
7 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
8 | * October 1995. Available from: | ||
9 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
10 | * | ||
11 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
12 | * The main aspects that distinguish this implementation from the | ||
13 | * Arizona Vegas implementation are: | ||
14 | * o We do not change the loss detection or recovery mechanisms of | ||
15 | * Linux in any way. Linux already recovers from losses quite well, | ||
16 | * using fine-grained timers, NewReno, and FACK. | ||
17 | * o To avoid the performance penalty imposed by increasing cwnd | ||
18 | * only every-other RTT during slow start, we increase during | ||
19 | * every RTT during slow start, just like Reno. | ||
20 | * o Largely to allow continuous cwnd growth during slow start, | ||
21 | * we use the rate at which ACKs come back as the "actual" | ||
22 | * rate, rather than the rate at which data is sent. | ||
23 | * o To speed convergence to the right rate, we set the cwnd | ||
24 | * to achieve the right ("actual") rate when we exit slow start. | ||
25 | * o To filter out the noise caused by delayed ACKs, we use the | ||
26 | * minimum RTT sample observed during the last RTT to calculate | ||
27 | * the actual rate. | ||
28 | * o When the sender re-starts from idle, it waits until it has | ||
29 | * received ACKs for an entire flight of new data before making | ||
30 | * a cwnd adjustment decision. The original Vegas implementation | ||
31 | * assumed senders never went idle. | ||
32 | */ | ||
33 | |||
34 | #include <linux/config.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/skbuff.h> | ||
38 | #include <linux/tcp_diag.h> | ||
39 | |||
40 | #include <net/tcp.h> | ||
41 | |||
42 | /* Default values of the Vegas variables, in fixed-point representation | ||
43 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
44 | */ | ||
45 | #define V_PARAM_SHIFT 1 | ||
46 | static int alpha = 1<<V_PARAM_SHIFT; | ||
47 | static int beta = 3<<V_PARAM_SHIFT; | ||
48 | static int gamma = 1<<V_PARAM_SHIFT; | ||
49 | |||
50 | module_param(alpha, int, 0644); | ||
51 | MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)"); | ||
52 | module_param(beta, int, 0644); | ||
53 | MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)"); | ||
54 | module_param(gamma, int, 0644); | ||
55 | MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); | ||
56 | |||
57 | |||
58 | /* Vegas variables */ | ||
59 | struct vegas { | ||
60 | u32 beg_snd_nxt; /* right edge during last RTT */ | ||
61 | u32 beg_snd_una; /* left edge during last RTT */ | ||
62 | u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
63 | u8 doing_vegas_now;/* if true, do vegas for this RTT */ | ||
64 | u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
65 | u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
66 | u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
67 | }; | ||
68 | |||
69 | /* There are several situations when we must "re-start" Vegas: | ||
70 | * | ||
71 | * o when a connection is established | ||
72 | * o after an RTO | ||
73 | * o after fast recovery | ||
74 | * o when we send a packet and there is no outstanding | ||
75 | * unacknowledged data (restarting an idle connection) | ||
76 | * | ||
77 | * In these circumstances we cannot do a Vegas calculation at the | ||
78 | * end of the first RTT, because any calculation we do is using | ||
79 | * stale info -- both the saved cwnd and congestion feedback are | ||
80 | * stale. | ||
81 | * | ||
82 | * Instead we must wait until the completion of an RTT during | ||
83 | * which we actually receive ACKs. | ||
84 | */ | ||
85 | static inline void vegas_enable(struct tcp_sock *tp) | ||
86 | { | ||
87 | struct vegas *vegas = tcp_ca(tp); | ||
88 | |||
89 | /* Begin taking Vegas samples next time we send something. */ | ||
90 | vegas->doing_vegas_now = 1; | ||
91 | |||
92 | /* Set the beginning of the next send window. */ | ||
93 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
94 | |||
95 | vegas->cntRTT = 0; | ||
96 | vegas->minRTT = 0x7fffffff; | ||
97 | } | ||
98 | |||
99 | /* Stop taking Vegas samples for now. */ | ||
100 | static inline void vegas_disable(struct tcp_sock *tp) | ||
101 | { | ||
102 | struct vegas *vegas = tcp_ca(tp); | ||
103 | |||
104 | vegas->doing_vegas_now = 0; | ||
105 | } | ||
106 | |||
107 | static void tcp_vegas_init(struct tcp_sock *tp) | ||
108 | { | ||
109 | struct vegas *vegas = tcp_ca(tp); | ||
110 | |||
111 | vegas->baseRTT = 0x7fffffff; | ||
112 | vegas_enable(tp); | ||
113 | } | ||
114 | |||
115 | /* Do RTT sampling needed for Vegas. | ||
116 | * Basically we: | ||
117 | * o min-filter RTT samples from within an RTT to get the current | ||
118 | * propagation delay + queuing delay (we are min-filtering to try to | ||
119 | * avoid the effects of delayed ACKs) | ||
120 | * o min-filter RTT samples from a much longer window (forever for now) | ||
121 | * to find the propagation delay (baseRTT) | ||
122 | */ | ||
123 | static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | ||
124 | { | ||
125 | struct vegas *vegas = tcp_ca(tp); | ||
126 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | ||
127 | |||
128 | /* Filter to find propagation delay: */ | ||
129 | if (vrtt < vegas->baseRTT) | ||
130 | vegas->baseRTT = vrtt; | ||
131 | |||
132 | /* Find the min RTT during the last RTT to find | ||
133 | * the current prop. delay + queuing delay: | ||
134 | */ | ||
135 | vegas->minRTT = min(vegas->minRTT, vrtt); | ||
136 | vegas->cntRTT++; | ||
137 | } | ||
138 | |||
139 | static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | ||
140 | { | ||
141 | |||
142 | if (ca_state == TCP_CA_Open) | ||
143 | vegas_enable(tp); | ||
144 | else | ||
145 | vegas_disable(tp); | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * If the connection is idle and we are restarting, | ||
150 | * then we don't want to do any Vegas calculations | ||
151 | * until we get fresh RTT samples. So when we | ||
152 | * restart, we reset our Vegas state to a clean | ||
153 | * slate. After we get acks for this flight of | ||
154 | * packets, _then_ we can make Vegas calculations | ||
155 | * again. | ||
156 | */ | ||
157 | static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
158 | { | ||
159 | if (event == CA_EVENT_CWND_RESTART || | ||
160 | event == CA_EVENT_TX_START) | ||
161 | tcp_vegas_init(tp); | ||
162 | } | ||
163 | |||
164 | static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
165 | u32 seq_rtt, u32 in_flight, int flag) | ||
166 | { | ||
167 | struct vegas *vegas = tcp_ca(tp); | ||
168 | |||
169 | if (!vegas->doing_vegas_now) | ||
170 | return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); | ||
171 | |||
172 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
173 | * | ||
174 | * These are so named because they represent the approximate values | ||
175 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
176 | * precisely, they represent the amount of data sent during the RTT. | ||
177 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
178 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
179 | * bytes of data have been ACKed during the course of the RTT, giving | ||
180 | * an "actual" rate of: | ||
181 | * | ||
182 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
183 | * | ||
184 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
185 | * because delayed ACKs can cover more than one segment, so they | ||
186 | * don't line up nicely with the boundaries of RTTs. | ||
187 | * | ||
188 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
189 | * advance of the left edge of our send window, so that the number | ||
190 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
191 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
192 | */ | ||
193 | |||
194 | if (after(ack, vegas->beg_snd_nxt)) { | ||
195 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
196 | u32 old_wnd, old_snd_cwnd; | ||
197 | |||
198 | |||
199 | /* Here old_wnd is essentially the window of data that was | ||
200 | * sent during the previous RTT, and has all | ||
201 | * been acknowledged in the course of the RTT that ended | ||
202 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
203 | * is the cwnd during the previous RTT. | ||
204 | */ | ||
205 | old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / | ||
206 | tp->mss_cache; | ||
207 | old_snd_cwnd = vegas->beg_snd_cwnd; | ||
208 | |||
209 | /* Save the extent of the current window so we can use this | ||
210 | * at the end of the next RTT. | ||
211 | */ | ||
212 | vegas->beg_snd_una = vegas->beg_snd_nxt; | ||
213 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
214 | vegas->beg_snd_cwnd = tp->snd_cwnd; | ||
215 | |||
216 | /* Take into account the current RTT sample too, to | ||
217 | * decrease the impact of delayed acks. This double counts | ||
218 | * this sample since we count it for the next window as well, | ||
219 | * but that's not too awful, since we're taking the min, | ||
220 | * rather than averaging. | ||
221 | */ | ||
222 | tcp_vegas_rtt_calc(tp, seq_rtt*1000); | ||
223 | |||
224 | /* We do the Vegas calculations only if we got enough RTT | ||
225 | * samples that we can be reasonably sure that we got | ||
226 | * at least one RTT sample that wasn't from a delayed ACK. | ||
227 | * If we only had 2 samples total, | ||
228 | * then that means we're getting only 1 ACK per RTT, which | ||
229 | * means they're almost certainly delayed ACKs. | ||
230 | * If we have 3 samples, we should be OK. | ||
231 | */ | ||
232 | |||
233 | if (vegas->cntRTT <= 2) { | ||
234 | /* We don't have enough RTT samples to do the Vegas | ||
235 | * calculation, so we'll behave like Reno. | ||
236 | */ | ||
237 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
238 | tp->snd_cwnd++; | ||
239 | } else { | ||
240 | u32 rtt, target_cwnd, diff; | ||
241 | |||
242 | /* We have enough RTT samples, so, using the Vegas | ||
243 | * algorithm, we determine if we should increase or | ||
244 | * decrease cwnd, and by how much. | ||
245 | */ | ||
246 | |||
247 | /* Pluck out the RTT we are using for the Vegas | ||
248 | * calculations. This is the min RTT seen during the | ||
249 | * last RTT. Taking the min filters out the effects | ||
250 | * of delayed ACKs, at the cost of noticing congestion | ||
251 | * a bit later. | ||
252 | */ | ||
253 | rtt = vegas->minRTT; | ||
254 | |||
255 | /* Calculate the cwnd we should have, if we weren't | ||
256 | * going too fast. | ||
257 | * | ||
258 | * This is: | ||
259 | * (actual rate in segments) * baseRTT | ||
260 | * We keep it as a fixed point number with | ||
261 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
262 | */ | ||
263 | target_cwnd = ((old_wnd * vegas->baseRTT) | ||
264 | << V_PARAM_SHIFT) / rtt; | ||
265 | |||
266 | /* Calculate the difference between the window we had, | ||
267 | * and the window we would like to have. This quantity | ||
268 | * is the "Diff" from the Arizona Vegas papers. | ||
269 | * | ||
270 | * Again, this is a fixed point number with | ||
271 | * V_PARAM_SHIFT bits to the right of the binary | ||
272 | * point. | ||
273 | */ | ||
274 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
275 | |||
276 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
277 | /* Slow start. */ | ||
278 | if (diff > gamma) { | ||
279 | /* Going too fast. Time to slow down | ||
280 | * and switch to congestion avoidance. | ||
281 | */ | ||
282 | tp->snd_ssthresh = 2; | ||
283 | |||
284 | /* Set cwnd to match the actual rate | ||
285 | * exactly: | ||
286 | * cwnd = (actual rate) * baseRTT | ||
287 | * Then we add 1 because the integer | ||
288 | * truncation robs us of full link | ||
289 | * utilization. | ||
290 | */ | ||
291 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
292 | (target_cwnd >> | ||
293 | V_PARAM_SHIFT)+1); | ||
294 | |||
295 | } | ||
296 | } else { | ||
297 | /* Congestion avoidance. */ | ||
298 | u32 next_snd_cwnd; | ||
299 | |||
300 | /* Figure out where we would like cwnd | ||
301 | * to be. | ||
302 | */ | ||
303 | if (diff > beta) { | ||
304 | /* The old window was too fast, so | ||
305 | * we slow down. | ||
306 | */ | ||
307 | next_snd_cwnd = old_snd_cwnd - 1; | ||
308 | } else if (diff < alpha) { | ||
309 | /* We don't have enough extra packets | ||
310 | * in the network, so speed up. | ||
311 | */ | ||
312 | next_snd_cwnd = old_snd_cwnd + 1; | ||
313 | } else { | ||
314 | /* Sending just as fast as we | ||
315 | * should be. | ||
316 | */ | ||
317 | next_snd_cwnd = old_snd_cwnd; | ||
318 | } | ||
319 | |||
320 | /* Adjust cwnd upward or downward, toward the | ||
321 | * desired value. | ||
322 | */ | ||
323 | if (next_snd_cwnd > tp->snd_cwnd) | ||
324 | tp->snd_cwnd++; | ||
325 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
326 | tp->snd_cwnd--; | ||
327 | } | ||
328 | } | ||
329 | |||
330 | /* Wipe the slate clean for the next RTT. */ | ||
331 | vegas->cntRTT = 0; | ||
332 | vegas->minRTT = 0x7fffffff; | ||
333 | } | ||
334 | |||
335 | /* The following code is executed for every ack we receive, | ||
336 | * except for conditions checked in should_advance_cwnd() | ||
337 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
338 | * we only execute this code if the ack actually acked some | ||
339 | * data. | ||
340 | */ | ||
341 | |||
342 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
343 | * (If we are not in slow start then we are in congestion avoidance, | ||
344 | * and adjust our congestion window only once per RTT. See the code | ||
345 | * above.) | ||
346 | */ | ||
347 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
348 | tp->snd_cwnd++; | ||
349 | |||
350 | /* to keep cwnd from growing without bound */ | ||
351 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
352 | |||
353 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
354 | * 2 MSS. | ||
355 | * | ||
356 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
357 | */ | ||
358 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
359 | } | ||
360 | |||
361 | /* Extract info for Tcp socket info provided via netlink. */ | ||
362 | static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, | ||
363 | struct sk_buff *skb) | ||
364 | { | ||
365 | const struct vegas *ca = tcp_ca(tp); | ||
366 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | ||
367 | struct tcpvegas_info *info; | ||
368 | |||
369 | info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, | ||
370 | sizeof(*info))); | ||
371 | |||
372 | info->tcpv_enabled = ca->doing_vegas_now; | ||
373 | info->tcpv_rttcnt = ca->cntRTT; | ||
374 | info->tcpv_rtt = ca->baseRTT; | ||
375 | info->tcpv_minrtt = ca->minRTT; | ||
376 | rtattr_failure: ; | ||
377 | } | ||
378 | } | ||
379 | |||
380 | static struct tcp_congestion_ops tcp_vegas = { | ||
381 | .init = tcp_vegas_init, | ||
382 | .ssthresh = tcp_reno_ssthresh, | ||
383 | .cong_avoid = tcp_vegas_cong_avoid, | ||
384 | .min_cwnd = tcp_reno_min_cwnd, | ||
385 | .rtt_sample = tcp_vegas_rtt_calc, | ||
386 | .set_state = tcp_vegas_state, | ||
387 | .cwnd_event = tcp_vegas_cwnd_event, | ||
388 | .get_info = tcp_vegas_get_info, | ||
389 | |||
390 | .owner = THIS_MODULE, | ||
391 | .name = "vegas", | ||
392 | }; | ||
393 | |||
394 | static int __init tcp_vegas_register(void) | ||
395 | { | ||
396 | BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); | ||
397 | tcp_register_congestion_control(&tcp_vegas); | ||
398 | return 0; | ||
399 | } | ||
400 | |||
/* Module exit: remove Vegas from the congestion-control framework. */
static void __exit tcp_vegas_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_vegas);
}
405 | |||
/* Wire registration/unregistration to module load and unload. */
module_init(tcp_vegas_register);
module_exit(tcp_vegas_unregister);

/* Module metadata. */
MODULE_AUTHOR("Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 000000000000..ef827242c940 --- /dev/null +++ b/net/ipv4/tcp_westwood.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * TCP Westwood+ | ||
3 | * | ||
4 | * Angelo Dell'Aera: TCP Westwood+ support | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/tcp_diag.h> | ||
12 | #include <net/tcp.h> | ||
13 | |||
/*
 * TCP Westwood structure: per-connection private state, stored in the
 * socket's congestion-control area (its size is checked against
 * TCP_CA_PRIV_SIZE at registration).
 */
struct westwood {
	u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
	u32 bw_est; /* bandwidth estimate */
	u32 rtt_win_sx; /* here starts a new evaluation... */
	u32 bk; /* bytes acked so far in the current evaluation window */
	u32 snd_una; /* used for evaluating the number of acked bytes */
	u32 cumul_ack; /* bytes credited for the most recent slow-path ack */
	u32 accounted; /* bytes already credited for dupacks, deducted later */
	u32 rtt; /* latest smoothed RTT (tp->srtt >> 3), in jiffies */
	u32 rtt_min; /* minimum observed RTT */
};
26 | |||
27 | |||
/* TCP Westwood functions and constants */
/* Shortest bandwidth evaluation window, in jiffies. */
#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
/* Initial value for rtt and rtt_min, in jiffies (20 seconds). */
#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31 | |||
32 | /* | ||
33 | * @tcp_westwood_create | ||
34 | * This function initializes fields used in TCP Westwood+, | ||
35 | * it is called after the initial SYN, so the sequence numbers | ||
36 | * are correct but new passive connections we have no | ||
37 | * information about RTTmin at this time so we simply set it to | ||
38 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
39 | * since in this way we're sure it will be updated in a consistent | ||
40 | * way as soon as possible. It will reasonably happen within the first | ||
41 | * RTT period of the connection lifetime. | ||
42 | */ | ||
43 | static void tcp_westwood_init(struct tcp_sock *tp) | ||
44 | { | ||
45 | struct westwood *w = tcp_ca(tp); | ||
46 | |||
47 | w->bk = 0; | ||
48 | w->bw_ns_est = 0; | ||
49 | w->bw_est = 0; | ||
50 | w->accounted = 0; | ||
51 | w->cumul_ack = 0; | ||
52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | ||
53 | w->rtt_win_sx = tcp_time_stamp; | ||
54 | w->snd_una = tp->snd_una; | ||
55 | } | ||
56 | |||
/*
 * @westwood_do_filter
 * Constant-coefficient low-pass filter: the result is 7/8 of the
 * previous value @a plus 1/8 of the new sample @b, in unsigned
 * 32-bit arithmetic with truncation.
 */
static inline u32 westwood_do_filter(u32 a, u32 b)
{
	u32 weighted = 7 * a + b;

	return weighted / 8;
}
65 | |||
66 | static inline void westwood_filter(struct westwood *w, u32 delta) | ||
67 | { | ||
68 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | ||
69 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * @westwood_pkts_acked | ||
74 | * Called after processing group of packets. | ||
75 | * but all westwood needs is the last sample of srtt. | ||
76 | */ | ||
77 | static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | ||
78 | { | ||
79 | struct westwood *w = tcp_ca(tp); | ||
80 | if (cnt > 0) | ||
81 | w->rtt = tp->srtt >> 3; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * @westwood_update_window | ||
86 | * It updates RTT evaluation window if it is the right moment to do | ||
87 | * it. If so it calls filter for evaluating bandwidth. | ||
88 | */ | ||
89 | static void westwood_update_window(struct tcp_sock *tp) | ||
90 | { | ||
91 | struct westwood *w = tcp_ca(tp); | ||
92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | ||
93 | |||
94 | /* | ||
95 | * See if a RTT-window has passed. | ||
96 | * Be careful since if RTT is less than | ||
97 | * 50ms we don't filter but we continue 'building the sample'. | ||
98 | * This minimum limit was chosen since an estimation on small | ||
99 | * time intervals is better to avoid... | ||
100 | * Obviously on a LAN we reasonably will always have | ||
101 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
102 | */ | ||
103 | if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { | ||
104 | westwood_filter(w, delta); | ||
105 | |||
106 | w->bk = 0; | ||
107 | w->rtt_win_sx = tcp_time_stamp; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * @westwood_fast_bw | ||
113 | * It is called when we are in fast path. In particular it is called when | ||
114 | * header prediction is successful. In such case in fact update is | ||
115 | * straight forward and doesn't need any particular care. | ||
116 | */ | ||
117 | static inline void westwood_fast_bw(struct tcp_sock *tp) | ||
118 | { | ||
119 | struct westwood *w = tcp_ca(tp); | ||
120 | |||
121 | westwood_update_window(tp); | ||
122 | |||
123 | w->bk += tp->snd_una - w->snd_una; | ||
124 | w->snd_una = tp->snd_una; | ||
125 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * @westwood_acked_count | ||
130 | * This function evaluates cumul_ack for evaluating bk in case of | ||
131 | * delayed or partial acks. | ||
132 | */ | ||
133 | static inline u32 westwood_acked_count(struct tcp_sock *tp) | ||
134 | { | ||
135 | struct westwood *w = tcp_ca(tp); | ||
136 | |||
137 | w->cumul_ack = tp->snd_una - w->snd_una; | ||
138 | |||
139 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
140 | * tp->snd_una. | ||
141 | */ | ||
142 | if (!w->cumul_ack) { | ||
143 | w->accounted += tp->mss_cache; | ||
144 | w->cumul_ack = tp->mss_cache; | ||
145 | } | ||
146 | |||
147 | if (w->cumul_ack > tp->mss_cache) { | ||
148 | /* Partial or delayed ack */ | ||
149 | if (w->accounted >= w->cumul_ack) { | ||
150 | w->accounted -= w->cumul_ack; | ||
151 | w->cumul_ack = tp->mss_cache; | ||
152 | } else { | ||
153 | w->cumul_ack -= w->accounted; | ||
154 | w->accounted = 0; | ||
155 | } | ||
156 | } | ||
157 | |||
158 | w->snd_una = tp->snd_una; | ||
159 | |||
160 | return w->cumul_ack; | ||
161 | } | ||
162 | |||
163 | static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | ||
164 | { | ||
165 | struct westwood *w = tcp_ca(tp); | ||
166 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
167 | } | ||
168 | |||
/*
 * TCP Westwood
 * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
 * in packets we use mss_cache). The value returned by
 * westwood_bw_rttmin() is clamped to be >= 2, so this never
 * returns 0.
 */
static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
{
	return westwood_bw_rttmin(tp);
}
179 | |||
180 | static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
181 | { | ||
182 | struct westwood *w = tcp_ca(tp); | ||
183 | |||
184 | switch(event) { | ||
185 | case CA_EVENT_FAST_ACK: | ||
186 | westwood_fast_bw(tp); | ||
187 | break; | ||
188 | |||
189 | case CA_EVENT_COMPLETE_CWR: | ||
190 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); | ||
191 | break; | ||
192 | |||
193 | case CA_EVENT_FRTO: | ||
194 | tp->snd_ssthresh = westwood_bw_rttmin(tp); | ||
195 | break; | ||
196 | |||
197 | case CA_EVENT_SLOW_ACK: | ||
198 | westwood_update_window(tp); | ||
199 | w->bk += westwood_acked_count(tp); | ||
200 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
201 | break; | ||
202 | |||
203 | default: | ||
204 | /* don't care */ | ||
205 | break; | ||
206 | } | ||
207 | } | ||
208 | |||
209 | |||
210 | /* Extract info for Tcp socket info provided via netlink. */ | ||
211 | static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, | ||
212 | struct sk_buff *skb) | ||
213 | { | ||
214 | const struct westwood *ca = tcp_ca(tp); | ||
215 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | ||
216 | struct rtattr *rta; | ||
217 | struct tcpvegas_info *info; | ||
218 | |||
219 | rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); | ||
220 | info = RTA_DATA(rta); | ||
221 | info->tcpv_enabled = 1; | ||
222 | info->tcpv_rttcnt = 0; | ||
223 | info->tcpv_rtt = jiffies_to_usecs(ca->rtt); | ||
224 | info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); | ||
225 | rtattr_failure: ; | ||
226 | } | ||
227 | } | ||
228 | |||
229 | |||
230 | static struct tcp_congestion_ops tcp_westwood = { | ||
231 | .init = tcp_westwood_init, | ||
232 | .ssthresh = tcp_reno_ssthresh, | ||
233 | .cong_avoid = tcp_reno_cong_avoid, | ||
234 | .min_cwnd = tcp_westwood_cwnd_min, | ||
235 | .cwnd_event = tcp_westwood_event, | ||
236 | .get_info = tcp_westwood_info, | ||
237 | .pkts_acked = tcp_westwood_pkts_acked, | ||
238 | |||
239 | .owner = THIS_MODULE, | ||
240 | .name = "westwood" | ||
241 | }; | ||
242 | |||
/*
 * Module init: register Westwood+ with the congestion-control
 * framework and return the registration result.  The private state
 * must fit in the per-socket congestion-control area, hence the check.
 */
static int __init tcp_westwood_register(void)
{
	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_westwood);
}
248 | |||
/* Module exit: remove Westwood+ from the congestion-control framework. */
static void __exit tcp_westwood_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_westwood);
}
253 | |||
/* Wire registration/unregistration to module load and unload. */
module_init(tcp_westwood_register);
module_exit(tcp_westwood_unregister);

/* Module metadata. */
MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Westwood+");