Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 180
-rw-r--r--  net/ipv4/Makefile | 12
-rw-r--r--  net/ipv4/af_inet.c | 13
-rw-r--r--  net/ipv4/devinet.c | 2
-rw-r--r--  net/ipv4/fib_semantics.c | 9
-rw-r--r--  net/ipv4/fib_trie.c | 954
-rw-r--r--  net/ipv4/icmp.c | 18
-rw-r--r--  net/ipv4/igmp.c | 96
-rw-r--r--  net/ipv4/inetpeer.c | 11
-rw-r--r--  net/ipv4/ip_fragment.c | 8
-rw-r--r--  net/ipv4/ip_gre.c | 21
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 19
-rw-r--r--  net/ipv4/ip_sockglue.c | 9
-rw-r--r--  net/ipv4/ipcomp.c | 2
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipip.c | 56
-rw-r--r--  net/ipv4/ipmr.c | 16
-rw-r--r--  net/ipv4/ipvs/Kconfig | 4
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 31
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 17
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c | 8
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_core.c | 50
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_ftp.c | 14
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_irc.c | 8
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 9
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_tftp.c | 8
-rw-r--r--  net/ipv4/netfilter/ip_nat_amanda.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_nat_ftp.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_nat_irc.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_icmp.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_tcp.c | 3
-rw-r--r--  net/ipv4/netfilter/ip_nat_proto_udp.c | 3
-rw-r--r--  net/ipv4/netfilter/ip_nat_standalone.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_nat_tftp.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 7
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 17
-rw-r--r--  net/ipv4/netfilter/ipt_TCPMSS.c | 7
-rw-r--r--  net/ipv4/route.c | 145
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 114
-rw-r--r--  net/ipv4/tcp.c | 87
-rw-r--r--  net/ipv4/tcp_bic.c | 331
-rw-r--r--  net/ipv4/tcp_cong.c | 237
-rw-r--r--  net/ipv4/tcp_diag.c | 34
-rw-r--r--  net/ipv4/tcp_highspeed.c | 181
-rw-r--r--  net/ipv4/tcp_htcp.c | 289
-rw-r--r--  net/ipv4/tcp_hybla.c | 187
-rw-r--r--  net/ipv4/tcp_input.c | 824
-rw-r--r--  net/ipv4/tcp_ipv4.c | 28
-rw-r--r--  net/ipv4/tcp_minisocks.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 584
-rw-r--r--  net/ipv4/tcp_scalable.c | 68
-rw-r--r--  net/ipv4/tcp_timer.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.c | 411
-rw-r--r--  net/ipv4/tcp_westwood.c | 259
-rw-r--r--  net/ipv4/udp.c | 34
-rw-r--r--  net/ipv4/utils.c | 59
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 3
60 files changed, 3751 insertions, 1795 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c3..0b3d9f1d80 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,35 +1,8 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algoritm.
18 This improves lookup performance
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
30config IP_MULTICAST 4config IP_MULTICAST
31 bool "IP: multicasting" 5 bool "IP: multicasting"
32 depends on INET
33 help 6 help
34 This is code for addressing several networked computers at once, 7 This is code for addressing several networked computers at once,
35 enlarging your kernel by about 2 KB. You need multicasting if you 8 enlarging your kernel by about 2 KB. You need multicasting if you
@@ -43,7 +16,6 @@ config IP_MULTICAST
43 16
44config IP_ADVANCED_ROUTER 17config IP_ADVANCED_ROUTER
45 bool "IP: advanced router" 18 bool "IP: advanced router"
46 depends on INET
47 ---help--- 19 ---help---
48 If you intend to run your Linux box mostly as a router, i.e. as a 20 If you intend to run your Linux box mostly as a router, i.e. as a
49 computer that forwards and redistributes network packets, say Y; you 21 computer that forwards and redistributes network packets, say Y; you
@@ -79,6 +51,40 @@ config IP_ADVANCED_ROUTER
79 51
80 If unsure, say N here. 52 If unsure, say N here.
81 53
54choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER
57 default ASK_IP_FIB_HASH
58
59config ASK_IP_FIB_HASH
60 bool "FIB_HASH"
61 ---help---
62 Current FIB is very proven and good enough for most users.
63
64config IP_FIB_TRIE
65 bool "FIB_TRIE"
66 ---help---
67 Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large
69 number of routes.
70
71 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex.
74
75 LC-trie is described in:
76
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
79 An experimental study of compression methods for dynamic tries
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
82
83endchoice
84
85config IP_FIB_HASH
86 def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
87
82config IP_MULTIPLE_TABLES 88config IP_MULTIPLE_TABLES
83 bool "IP: policy routing" 89 bool "IP: policy routing"
84 depends on IP_ADVANCED_ROUTER 90 depends on IP_ADVANCED_ROUTER
@@ -118,7 +124,7 @@ config IP_ROUTE_MULTIPATH
118 124
119config IP_ROUTE_MULTIPATH_CACHED 125config IP_ROUTE_MULTIPATH_CACHED
120 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" 126 bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
121 depends on: IP_ROUTE_MULTIPATH 127 depends on IP_ROUTE_MULTIPATH
122 help 128 help
123 Normally, equal cost multipath routing is not supported by the 129 Normally, equal cost multipath routing is not supported by the
124 routing cache. If you say Y here, alternative routes are cached 130 routing cache. If you say Y here, alternative routes are cached
@@ -171,7 +177,6 @@ config IP_ROUTE_VERBOSE
171 177
172config IP_PNP 178config IP_PNP
173 bool "IP: kernel level autoconfiguration" 179 bool "IP: kernel level autoconfiguration"
174 depends on INET
175 help 180 help
176 This enables automatic configuration of IP addresses of devices and 181 This enables automatic configuration of IP addresses of devices and
177 of the routing table during kernel boot, based on either information 182 of the routing table during kernel boot, based on either information
@@ -230,8 +235,6 @@ config IP_PNP_RARP
230# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 235# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
231config NET_IPIP 236config NET_IPIP
232 tristate "IP: tunneling" 237 tristate "IP: tunneling"
233 depends on INET
234 select INET_TUNNEL
235 ---help--- 238 ---help---
236 Tunneling means encapsulating data of one protocol type within 239 Tunneling means encapsulating data of one protocol type within
237 another protocol and sending it over a channel that understands the 240 another protocol and sending it over a channel that understands the
@@ -248,8 +251,6 @@ config NET_IPIP
248 251
249config NET_IPGRE 252config NET_IPGRE
250 tristate "IP: GRE tunnels over IP" 253 tristate "IP: GRE tunnels over IP"
251 depends on INET
252 select XFRM
253 help 254 help
254 Tunneling means encapsulating data of one protocol type within 255 Tunneling means encapsulating data of one protocol type within
255 another protocol and sending it over a channel that understands the 256 another protocol and sending it over a channel that understands the
@@ -307,7 +308,7 @@ config IP_PIMSM_V2
307 308
308config ARPD 309config ARPD
309 bool "IP: ARP daemon support (EXPERIMENTAL)" 310 bool "IP: ARP daemon support (EXPERIMENTAL)"
310 depends on INET && EXPERIMENTAL 311 depends on EXPERIMENTAL
311 ---help--- 312 ---help---
312 Normally, the kernel maintains an internal cache which maps IP 313 Normally, the kernel maintains an internal cache which maps IP
313 addresses to hardware addresses on the local network, so that 314 addresses to hardware addresses on the local network, so that
@@ -332,7 +333,6 @@ config ARPD
332 333
333config SYN_COOKIES 334config SYN_COOKIES
334 bool "IP: TCP syncookie support (disabled per default)" 335 bool "IP: TCP syncookie support (disabled per default)"
335 depends on INET
336 ---help--- 336 ---help---
337 Normal TCP/IP networking is open to an attack known as "SYN 337 Normal TCP/IP networking is open to an attack known as "SYN
338 flooding". This denial-of-service attack prevents legitimate remote 338 flooding". This denial-of-service attack prevents legitimate remote
@@ -369,7 +369,6 @@ config SYN_COOKIES
369 369
370config INET_AH 370config INET_AH
371 tristate "IP: AH transformation" 371 tristate "IP: AH transformation"
372 depends on INET
373 select XFRM 372 select XFRM
374 select CRYPTO 373 select CRYPTO
375 select CRYPTO_HMAC 374 select CRYPTO_HMAC
@@ -382,7 +381,6 @@ config INET_AH
382 381
383config INET_ESP 382config INET_ESP
384 tristate "IP: ESP transformation" 383 tristate "IP: ESP transformation"
385 depends on INET
386 select XFRM 384 select XFRM
387 select CRYPTO 385 select CRYPTO
388 select CRYPTO_HMAC 386 select CRYPTO_HMAC
@@ -396,7 +394,6 @@ config INET_ESP
396 394
397config INET_IPCOMP 395config INET_IPCOMP
398 tristate "IP: IPComp transformation" 396 tristate "IP: IPComp transformation"
399 depends on INET
400 select XFRM 397 select XFRM
401 select INET_TUNNEL 398 select INET_TUNNEL
402 select CRYPTO 399 select CRYPTO
@@ -409,7 +406,6 @@ config INET_IPCOMP
409 406
410config INET_TUNNEL 407config INET_TUNNEL
411 tristate "IP: tunnel transformation" 408 tristate "IP: tunnel transformation"
412 depends on INET
413 select XFRM 409 select XFRM
414 ---help--- 410 ---help---
415 Support for generic IP tunnel transformation, which is required by 411 Support for generic IP tunnel transformation, which is required by
@@ -419,7 +415,6 @@ config INET_TUNNEL
419 415
420config IP_TCPDIAG 416config IP_TCPDIAG
421 tristate "IP: TCP socket monitoring interface" 417 tristate "IP: TCP socket monitoring interface"
422 depends on INET
423 default y 418 default y
424 ---help--- 419 ---help---
425 Support for TCP socket monitoring interface used by native Linux 420 Support for TCP socket monitoring interface used by native Linux
@@ -433,5 +428,108 @@ config IP_TCPDIAG
433config IP_TCPDIAG_IPV6 428config IP_TCPDIAG_IPV6
434 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 429 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
435 430
431config TCP_CONG_ADVANCED
432 bool "TCP: advanced congestion control"
433 ---help---
434 Support for selection of various TCP congestion control
435 modules.
436
437 Nearly all users can safely say no here, and a safe default
438 selection will be made (BIC-TCP with new Reno as a fallback).
439
440 If unsure, say N.
441
442# TCP Reno is builtin (required as fallback)
443menu "TCP congestion control"
444 depends on TCP_CONG_ADVANCED
445
446config TCP_CONG_BIC
447 tristate "Binary Increase Congestion (BIC) control"
448 default y
449 ---help---
450 BIC-TCP is a sender-side only change that ensures a linear RTT
451 fairness under large windows while offering both scalability and
452 bounded TCP-friendliness. The protocol combines two schemes
453 called additive increase and binary search increase. When the
454 congestion window is large, additive increase with a large
455 increment ensures linear RTT fairness as well as good
456 scalability. Under small congestion windows, binary search
457 increase provides TCP friendliness.
458 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
459
460config TCP_CONG_WESTWOOD
461 tristate "TCP Westwood+"
462 default m
463 ---help---
464 TCP Westwood+ is a sender-side only modification of the TCP Reno
465 protocol stack that optimizes the performance of TCP congestion
466 control. It is based on end-to-end bandwidth estimation to set
467 congestion window and slow start threshold after a congestion
468 episode. Using this estimation, TCP Westwood+ adaptively sets a
469 slow start threshold and a congestion window which takes into
470 account the bandwidth used at the time congestion is experienced.
471 TCP Westwood+ significantly increases fairness wrt TCP Reno in
472 wired networks and throughput over wireless links.
473
474config TCP_CONG_HTCP
475 tristate "H-TCP"
476 default m
477 ---help---
478 H-TCP is a sender-side only modification of the TCP Reno
479 protocol stack that optimizes the performance of TCP
480 congestion control for high speed network links. It uses a
481 modeswitch to change the alpha and beta parameters of TCP Reno
482 based on network conditions and in a way so as to be fair with
483 other Reno and H-TCP flows.
484
485config TCP_CONG_HSTCP
486 tristate "High Speed TCP"
487 depends on EXPERIMENTAL
488 default n
489 ---help---
490 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
491 A modification to TCP's congestion control mechanism for use
492 with large congestion windows. A table indicates how much to
493 increase the congestion window by when an ACK is received.
494 For more detail see http://www.icir.org/floyd/hstcp.html
495
496config TCP_CONG_HYBLA
497 tristate "TCP-Hybla congestion control algorithm"
498 depends on EXPERIMENTAL
499 default n
500 ---help---
501 TCP-Hybla is a sender-side only change that eliminates penalization of
502 long-RTT, large-bandwidth connections, like when satellite legs are
503 involved, especially when sharing a common bottleneck with normal
504 terrestrial connections.
505
506config TCP_CONG_VEGAS
507 tristate "TCP Vegas"
508 depends on EXPERIMENTAL
509 default n
510 ---help---
511 TCP Vegas is a sender-side only change to TCP that anticipates
512 the onset of congestion by estimating the bandwidth. TCP Vegas
513 adjusts the sending rate by modifying the congestion
514 window. TCP Vegas should provide less packet loss, but it is
515 not as aggressive as TCP Reno.
516
517config TCP_CONG_SCALABLE
518 tristate "Scalable TCP"
519 depends on EXPERIMENTAL
520 default n
521 ---help---
522 Scalable TCP is a sender-side only change to TCP which uses a
523 MIMD congestion control algorithm which has some nice scaling
524 properties, though it is known to have fairness issues.
525 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
526
527endmenu
528
529config TCP_CONG_BIC
530 tristate
531 depends on !TCP_CONG_ADVANCED
532 default y
533
436source "net/ipv4/ipvs/Kconfig" 534source "net/ipv4/ipvs/Kconfig"
437 535
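The congestion control options added above are backed by the pluggable framework this series introduces in tcp_cong.c. As a rough orientation only, the sketch below shows how such a module typically registers itself with that framework; the hook prototypes varied across 2.6.x kernels (early versions passed struct tcp_sock * rather than struct sock *), so the field names and signatures here are assumptions for illustration, not code from this patch.

/*
 * Hypothetical congestion control module sketch. The ops fields and
 * hook prototypes are assumptions modelled on the 2.6-era interface;
 * only tcp_register_congestion_control() and
 * tcp_unregister_congestion_control() are taken as given.
 */
#include <linux/module.h>
#include <net/tcp.h>

static u32 demo_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Halve the congestion window on loss, as Reno does. */
	return max(tp->snd_cwnd >> 1U, 2U);
}

static void demo_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Crude additive increase: one segment per ACK while below clamp. */
	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}

static struct tcp_congestion_ops demo_cong __read_mostly = {
	.name		= "demo",
	.owner		= THIS_MODULE,
	.ssthresh	= demo_ssthresh,
	.cong_avoid	= demo_cong_avoid,
};

static int __init demo_cong_register(void)
{
	return tcp_register_congestion_control(&demo_cong);
}

static void __exit demo_cong_unregister(void)
{
	tcp_unregister_congestion_control(&demo_cong);
}

module_init(demo_cong_register);
module_exit(demo_cong_unregister);
MODULE_LICENSE("GPL");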
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1a..55dc6cca1e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,10 +2,11 @@
2# Makefile for the Linux TCP/IP (INET) layer. 2# Makefile for the Linux TCP/IP (INET) layer.
3# 3#
4 4
5obj-y := utils.o route.o inetpeer.o protocol.o \ 5obj-y := route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o 11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 12
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
30obj-$(CONFIG_IP_VS) += ipvs/ 31obj-$(CONFIG_IP_VS) += ipvs/
31obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
32obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
37obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
38obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
39obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
40obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
33 41
34obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 42obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
35 xfrm4_output.o 43 xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e797792..163ae4068b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
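For orientation: the hunk above registers IPv4's packet_type with the device layer from inet_init() via dev_add_pack(). Below is a minimal, hypothetical sketch of the same registration pattern; the 0x88B5 ethertype and the trivial handler are illustrative assumptions, and the handler prototype is shown in its three-argument 2.6.12-era form (later kernels add an orig_dev parameter).

/*
 * Hypothetical protocol handler registration, mirroring the
 * dev_add_pack() call added to inet_init() above. Ethertype and
 * handler body are assumptions for illustration only.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
		    struct packet_type *pt)
{
	/* A real handler would parse the frame; just free it here. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type demo_packet_type = {
	.type	= __constant_htons(0x88B5),	/* local experimental ethertype */
	.func	= demo_rcv,
};

static int __init demo_pack_init(void)
{
	dev_add_pack(&demo_packet_type);
	return 0;
}

static void __exit demo_pack_exit(void)
{
	dev_remove_pack(&demo_packet_type);
}

module_init(demo_pack_init);
module_exit(demo_pack_exit);
MODULE_LICENSE("GPL");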
@@ -1146,7 +1157,7 @@ static int __init ipv4_proc_init(void)
1146#ifdef CONFIG_IP_FIB_TRIE 1157#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init()) 1158 if (fib_stat_proc_init())
1148 goto out_fib_stat; 1159 goto out_fib_stat;
1149 #endif 1160#endif
1150 if (ip_misc_proc_init()) 1161 if (ip_misc_proc_init())
1151 goto out_misc; 1162 goto out_misc;
1152out: 1163out:
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 650dcb12d9..d8a10e3dd7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
1471 * by sysctl and we wouldn't want anyone to change it under our feet 1471 * by sysctl and we wouldn't want anyone to change it under our feet
1472 * (see SIOCSIFNAME). 1472 * (see SIOCSIFNAME).
1473 */ 1473 */
1474 dev_name = net_sysctl_strdup(dev_name); 1474 dev_name = kstrdup(dev_name, GFP_KERNEL);
1475 if (!dev_name) 1475 if (!dev_name)
1476 goto free; 1476 goto free;
1477 1477
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c886b28ba9..e278cb9d00 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash, 593 struct hlist_head *new_laddrhash,
594 unsigned int new_size) 594 unsigned int new_size)
595{ 595{
596 struct hlist_head *old_info_hash, *old_laddrhash;
596 unsigned int old_size = fib_hash_size; 597 unsigned int old_size = fib_hash_size;
597 unsigned int i; 598 unsigned int i, bytes;
598 599
599 write_lock(&fib_info_lock); 600 write_lock(&fib_info_lock);
601 old_info_hash = fib_info_hash;
602 old_laddrhash = fib_info_laddrhash;
600 fib_hash_size = new_size; 603 fib_hash_size = new_size;
601 604
602 for (i = 0; i < old_size; i++) { 605 for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
636 fib_info_laddrhash = new_laddrhash; 639 fib_info_laddrhash = new_laddrhash;
637 640
638 write_unlock(&fib_info_lock); 641 write_unlock(&fib_info_lock);
642
643 bytes = old_size * sizeof(struct hlist_head *);
644 fib_hash_free(old_info_hash, bytes);
645 fib_hash_free(old_laddrhash, bytes);
639} 646}
640 647
641struct fib_info * 648struct fib_info *
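The fib_semantics.c hunk above fixes a leak in fib_hash_move(): the old bucket arrays are now saved before the move and freed once the new tables have been published. A stand-alone, user-space sketch of the same grow-rehash-free pattern (simplified, without the kernel's fib_info_lock and fib_hash_free() helpers):

/*
 * Stand-alone illustration of grow-rehash-free: build the new bucket
 * array, move every entry over, publish the new table, then release
 * the old array -- the step the patch adds.
 */
#include <stdlib.h>

struct entry {
	unsigned int	key;
	struct entry	*next;
};

struct table {
	struct entry	**buckets;
	unsigned int	size;		/* power of two */
};

static int table_grow(struct table *t, unsigned int new_size)
{
	struct entry **new_buckets, **old_buckets;
	unsigned int i;

	new_buckets = calloc(new_size, sizeof(*new_buckets));
	if (!new_buckets)
		return -1;

	old_buckets = t->buckets;

	/* Move every entry to its slot in the new array. */
	for (i = 0; i < t->size; i++) {
		struct entry *e = old_buckets ? old_buckets[i] : NULL;

		while (e) {
			struct entry *next = e->next;
			unsigned int slot = e->key & (new_size - 1);

			e->next = new_buckets[slot];
			new_buckets[slot] = e;
			e = next;
		}
	}

	/* Publish the new table, then free the old bucket array. */
	t->buckets = new_buckets;
	t->size = new_size;
	free(old_buckets);
	return 0;
}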
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6..45efd5f474 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.323" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -90,14 +90,14 @@ typedef unsigned int t_key;
90#define T_LEAF 1 90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL 91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \ 92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) 93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \ 94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \ 95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK))) 96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \ 97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type)) 98 ((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \ 99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK) 100 ((_node)->_parent & NODE_TYPE_MASK)
101 101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF)) 102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF) 103#define IS_LEAF(n) (n->_parent & T_LEAF)
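The macros reindented above tag the low bit of each node's parent pointer with the node type (T_TNODE or T_LEAF), which is safe because allocated nodes are at least word aligned. A stand-alone sketch of the same pointer-tagging trick, with hypothetical names:

/*
 * Stand-alone sketch of the tagging scheme behind NODE_PARENT,
 * NODE_TYPE and NODE_SET_PARENT: the low bit of the parent pointer
 * carries the node type.
 */
#include <stdint.h>

#define T_TNODE		0UL
#define T_LEAF		1UL
#define TYPE_MASK	0x1UL

struct demo_node {
	uintptr_t	parent;		/* tagged parent pointer */
};

static inline struct demo_node *node_parent(const struct demo_node *n)
{
	return (struct demo_node *)(n->parent & ~TYPE_MASK);
}

static inline unsigned long node_type(const struct demo_node *n)
{
	return n->parent & TYPE_MASK;
}

static inline void node_set_parent(struct demo_node *n, struct demo_node *p)
{
	/* Keep the existing type bit, replace only the pointer bits. */
	n->parent = (uintptr_t)p | (n->parent & TYPE_MASK);
}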
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -146,7 +147,7 @@ struct trie_stat {
146 unsigned int leaves; 147 unsigned int leaves;
147 unsigned int nullpointers; 148 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS]; 149 unsigned int nodesizes[MAX_CHILDS];
149}; 150};
150 151
151struct trie { 152struct trie {
152 struct node *trie; 153 struct node *trie;
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -184,9 +185,9 @@ static void trie_bug(char *err)
184 BUG(); 185 BUG();
185} 186}
186 187
187static inline struct node *tnode_get_child(struct tnode *tn, int i) 188static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{ 189{
189 if (i >= 1<<tn->bits) 190 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child"); 191 trie_bug("tnode_get_child");
191 192
192 return tn->child[i]; 193 return tn->child[i];
@@ -201,7 +202,7 @@ static inline int tnode_child_length(struct tnode *tn)
201 _________________________________________________________________ 202 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | 203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ---------------------------------------------------------------- 204 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205 206
206 _________________________________________________________________ 207 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | 208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
@@ -225,25 +226,25 @@ static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225 226
226static inline int tkey_equals(t_key a, t_key b) 227static inline int tkey_equals(t_key a, t_key b)
227{ 228{
228 return a == b; 229 return a == b;
229} 230}
230 231
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 232static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{ 233{
233 if (bits == 0 || offset >= KEYLENGTH) 234 if (bits == 0 || offset >= KEYLENGTH)
234 return 1; 235 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 236 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237} 238}
238 239
239static inline int tkey_mismatch(t_key a, int offset, t_key b) 240static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{ 241{
241 t_key diff = a ^ b; 242 t_key diff = a ^ b;
242 int i = offset; 243 int i = offset;
243 244
244 if(!diff) 245 if (!diff)
245 return 0; 246 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0) 247 while ((diff << i) >> (KEYLENGTH-1) == 0)
247 i++; 248 i++;
248 return i; 249 return i;
249} 250}
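tkey_sub_equals() above tests whether two keys agree on a sub-range of bits by shifting their XOR: ((a ^ b) << offset) >> (KEYLENGTH - bits) keeps exactly the bits of interest, counted from the most significant bit. A stand-alone example with concrete addresses (hypothetical helper name, user-space C):

#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32

/* Same shift trick as tkey_sub_equals(): compare `bits` bits of a and b
 * starting at `offset` from the most significant bit. */
static int sub_equals(uint32_t a, int offset, int bits, uint32_t b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	if (bits > KEYLENGTH)
		bits = KEYLENGTH;
	return (((a ^ b) << offset) >> (KEYLENGTH - bits)) == 0;
}

int main(void)
{
	uint32_t a = 0x0a010203;	/* 10.1.2.3 */
	uint32_t b = 0x0a010909;	/* 10.1.9.9 */

	/* The two addresses share a 20-bit prefix but not a 24-bit one. */
	printf("%d %d\n", sub_equals(a, 0, 20, b), sub_equals(a, 0, 24, b));
	return 0;
}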
@@ -313,6 +314,7 @@ static void fn_free_alias(struct fib_alias *fa)
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child. 315 n's child array, and will of course be different for each child.
315 316
317
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown 318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point. 319 at this point.
318 320
@@ -320,7 +322,7 @@ static void fn_free_alias(struct fib_alias *fa)
320 322
321static void check_tnode(struct tnode *tn) 323static void check_tnode(struct tnode *tn)
322{ 324{
323 if(tn && tn->pos+tn->bits > 32) { 325 if (tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); 326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 } 327 }
326} 328}
@@ -331,7 +333,7 @@ static int inflate_threshold = 50;
331static struct leaf *leaf_new(void) 333static struct leaf *leaf_new(void)
332{ 334{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) { 336 if (l) {
335 NODE_INIT_PARENT(l, T_LEAF); 337 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list); 338 INIT_HLIST_HEAD(&l->list);
337 } 339 }
@@ -341,8 +343,10 @@ static struct leaf *leaf_new(void)
341static struct leaf_info *leaf_info_new(int plen) 343static struct leaf_info *leaf_info_new(int plen)
342{ 344{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen; 346 if (li) {
345 INIT_LIST_HEAD(&li->falh); 347 li->plen = plen;
348 INIT_LIST_HEAD(&li->falh);
349 }
346 return li; 350 return li;
347} 351}
348 352
@@ -356,13 +360,34 @@ static inline void free_leaf_info(struct leaf_info *li)
356 kfree(li); 360 kfree(li);
357} 361}
358 362
363static struct tnode *tnode_alloc(unsigned int size)
364{
365 if (size <= PAGE_SIZE) {
366 return kmalloc(size, GFP_KERNEL);
367 } else {
368 return (struct tnode *)
369 __get_free_pages(GFP_KERNEL, get_order(size));
370 }
371}
372
373static void __tnode_free(struct tnode *tn)
374{
375 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *);
377
378 if (size <= PAGE_SIZE)
379 kfree(tn);
380 else
381 free_pages((unsigned long)tn, get_order(size));
382}
383
359static struct tnode* tnode_new(t_key key, int pos, int bits) 384static struct tnode* tnode_new(t_key key, int pos, int bits)
360{ 385{
361 int nchildren = 1<<bits; 386 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 388 struct tnode *tn = tnode_alloc(sz);
364 389
365 if(tn) { 390 if (tn) {
366 memset(tn, 0, sz); 391 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE); 392 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos; 393 tn->pos = pos;
@@ -371,7 +396,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
371 tn->full_children = 0; 396 tn->full_children = 0;
372 tn->empty_children = 1<<bits; 397 tn->empty_children = 1<<bits;
373 } 398 }
374 if(trie_debug > 0) 399
400 if (trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), 401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits)); 402 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn; 403 return tn;
@@ -379,17 +405,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
379 405
380static void tnode_free(struct tnode *tn) 406static void tnode_free(struct tnode *tn)
381{ 407{
382 if(!tn) { 408 if (!tn) {
383 trie_bug("tnode_free\n"); 409 trie_bug("tnode_free\n");
384 } 410 }
385 if(IS_LEAF(tn)) { 411 if (IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn); 412 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 ) 413 if (trie_debug > 0 )
388 printk("FL %p \n", tn); 414 printk("FL %p \n", tn);
389 } 415 }
390 else if(IS_TNODE(tn)) { 416 else if (IS_TNODE(tn)) {
391 kfree(tn); 417 __tnode_free(tn);
392 if(trie_debug > 0 ) 418 if (trie_debug > 0 )
393 printk("FT %p \n", tn); 419 printk("FT %p \n", tn);
394 } 420 }
395 else { 421 else {
@@ -404,66 +430,67 @@ static void tnode_free(struct tnode *tn)
404 430
405static inline int tnode_full(struct tnode *tn, struct node *n) 431static inline int tnode_full(struct tnode *tn, struct node *n)
406{ 432{
407 if(n == NULL || IS_LEAF(n)) 433 if (n == NULL || IS_LEAF(n))
408 return 0; 434 return 0;
409 435
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 436 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411} 437}
412 438
413static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 439static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414{ 440{
415 tnode_put_child_reorg(tn, i, n, -1); 441 tnode_put_child_reorg(tn, i, n, -1);
416} 442}
417 443
418 /* 444 /*
419 * Add a child at position i overwriting the old value. 445 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children. 446 * Update the value of full_children and empty_children.
421 */ 447 */
422 448
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 449static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{ 450{
425 struct node *chi; 451 struct node *chi;
426 int isfull; 452 int isfull;
427 453
428 if(i >= 1<<tn->bits) { 454 if (i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i); 455 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits"); 456 trie_bug("tnode_put_child_reorg bits");
431 } 457 }
432 write_lock_bh(&fib_lock); 458 write_lock_bh(&fib_lock);
433 chi = tn->child[i]; 459 chi = tn->child[i];
434 460
435 /* update emptyChildren */ 461 /* update emptyChildren */
436 if (n == NULL && chi != NULL) 462 if (n == NULL && chi != NULL)
437 tn->empty_children++; 463 tn->empty_children++;
438 else if (n != NULL && chi == NULL) 464 else if (n != NULL && chi == NULL)
439 tn->empty_children--; 465 tn->empty_children--;
440 466
441 /* update fullChildren */ 467 /* update fullChildren */
442 if (wasfull == -1) 468 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi); 469 wasfull = tnode_full(tn, chi);
444 470
445 isfull = tnode_full(tn, n); 471 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull) 472 if (wasfull && !isfull)
447 tn->full_children--; 473 tn->full_children--;
448 474
449 else if (!wasfull && isfull) 475 else if (!wasfull && isfull)
450 tn->full_children++; 476 tn->full_children++;
451 if(n) 477 if (n)
452 NODE_SET_PARENT(n, tn); 478 NODE_SET_PARENT(n, tn);
453 479
454 tn->child[i] = n; 480 tn->child[i] = n;
455 write_unlock_bh(&fib_lock); 481 write_unlock_bh(&fib_lock);
456} 482}
457 483
458static struct node *resize(struct trie *t, struct tnode *tn) 484static struct node *resize(struct trie *t, struct tnode *tn)
459{ 485{
460 int i; 486 int i;
487 int err = 0;
461 488
462 if (!tn) 489 if (!tn)
463 return NULL; 490 return NULL;
464 491
465 if(trie_debug) 492 if (trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold); 494 tn, inflate_threshold, halve_threshold);
468 495
469 /* No children */ 496 /* No children */
@@ -480,7 +507,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
480 507
481 /* compress one level */ 508 /* compress one level */
482 struct node *n = tn->child[i]; 509 struct node *n = tn->child[i];
483 if(n) 510 if (n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n)); 511 NODE_INIT_PARENT(n, NODE_TYPE(n));
485 512
486 write_unlock_bh(&fib_lock); 513 write_unlock_bh(&fib_lock);
@@ -489,77 +516,85 @@ static struct node *resize(struct trie *t, struct tnode *tn)
489 } 516 }
490 write_unlock_bh(&fib_lock); 517 write_unlock_bh(&fib_lock);
491 } 518 }
492 /* 519 /*
493 * Double as long as the resulting node has a number of 520 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold. 521 * nonempty nodes that are above the threshold.
495 */ 522 */
496 523
497 /* 524 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 525 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia 526 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6: 527 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all 528 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'." 529 * children in the *doubled* node is at least 'high'."
503 * 530 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It 531 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with 532 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the 533 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying 534 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we 535 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50. 536 * multiply the left-hand side by 50.
510 * 537 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn) 538 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children 539 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full" 540 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0. 541 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so 542 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here. 543 * we just count them one extra time here.
517 * 544 *
518 * A clearer way to write this would be: 545 * A clearer way to write this would be:
519 * 546 *
520 * to_be_doubled = tn->full_children; 547 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 548 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children; 549 * tn->full_children;
523 * 550 *
524 * new_child_length = tnode_child_length(tn) * 2; 551 * new_child_length = tnode_child_length(tn) * 2;
525 * 552 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 553 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length; 554 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold) 555 * if (new_fill_factor >= inflate_threshold)
529 * 556 *
530 * ...and so on, tho it would mess up the while() loop. 557 * ...and so on, tho it would mess up the while () loop.
531 * 558 *
532 * anyway, 559 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= 560 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold 561 * inflate_threshold
535 * 562 *
536 * avoid a division: 563 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >= 564 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length 565 * inflate_threshold * new_child_length
539 * 566 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten: 567 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children + 568 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length 569 * tn->full_children ) >= inflate_threshold * new_child_length
543 * 570 *
544 * expand new_child_length: 571 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children + 572 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >= 573 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2 574 * inflate_threshold * tnode_child_length(tn) * 2
548 * 575 *
549 * shorten again: 576 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) - 577 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold * 578 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn) 579 * tnode_child_length(tn)
553 * 580 *
554 */ 581 */
555 582
556 check_tnode(tn); 583 check_tnode(tn);
557 584
585 err = 0;
558 while ((tn->full_children > 0 && 586 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) { 588 inflate_threshold * tnode_child_length(tn))) {
561 589
562 tn = inflate(t, tn); 590 tn = inflate(t, tn, &err);
591
592 if (err) {
593#ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++;
595#endif
596 break;
597 }
563 } 598 }
564 599
565 check_tnode(tn); 600 check_tnode(tn);
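The long comment in this hunk derives the division-free form of the inflate test. Restated as a stand-alone helper (hypothetical name), with a worked example: for a node with 16 child slots, 6 full and 4 empty children, the left-hand side is 50 * (6 + 16 - 4) = 900 and the right-hand side at the default inflate_threshold of 50 is 50 * 16 = 800, so the node gets doubled; the equivalent fill factor of the doubled node would be 100 * (6 + 2*6) / 32 = 56.25%.

/*
 * Stand-alone restatement of the division-free inflate test derived
 * in the comment above (hypothetical helper, not part of the patch).
 */
static int should_inflate(int child_length, int full_children,
			  int empty_children, int inflate_threshold)
{
	return full_children > 0 &&
	       50 * (full_children + child_length - empty_children) >=
	       inflate_threshold * child_length;
}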
@@ -568,23 +603,34 @@ static struct node *resize(struct trie *t, struct tnode *tn)
568 * Halve as long as the number of empty children in this 603 * Halve as long as the number of empty children in this
569 * node is above threshold. 604 * node is above threshold.
570 */ 605 */
606
607 err = 0;
571 while (tn->bits > 1 && 608 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) < 609 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn)) 610 halve_threshold * tnode_child_length(tn)) {
611
612 tn = halve(t, tn, &err);
613
614 if (err) {
615#ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++;
617#endif
618 break;
619 }
620 }
621
574 622
575 tn = halve(t, tn);
576
577 /* Only one child remains */ 623 /* Only one child remains */
578 624
579 if (tn->empty_children == tnode_child_length(tn) - 1) 625 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) { 626 for (i = 0; i < tnode_child_length(tn); i++) {
581 627
582 write_lock_bh(&fib_lock); 628 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) { 629 if (tn->child[i] != NULL) {
584 /* compress one level */ 630 /* compress one level */
585 struct node *n = tn->child[i]; 631 struct node *n = tn->child[i];
586 632
587 if(n) 633 if (n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n)); 634 NODE_INIT_PARENT(n, NODE_TYPE(n));
589 635
590 write_unlock_bh(&fib_lock); 636 write_unlock_bh(&fib_lock);
@@ -597,33 +643,88 @@ static struct node *resize(struct trie *t, struct tnode *tn)
597 return (struct node *) tn; 643 return (struct node *) tn;
598} 644}
599 645
600static struct tnode *inflate(struct trie *t, struct tnode *tn) 646static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
601{ 647{
602 struct tnode *inode; 648 struct tnode *inode;
603 struct tnode *oldtnode = tn; 649 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn); 650 int olen = tnode_child_length(tn);
605 int i; 651 int i;
606 652
607 if(trie_debug) 653 if (trie_debug)
608 printk("In inflate\n"); 654 printk("In inflate\n");
609 655
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611 657
612 if (!tn) 658 if (!tn) {
613 trie_bug("tnode_new failed"); 659 *err = -ENOMEM;
660 return oldtnode;
661 }
662
663 /*
664 * Preallocate and store tnodes before the actual work so we
665 * don't get into an inconsistent state if memory allocation
666 * fails. In case of failure we return the oldnode and inflate
667 * of tnode is ignored.
668 */
669
670 for(i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672
673 if (inode &&
674 IS_TNODE(inode) &&
675 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) {
677 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1);
680
681 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1);
683
684 if (!left) {
685 *err = -ENOMEM;
686 break;
687 }
688
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
692 if (!right) {
693 *err = -ENOMEM;
694 break;
695 }
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
702 if (*err) {
703 int size = tnode_child_length(tn);
704 int j;
705
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
711
712 *err = -ENOMEM;
713 return oldtnode;
714 }
614 715
615 for(i = 0; i < olen; i++) { 716 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i); 717 struct node *node = tnode_get_child(oldtnode, i);
617 718
618 /* An empty child */ 719 /* An empty child */
619 if (node == NULL) 720 if (node == NULL)
620 continue; 721 continue;
621 722
622 /* A leaf or an internal node with skipped bits */ 723 /* A leaf or an internal node with skipped bits */
623 724
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 725 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) { 726 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 727 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
627 1) == 0) 728 1) == 0)
628 put_child(t, tn, 2*i, node); 729 put_child(t, tn, 2*i, node);
629 else 730 else
@@ -646,44 +747,39 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
646 struct tnode *left, *right; 747 struct tnode *left, *right;
647 int size, j; 748 int size, j;
648 749
649 /* We will replace this node 'inode' with two new 750 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the 751 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have 752 * original children. The two new nodes will have
652 * a position one bit further down the key and this 753 * a position one bit further down the key and this
653 * means that the "significant" part of their keys 754 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file) 755 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in 756 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are 757 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that 758 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position 759 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between 760 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the 761 * left and right. So... we synthesize that bit in the
661 * two new keys. 762 * two new keys.
662 * The mask 'm' below will be a single "one" bit at 763 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos) 764 * the position (inode->pos)
664 */ 765 */
665 766
666 t_key m = TKEY_GET_MASK(inode->pos, 1); 767 /* Use the old key, but set the new significant
667 768 * bit to zero.
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */ 769 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673 770
674 if(!left) 771 left = (struct tnode *) tnode_get_child(tn, 2*i);
675 trie_bug("tnode_new failed"); 772 put_child(t, tn, 2*i, NULL);
676 773
677 774 if (!left)
678 /* Use the old key, but set the new significant 775 BUG();
679 * bit to one. 776
680 */ 777 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
681 right = tnode_new(inode->key|m, inode->pos + 1, 778 put_child(t, tn, 2*i+1, NULL);
682 inode->bits - 1); 779
780 if (!right)
781 BUG();
683 782
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left); 783 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) { 784 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]); 785 put_child(t, left, j, inode->child[j]);
@@ -699,24 +795,64 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
699 return tn; 795 return tn;
700} 796}
701 797
702static struct tnode *halve(struct trie *t, struct tnode *tn) 798static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
703{ 799{
704 struct tnode *oldtnode = tn; 800 struct tnode *oldtnode = tn;
705 struct node *left, *right; 801 struct node *left, *right;
706 int i; 802 int i;
707 int olen = tnode_child_length(tn); 803 int olen = tnode_child_length(tn);
708 804
709 if(trie_debug) printk("In halve\n"); 805 if (trie_debug) printk("In halve\n");
710 806
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808
809 if (!tn) {
810 *err = -ENOMEM;
811 return oldtnode;
812 }
813
814 /*
815 * Preallocate and store tnodes before the actual work so we
816 * don't get into an inconsistent state if memory allocation
817 * fails. In case of failure we return the oldnode and halve
818 * of tnode is ignored.
819 */
820
821 for(i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1);
824
825 /* Two nonempty children */
826 if (left && right) {
827 struct tnode *newBinNode =
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829
830 if (!newBinNode) {
831 *err = -ENOMEM;
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837
838 if (*err) {
839 int size = tnode_child_length(tn);
840 int j;
841
842 for(j = 0; j < size; j++)
843 if (tn->child[j])
844 tnode_free((struct tnode *)tn->child[j]);
712 845
713 if(!tn) 846 tnode_free(tn);
714 trie_bug("tnode_new failed"); 847
848 *err = -ENOMEM;
849 return oldtnode;
850 }
715 851
716 for(i = 0; i < olen; i += 2) { 852 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i); 853 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1); 854 right = tnode_get_child(oldtnode, i+1);
719 855
720 /* At least one of the children is empty */ 856 /* At least one of the children is empty */
721 if (left == NULL) { 857 if (left == NULL) {
722 if (right == NULL) /* Both are empty */ 858 if (right == NULL) /* Both are empty */
@@ -724,14 +860,15 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
724 put_child(t, tn, i/2, right); 860 put_child(t, tn, i/2, right);
725 } else if (right == NULL) 861 } else if (right == NULL)
726 put_child(t, tn, i/2, left); 862 put_child(t, tn, i/2, left);
727 863
728 /* Two nonempty children */ 864 /* Two nonempty children */
729 else { 865 else {
730 struct tnode *newBinNode = 866 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1); 867 (struct tnode *) tnode_get_child(tn, i/2);
868 put_child(t, tn, i/2, NULL);
732 869
733 if(!newBinNode) 870 if (!newBinNode)
734 trie_bug("tnode_new failed"); 871 BUG();
735 872
736 put_child(t, newBinNode, 0, left); 873 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right); 874 put_child(t, newBinNode, 1, right);
@@ -744,7 +881,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
744 881
745static void *trie_init(struct trie *t) 882static void *trie_init(struct trie *t)
746{ 883{
747 if(t) { 884 if (t) {
748 t->size = 0; 885 t->size = 0;
749 t->trie = NULL; 886 t->trie = NULL;
750 t->revision = 0; 887 t->revision = 0;
@@ -761,8 +898,7 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
761 struct leaf_info *li; 898 struct leaf_info *li;
762 899
763 hlist_for_each_entry(li, node, head, hlist) { 900 hlist_for_each_entry(li, node, head, hlist) {
764 901 if (li->plen == plen)
765 if ( li->plen == plen )
766 return li; 902 return li;
767 } 903 }
768 return NULL; 904 return NULL;
@@ -770,35 +906,35 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
770 906
771static inline struct list_head * get_fa_head(struct leaf *l, int plen) 907static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{ 908{
773 struct list_head *fa_head=NULL; 909 struct list_head *fa_head = NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen); 910 struct leaf_info *li = find_leaf_info(&l->list, plen);
775 911
776 if(li) 912 if (li)
777 fa_head = &li->falh; 913 fa_head = &li->falh;
778 914
779 return fa_head; 915 return fa_head;
780} 916}
781 917
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 918static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{ 919{
784 struct leaf_info *li=NULL, *last=NULL; 920 struct leaf_info *li = NULL, *last = NULL;
785 struct hlist_node *node, *tmp; 921 struct hlist_node *node, *tmp;
786 922
787 write_lock_bh(&fib_lock); 923 write_lock_bh(&fib_lock);
788 924
789 if(hlist_empty(head)) 925 if (hlist_empty(head))
790 hlist_add_head(&new->hlist, head); 926 hlist_add_head(&new->hlist, head);
791 else { 927 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) { 928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793 929
794 if (new->plen > li->plen) 930 if (new->plen > li->plen)
795 break; 931 break;
796 932
797 last = li; 933 last = li;
798 } 934 }
799 if(last) 935 if (last)
800 hlist_add_after(&last->hlist, &new->hlist); 936 hlist_add_after(&last->hlist, &new->hlist);
801 else 937 else
802 hlist_add_before(&new->hlist, &li->hlist); 938 hlist_add_before(&new->hlist, &li->hlist);
803 } 939 }
804 write_unlock_bh(&fib_lock); 940 write_unlock_bh(&fib_lock);
@@ -812,14 +948,14 @@ fib_find_node(struct trie *t, u32 key)
812 struct node *n; 948 struct node *n;
813 949
814 pos = 0; 950 pos = 0;
815 n=t->trie; 951 n = t->trie;
816 952
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 953 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n; 954 tn = (struct tnode *) n;
819 955
820 check_tnode(tn); 956 check_tnode(tn);
821 957
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits; 959 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 } 961 }
@@ -842,23 +978,23 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
842 t_key cindex, key; 978 t_key cindex, key;
843 struct tnode *tp = NULL; 979 struct tnode *tp = NULL;
844 980
845 if(!tn) 981 if (!tn)
846 BUG(); 982 BUG();
847 983
848 key = tn->key; 984 key = tn->key;
849 i = 0; 985 i = 0;
850 986
851 while (tn != NULL && NODE_PARENT(tn) != NULL) { 987 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852 988
853 if( i > 10 ) { 989 if (i > 10) {
854 printk("Rebalance tn=%p \n", tn); 990 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); 991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856 992
857 printk("Rebalance tp=%p \n", tp); 993 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); 994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 } 995 }
860 996
861 if( i > 12 ) BUG(); 997 if (i > 12) BUG();
862 i++; 998 i++;
863 999
864 tp = NODE_PARENT(tn); 1000 tp = NODE_PARENT(tn);
@@ -866,63 +1002,63 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn); 1003 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869 1005
870 if(!NODE_PARENT(tn)) 1006 if (!NODE_PARENT(tn))
871 break; 1007 break;
872 1008
873 tn = NODE_PARENT(tn); 1009 tn = NODE_PARENT(tn);
874 } 1010 }
875 /* Handle last (top) tnode */ 1011 /* Handle last (top) tnode */
876 if (IS_TNODE(tn)) 1012 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn); 1013 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878 1014
879 return (struct node*) tn; 1015 return (struct node*) tn;
880} 1016}
881 1017
882static struct list_head * 1018static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen) 1019fib_insert_node(struct trie *t, int *err, u32 key, int plen)
884{ 1020{
885 int pos, newpos; 1021 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL; 1022 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n; 1023 struct node *n;
888 struct leaf *l; 1024 struct leaf *l;
889 int missbit; 1025 int missbit;
890 struct list_head *fa_head=NULL; 1026 struct list_head *fa_head = NULL;
891 struct leaf_info *li; 1027 struct leaf_info *li;
892 t_key cindex; 1028 t_key cindex;
893 1029
894 pos = 0; 1030 pos = 0;
895 n=t->trie; 1031 n = t->trie;
896 1032
897 /* If we point to NULL, stop. Either the tree is empty and we should 1033 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in if, or we have reached an empty child slot, 1034 * just put a new leaf in if, or we have reached an empty child slot,
899 * and we should just put our new leaf in that. 1035 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that 1036 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need 1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'! 1038 * not be the parent's 'pos'+'bits'!
903 * 1039 *
904 * If it does match the current key, get pos/bits from it, extract 1040 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree. 1041 * the index from our key, push the T_TNODE and walk the tree.
906 * 1042 *
907 * If it doesn't, we have to replace it with a new T_TNODE. 1043 * If it doesn't, we have to replace it with a new T_TNODE.
908 * 1044 *
909 * If we point to a T_LEAF, it might or might not have the same key 1045 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's 1046 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it. 1047 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE. 1048 * If it doesn't, we need to replace it with a T_TNODE.
913 */ 1049 */
914 1050
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n; 1052 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919 1053
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1054 check_tnode(tn);
1055
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn; 1057 tp = tn;
922 pos=tn->pos + tn->bits; 1058 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924 1060
925 if(n && NODE_PARENT(n) != tn) { 1061 if (n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG(); 1063 BUG();
928 } 1064 }
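The descent loop above indexes a tnode's child array with bits taken from the key at the node's pos/bits. Below is a stand-alone sketch of that extraction in the spirit of tkey_extract_bits(); extract_bits() is an invented name, not the kernel helper.

#include <stdio.h>
#include <stdint.h>

/* Take 'bits' bits of 'key' starting at bit offset 'pos', counted
 * from the most significant bit, as the child index. */
static uint32_t extract_bits(uint32_t key, int pos, int bits)
{
	if (bits == 0)
		return 0;
	return (key << pos) >> (32 - bits);
}

int main(void)
{
	uint32_t key = 0x0a800001;	/* 10.128.0.1 in host byte order */

	/* A tnode with pos=8, bits=4 indexes its 16 children on key
	 * bits 8..11, here 0x8 for 10.128.x.x. */
	printf("child index = %u\n", extract_bits(key, 8, 4));
	return 0;
}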
@@ -934,23 +1070,24 @@ fib_insert_node(struct trie *t, u32 key, int plen)
934 /* 1070 /*
935 * n ----> NULL, LEAF or TNODE 1071 * n ----> NULL, LEAF or TNODE
936 * 1072 *
937 * tp is n's (parent) ----> NULL or TNODE 1073 * tp is n's (parent) ----> NULL or TNODE
938 */ 1074 */
939 1075
940 if(tp && IS_LEAF(tp)) 1076 if (tp && IS_LEAF(tp))
941 BUG(); 1077 BUG();
942 1078
943 t->revision++;
944 1079
945 /* Case 1: n is a leaf. Compare prefixes */ 1080 /* Case 1: n is a leaf. Compare prefixes */
946 1081
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n; 1083 struct leaf *l = ( struct leaf *) n;
949 1084
950 li = leaf_info_new(plen); 1085 li = leaf_info_new(plen);
951 1086
952 if(! li) 1087 if (!li) {
953 BUG(); 1088 *err = -ENOMEM;
1089 goto err;
1090 }
954 1091
955 fa_head = &li->falh; 1092 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li); 1093 insert_leaf_info(&l->list, li);
@@ -959,14 +1096,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
959 t->size++; 1096 t->size++;
960 l = leaf_new(); 1097 l = leaf_new();
961 1098
962 if(! l) 1099 if (!l) {
963 BUG(); 1100 *err = -ENOMEM;
1101 goto err;
1102 }
964 1103
965 l->key = key; 1104 l->key = key;
966 li = leaf_info_new(plen); 1105 li = leaf_info_new(plen);
967 1106
968 if(! li) 1107 if (!li) {
969 BUG(); 1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
970 1112
971 fa_head = &li->falh; 1113 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li); 1114 insert_leaf_info(&l->list, li);
@@ -975,8 +1117,8 @@ fib_insert_node(struct trie *t, u32 key, int plen)
975 if (t->trie && n == NULL) { 1117 if (t->trie && n == NULL) {
976 1118
977 NODE_SET_PARENT(l, tp); 1119 NODE_SET_PARENT(l, tp);
978 1120
979 if (!tp) 1121 if (!tp)
980 BUG(); 1122 BUG();
981 1123
982 else { 1124 else {
@@ -986,8 +1128,8 @@ fib_insert_node(struct trie *t, u32 key, int plen)
986 } 1128 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else { 1130 else {
989 /* 1131 /*
990 * Add a new tnode here 1132 * Add a new tnode here
991 * first tnode need some special handling 1133 * first tnode need some special handling
992 */ 1134 */
993 1135
@@ -995,39 +1137,46 @@ fib_insert_node(struct trie *t, u32 key, int plen)
995 pos=tp->pos+tp->bits; 1137 pos=tp->pos+tp->bits;
996 else 1138 else
997 pos=0; 1139 pos=0;
998 if(n) { 1140 if (n) {
999 newpos = tkey_mismatch(key, pos, n->key); 1141 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1); 1142 tn = tnode_new(n->key, newpos, 1);
1001 } 1143 }
1002 else { 1144 else {
1003 newpos = 0; 1145 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */ 1146 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 } 1147 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008 1148
1149 if (!tn) {
1150 free_leaf_info(li);
1151 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM;
1153 goto err;
1154 }
1155
1009 NODE_SET_PARENT(tn, tp); 1156 NODE_SET_PARENT(tn, tp);
1010 1157
1011 missbit=tkey_extract_bits(key, newpos, 1); 1158 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l); 1159 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n); 1160 put_child(t, tn, 1-missbit, n);
1014 1161
1015 if(tp) { 1162 if (tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 } 1165 }
1019 else { 1166 else {
1020 t->trie = (struct node*) tn; /* First tnode */ 1167 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn; 1168 tp = tn;
1022 } 1169 }
1023 } 1170 }
1024 if(tp && tp->pos+tp->bits > 32) { 1171 if (tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen); 1173 tp, tp->pos, tp->bits, key, plen);
1027 } 1174 }
1028 /* Rebalance the trie */ 1175 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1176 t->trie = trie_rebalance(t, tp);
1030done:; 1177done:
1178 t->revision++;
1179err:;
1031 return fa_head; 1180 return fa_head;
1032} 1181}
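Case 3 above creates a one-bit tnode at the first bit where the new key diverges from the existing node's key, and that differing bit ("missbit") selects the child slot the new leaf occupies. The stand-alone sketch below shows that placement, assuming 32-bit host-order keys; key_mismatch() is an invented name loosely mirroring tkey_mismatch().

#include <stdio.h>
#include <stdint.h>

/* Position (offset from the MSB) of the first bit where two keys
 * differ, or -1 if they are identical. */
static int key_mismatch(uint32_t a, uint32_t b)
{
	uint32_t diff = a ^ b;
	int pos = 0;

	if (!diff)
		return -1;
	while (!(diff & 0x80000000u)) {
		diff <<= 1;
		pos++;
	}
	return pos;
}

int main(void)
{
	uint32_t existing = 0x0a010000;		/* 10.1.0.0 */
	uint32_t newkey   = 0x0a018000;		/* 10.1.128.0 */
	int newpos = key_mismatch(newkey, existing);
	int missbit = (newkey >> (31 - newpos)) & 1;

	printf("split at bit %d, new leaf in child[%d], old node in child[%d]\n",
	       newpos, missbit, 1 - missbit);
	return 0;
}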
1033 1182
@@ -1037,7 +1186,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1037{ 1186{
1038 struct trie *t = (struct trie *) tb->tb_data; 1187 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa; 1188 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL; 1189 struct list_head *fa_head = NULL;
1041 struct fib_info *fi; 1190 struct fib_info *fi;
1042 int plen = r->rtm_dst_len; 1191 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type; 1192 int type = r->rtm_type;
@@ -1050,17 +1199,17 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1050 return -EINVAL; 1199 return -EINVAL;
1051 1200
1052 key = 0; 1201 key = 0;
1053 if (rta->rta_dst) 1202 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4); 1203 memcpy(&key, rta->rta_dst, 4);
1055 1204
1056 key = ntohl(key); 1205 key = ntohl(key);
1057 1206
1058 if(trie_debug) 1207 if (trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); 1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060 1209
1061 mask = ntohl( inet_make_mask(plen) ); 1210 mask = ntohl( inet_make_mask(plen) );
1062 1211
1063 if(key & ~mask) 1212 if (key & ~mask)
1064 return -EINVAL; 1213 return -EINVAL;
1065 1214
1066 key = key & mask; 1215 key = key & mask;
@@ -1069,9 +1218,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1069 goto err; 1218 goto err;
1070 1219
1071 l = fib_find_node(t, key); 1220 l = fib_find_node(t, key);
1072 fa = NULL; 1221 fa = NULL;
1073 1222
1074 if(l) { 1223 if (l) {
1075 fa_head = get_fa_head(l, plen); 1224 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1225 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 } 1226 }
@@ -1150,14 +1299,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1150 new_fa->fa_scope = r->rtm_scope; 1299 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0; 1300 new_fa->fa_state = 0;
1152#if 0 1301#if 0
1153 new_fa->dst = NULL; 1302 new_fa->dst = NULL;
1154#endif 1303#endif
1155 /* 1304 /*
1156 * Insert new entry to the list. 1305 * Insert new entry to the list.
1157 */ 1306 */
1158 1307
1159 if(!fa_head) 1308 if (!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1309 fa_head = fib_insert_node(t, &err, key, plen);
1310 err = 0;
1311 if (err)
1312 goto out_free_new_fa;
1313 }
1161 1314
1162 write_lock_bh(&fib_lock); 1315 write_lock_bh(&fib_lock);
1163 1316
@@ -1170,40 +1323,43 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1324succeeded:
1172 return 0; 1325 return 0;
1326
1327out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1329out:
1174 fib_release_info(fi); 1330 fib_release_info(fi);
1175err:; 1331err:;
1176 return err; 1332 return err;
1177} 1333}
1178 1334
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, 1335static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err) 1336 struct fib_result *res)
1181{ 1337{
1182 int i; 1338 int err, i;
1183 t_key mask; 1339 t_key mask;
1184 struct leaf_info *li; 1340 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list; 1341 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node; 1342 struct hlist_node *node;
1187 1343
1188 hlist_for_each_entry(li, node, hhead, hlist) { 1344 hlist_for_each_entry(li, node, hhead, hlist) {
1189 1345
1190 i = li->plen; 1346 i = li->plen;
1191 mask = ntohl(inet_make_mask(i)); 1347 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask)) 1348 if (l->key != (key & mask))
1193 continue; 1349 continue;
1194 1350
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { 1351 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1196 *plen = i; 1352 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS 1353#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++; 1354 t->stats.semantic_match_passed++;
1199#endif 1355#endif
1200 return 1; 1356 return err;
1201 } 1357 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS 1358#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++; 1359 t->stats.semantic_match_miss++;
1204#endif 1360#endif
1205 } 1361 }
1206 return 0; 1362 return 1;
1207} 1363}
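With this change check_leaf() returns the fib_semantic_match() result directly, so a value <= 0 resolves the lookup and 1 tells the caller to keep searching. The per-leaf_info test itself is a plain prefix-mask comparison; the stand-alone sketch below illustrates it, with make_mask() standing in for ntohl(inet_make_mask(plen)).

#include <stdio.h>
#include <stdint.h>

/* Host-order netmask for a prefix length (sketch, not the kernel helper). */
static uint32_t make_mask(int plen)
{
	return plen ? ~0u << (32 - plen) : 0;
}

int main(void)
{
	uint32_t leaf_key = 0x0a010000;	/* leaf for 10.1.0.0 */
	uint32_t lookup   = 0x0a0101fe;	/* looking up 10.1.1.254 */
	int plen = 16;			/* a leaf_info with /16 */

	/* The leaf_info matches when the masked lookup key equals the leaf key. */
	if (leaf_key == (lookup & make_mask(plen)))
		printf("10.1.1.254 matches 10.1.0.0/%d\n", plen);
	return 0;
}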
1208 1364
1209static int 1365static int
@@ -1221,7 +1377,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1221 n = t->trie; 1377 n = t->trie;
1222 1378
1223 read_lock(&fib_lock); 1379 read_lock(&fib_lock);
1224 if(!n) 1380 if (!n)
1225 goto failed; 1381 goto failed;
1226 1382
1227#ifdef CONFIG_IP_FIB_TRIE_STATS 1383#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1230,19 +1386,19 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1230 1386
1231 /* Just a leaf? */ 1387 /* Just a leaf? */
1232 if (IS_LEAF(n)) { 1388 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) ) 1389 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1234 goto found; 1390 goto found;
1235 goto failed; 1391 goto failed;
1236 } 1392 }
1237 pn = (struct tnode *) n; 1393 pn = (struct tnode *) n;
1238 chopped_off = 0; 1394 chopped_off = 0;
1239 1395
1240 while (pn) { 1396 while (pn) {
1241 1397
1242 pos = pn->pos; 1398 pos = pn->pos;
1243 bits = pn->bits; 1399 bits = pn->bits;
1244 1400
1245 if(!chopped_off) 1401 if (!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); 1402 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247 1403
1248 n = tnode_get_child(pn, cindex); 1404 n = tnode_get_child(pn, cindex);
@@ -1262,33 +1418,33 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1262 int mp; 1418 int mp;
1263 1419
1264 /* 1420 /*
1265 * It's a tnode, and we can do some extra checks here if we 1421 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch. 1422 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index 1423 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits 1424 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a 1425 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end. 1426 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this 1427 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok, 1428 * tnode - everything up to p_pos is supposed to be ok,
 1273 * and the non-chopped bits of the index (see previous 1429 * and the non-chopped bits of the index (see previous
1274 * paragraph) are also guaranteed ok, but the rest is 1430 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown. 1431 * considered unknown.
1276 * 1432 *
1277 * The skipped bits are key[pos+bits..cn->pos]. 1433 * The skipped bits are key[pos+bits..cn->pos].
1278 */ 1434 */
1279 1435
1280 /* If current_prefix_length < pos+bits, we are already doing 1436 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from 1437 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some 1438 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid 1439 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped 1440 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the 1441 * bits. Remember, since we have already indexed into the
 1286 * parent's child array, we know that the bits we chopped off 1442 * parent's child array, we know that the bits we chopped off
1287 * *are* zero. 1443 * *are* zero.
1288 */ 1444 */
1289 1445
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291 1447
1292 if (current_prefix_length < pos+bits) { 1448 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length, 1449 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 || 1450 cn->pos - current_prefix_length) != 0 ||
@@ -1297,13 +1453,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1297 } 1453 }
1298 1454
1299 /* 1455 /*
1300 * If chopped_off=0, the index is fully validated and we 1456 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new, 1457 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if 1458 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will 1459 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down, 1460 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of 1461 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point. 1462 * verifying the existence of such a prefix at this point.
1307 */ 1463 */
1308 1464
1309 /* The only thing we can do at this point is to verify that 1465 /* The only thing we can do at this point is to verify that
@@ -1315,22 +1471,22 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1315 * new tnode's key. 1471 * new tnode's key.
1316 */ 1472 */
1317 1473
1318 /* Note: We aren't very concerned about the piece of the key 1474 /* Note: We aren't very concerned about the piece of the key
 1319 * that precedes pn->pos+pn->bits, since these have already been 1475 * that precedes pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are 1476 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to 1477 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state, 1478 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail 1479 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are 1480 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix. 1481 * to find a matching prefix.
1326 */ 1482 */
1327 1483
1328 node_prefix = MASK_PFX(cn->key, cn->pos); 1484 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos); 1485 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix; 1486 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0; 1487 mp = 0;
1332 1488
1333 /* In short: If skipped bits in this node do not match the search 1489 /* In short: If skipped bits in this node do not match the search
 1334 * key, enter the "prefix matching" state directly. 1490 * key, enter the "prefix matching" state directly.
1335 */ 1491 */
1336 if (pref_mismatch) { 1492 if (pref_mismatch) {
@@ -1339,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1339 pref_mismatch = pref_mismatch <<1; 1495 pref_mismatch = pref_mismatch <<1;
1340 } 1496 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342 1498
1343 if (key_prefix != 0) 1499 if (key_prefix != 0)
1344 goto backtrace; 1500 goto backtrace;
1345 1501
@@ -1350,9 +1506,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1350 pn = (struct tnode *)n; /* Descend */ 1506 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0; 1507 chopped_off = 0;
1352 continue; 1508 continue;
1353 } 1509 }
1354 if (IS_LEAF(n)) { 1510 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) 1511 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1356 goto found; 1512 goto found;
1357 } 1513 }
1358backtrace: 1514backtrace:
@@ -1366,18 +1522,18 @@ backtrace:
1366 /* Decrease current_... with bits chopped off */ 1522 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off; 1524 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369 1525
1370 /* 1526 /*
 1371 * Either we do the actual chop off accordingly, or, if we have 1527 * Either we do the actual chop off accordingly, or, if we have
 1372 * chopped off all bits in this tnode, walk up to our parent. 1528 * chopped off all bits in this tnode, walk up to our parent.
1373 */ 1529 */
1374 1530
1375 if(chopped_off <= pn->bits) 1531 if (chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1)); 1532 cindex &= ~(1 << (chopped_off-1));
1377 else { 1533 else {
1378 if( NODE_PARENT(pn) == NULL) 1534 if (NODE_PARENT(pn) == NULL)
1379 goto failed; 1535 goto failed;
1380 1536
1381 /* Get Child's index */ 1537 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn); 1539 pn = NODE_PARENT(pn);
@@ -1387,10 +1543,10 @@ backtrace:
1387 t->stats.backtrack++; 1543 t->stats.backtrack++;
1388#endif 1544#endif
1389 goto backtrace; 1545 goto backtrace;
1390 } 1546 }
1391 } 1547 }
1392failed: 1548failed:
1393 ret = 1; 1549 ret = 1;
1394found: 1550found:
1395 read_unlock(&fib_lock); 1551 read_unlock(&fib_lock);
1396 return ret; 1552 return ret;
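The backtrace path above shortens the effective prefix one bit at a time by clearing low-order bits of the child index before finally climbing to the parent. A small stand-alone illustration of that bit-clearing step:

#include <stdio.h>

int main(void)
{
	/* Clearing one more low-order bit per iteration
	 * (cindex &= ~(1 << (chopped_off - 1)) above) retries the same
	 * tnode with a shorter effective prefix. */
	unsigned int cindex = 0x5;	/* child picked on the way down */
	int chopped_off;

	for (chopped_off = 1; chopped_off <= 3; chopped_off++) {
		cindex &= ~(1u << (chopped_off - 1));
		printf("chopped_off=%d -> cindex=0x%x\n", chopped_off, cindex);
	}
	return 0;
}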
@@ -1403,11 +1559,11 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1403 struct node *n = t->trie; 1559 struct node *n = t->trie;
1404 struct leaf *l; 1560 struct leaf *l;
1405 1561
1406 if(trie_debug) 1562 if (trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n); 1563 printk("entering trie_leaf_remove(%p)\n", n);
1408 1564
 1409 /* Note that in the case of skipped bits, those bits are *not* checked! 1565 /* Note that in the case of skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the 1566 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key. 1567 * T_LEAF may or may not match our key.
1412 */ 1568 */
1413 1569
@@ -1416,19 +1572,19 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1416 check_tnode(tn); 1572 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418 1574
1419 if(n && NODE_PARENT(n) != tn) { 1575 if (n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG(); 1577 BUG();
1422 } 1578 }
1423 } 1579 }
1424 l = (struct leaf *) n; 1580 l = (struct leaf *) n;
1425 1581
1426 if(!n || !tkey_equals(l->key, key)) 1582 if (!n || !tkey_equals(l->key, key))
1427 return 0; 1583 return 0;
1428 1584
1429 /* 1585 /*
1430 * Key found. 1586 * Key found.
1431 * Remove the leaf and rebalance the tree 1587 * Remove the leaf and rebalance the tree
1432 */ 1588 */
1433 1589
1434 t->revision++; 1590 t->revision++;
@@ -1437,7 +1593,7 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1437 tp = NODE_PARENT(n); 1593 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n); 1594 tnode_free((struct tnode *) n);
1439 1595
1440 if(tp) { 1596 if (tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL); 1598 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp); 1599 t->trie = trie_rebalance(t, tp);
@@ -1460,23 +1616,23 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1460 struct list_head *fa_head; 1616 struct list_head *fa_head;
1461 struct leaf *l; 1617 struct leaf *l;
1462 1618
1463 if (plen > 32) 1619 if (plen > 32)
1464 return -EINVAL; 1620 return -EINVAL;
1465 1621
1466 key = 0; 1622 key = 0;
1467 if (rta->rta_dst) 1623 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4); 1624 memcpy(&key, rta->rta_dst, 4);
1469 1625
1470 key = ntohl(key); 1626 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) ); 1627 mask = ntohl( inet_make_mask(plen) );
1472 1628
1473 if(key & ~mask) 1629 if (key & ~mask)
1474 return -EINVAL; 1630 return -EINVAL;
1475 1631
1476 key = key & mask; 1632 key = key & mask;
1477 l = fib_find_node(t, key); 1633 l = fib_find_node(t, key);
1478 1634
1479 if(!l) 1635 if (!l)
1480 return -ESRCH; 1636 return -ESRCH;
1481 1637
1482 fa_head = get_fa_head(l, plen); 1638 fa_head = get_fa_head(l, plen);
@@ -1522,16 +1678,16 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1522 1678
1523 list_del(&fa->fa_list); 1679 list_del(&fa->fa_list);
1524 1680
1525 if(list_empty(fa_head)) { 1681 if (list_empty(fa_head)) {
1526 hlist_del(&li->hlist); 1682 hlist_del(&li->hlist);
1527 kill_li = 1; 1683 kill_li = 1;
1528 } 1684 }
1529 write_unlock_bh(&fib_lock); 1685 write_unlock_bh(&fib_lock);
1530 1686
1531 if(kill_li) 1687 if (kill_li)
1532 free_leaf_info(li); 1688 free_leaf_info(li);
1533 1689
1534 if(hlist_empty(&l->list)) 1690 if (hlist_empty(&l->list))
1535 trie_leaf_remove(t, key); 1691 trie_leaf_remove(t, key);
1536 1692
1537 if (fa->fa_state & FA_S_ACCESSED) 1693 if (fa->fa_state & FA_S_ACCESSED)
@@ -1550,12 +1706,12 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
1550 1706
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info; 1708 struct fib_info *fi = fa->fa_info;
1553 1709
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555 1711
1556 write_lock_bh(&fib_lock); 1712 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list); 1713 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock); 1714 write_unlock_bh(&fib_lock);
1559 1715
1560 fn_free_alias(fa); 1716 fn_free_alias(fa);
1561 found++; 1717 found++;
@@ -1572,14 +1728,14 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
1572 struct leaf_info *li = NULL; 1728 struct leaf_info *li = NULL;
1573 1729
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575 1731
1576 found += trie_flush_list(t, &li->falh); 1732 found += trie_flush_list(t, &li->falh);
1577 1733
1578 if (list_empty(&li->falh)) { 1734 if (list_empty(&li->falh)) {
1579 1735
1580 write_lock_bh(&fib_lock); 1736 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist); 1737 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock); 1738 write_unlock_bh(&fib_lock);
1583 1739
1584 free_leaf_info(li); 1740 free_leaf_info(li);
1585 } 1741 }
@@ -1593,8 +1749,8 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1593 struct tnode *p; 1749 struct tnode *p;
1594 int idx; 1750 int idx;
1595 1751
1596 if(c == NULL) { 1752 if (c == NULL) {
1597 if(t->trie == NULL) 1753 if (t->trie == NULL)
1598 return NULL; 1754 return NULL;
1599 1755
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
@@ -1602,33 +1758,34 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1602 1758
1603 p = (struct tnode*) t->trie; /* Start */ 1759 p = (struct tnode*) t->trie; /* Start */
1604 } 1760 }
1605 else 1761 else
1606 p = (struct tnode *) NODE_PARENT(c); 1762 p = (struct tnode *) NODE_PARENT(c);
1763
1607 while (p) { 1764 while (p) {
1608 int pos, last; 1765 int pos, last;
1609 1766
1610 /* Find the next child of the parent */ 1767 /* Find the next child of the parent */
1611 if(c) 1768 if (c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1769 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else 1770 else
1614 pos = 0; 1771 pos = 0;
1615 1772
1616 last = 1 << p->bits; 1773 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) { 1774 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) { 1775 if (p->child[idx]) {
1619 1776
 1620 /* Descend if tnode */ 1777 /* Descend if tnode */
1621 1778
1622 while (IS_TNODE(p->child[idx])) { 1779 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx]; 1780 p = (struct tnode*) p->child[idx];
1624 idx = 0; 1781 idx = 0;
1625 1782
1626 /* Rightmost non-NULL branch */ 1783 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) ) 1784 if (p && IS_TNODE(p))
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++; 1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1629 1786
1630 /* Done with this tnode? */ 1787 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL ) 1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up; 1789 goto up;
1633 } 1790 }
1634 return (struct leaf*) p->child[idx]; 1791 return (struct leaf*) p->child[idx];
@@ -1661,7 +1818,7 @@ static int fn_trie_flush(struct fib_table *tb)
1661 if (ll && hlist_empty(&ll->list)) 1818 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key); 1819 trie_leaf_remove(t, ll->key);
1663 1820
1664 if(trie_debug) 1821 if (trie_debug)
1665 printk("trie_flush found=%d\n", found); 1822 printk("trie_flush found=%d\n", found);
1666 return found; 1823 return found;
1667} 1824}
@@ -1684,32 +1841,32 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1684 order = -1; 1841 order = -1;
1685 1842
1686 read_lock(&fib_lock); 1843 read_lock(&fib_lock);
1687 1844
1688 l = fib_find_node(t, 0); 1845 l = fib_find_node(t, 0);
1689 if(!l) 1846 if (!l)
1690 goto out; 1847 goto out;
1691 1848
1692 fa_head = get_fa_head(l, 0); 1849 fa_head = get_fa_head(l, 0);
1693 if(!fa_head) 1850 if (!fa_head)
1694 goto out; 1851 goto out;
1695 1852
1696 if (list_empty(fa_head)) 1853 if (list_empty(fa_head))
1697 goto out; 1854 goto out;
1698 1855
1699 list_for_each_entry(fa, fa_head, fa_list) { 1856 list_for_each_entry(fa, fa_head, fa_list) {
1700 struct fib_info *next_fi = fa->fa_info; 1857 struct fib_info *next_fi = fa->fa_info;
1701 1858
1702 if (fa->fa_scope != res->scope || 1859 if (fa->fa_scope != res->scope ||
1703 fa->fa_type != RTN_UNICAST) 1860 fa->fa_type != RTN_UNICAST)
1704 continue; 1861 continue;
1705 1862
1706 if (next_fi->fib_priority > res->fi->fib_priority) 1863 if (next_fi->fib_priority > res->fi->fib_priority)
1707 break; 1864 break;
1708 if (!next_fi->fib_nh[0].nh_gw || 1865 if (!next_fi->fib_nh[0].nh_gw ||
1709 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 continue; 1867 continue;
1711 fa->fa_state |= FA_S_ACCESSED; 1868 fa->fa_state |= FA_S_ACCESSED;
1712 1869
1713 if (fi == NULL) { 1870 if (fi == NULL) {
1714 if (next_fi != res->fi) 1871 if (next_fi != res->fi)
1715 break; 1872 break;
@@ -1747,10 +1904,10 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1747 } 1904 }
1748 trie_last_dflt = last_idx; 1905 trie_last_dflt = last_idx;
1749 out:; 1906 out:;
1750 read_unlock(&fib_lock); 1907 read_unlock(&fib_lock);
1751} 1908}
1752 1909
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1910static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb) 1911 struct sk_buff *skb, struct netlink_callback *cb)
1755{ 1912{
1756 int i, s_i; 1913 int i, s_i;
@@ -1796,7 +1953,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1796 return skb->len; 1953 return skb->len;
1797} 1954}
1798 1955
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1956static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb) 1957 struct netlink_callback *cb)
1801{ 1958{
1802 int h, s_h; 1959 int h, s_h;
@@ -1813,11 +1970,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1813 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1970 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814 1971
1815 fa_head = get_fa_head(l, plen); 1972 fa_head = get_fa_head(l, plen);
1816 1973
1817 if(!fa_head) 1974 if (!fa_head)
1818 continue; 1975 continue;
1819 1976
1820 if(list_empty(fa_head)) 1977 if (list_empty(fa_head))
1821 continue; 1978 continue;
1822 1979
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
@@ -1893,10 +2050,10 @@ struct fib_table * __init fib_hash_init(int id)
1893 2050
1894 trie_init(t); 2051 trie_init(t);
1895 2052
1896 if (id == RT_TABLE_LOCAL) 2053 if (id == RT_TABLE_LOCAL)
1897 trie_local=t; 2054 trie_local = t;
1898 else if (id == RT_TABLE_MAIN) 2055 else if (id == RT_TABLE_MAIN)
1899 trie_main=t; 2056 trie_main = t;
1900 2057
1901 if (id == RT_TABLE_LOCAL) 2058 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); 2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -1917,7 +2074,7 @@ static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1917 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0"); 2074 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918} 2075}
1919 2076
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n, 2077static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits) 2078 int pend, int cindex, int bits)
1922{ 2079{
1923 putspace_seq(seq, indent); 2080 putspace_seq(seq, indent);
@@ -1935,12 +2092,12 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); 2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936 2093
1937 if (IS_LEAF(n)) 2094 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n", 2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); 2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else { 2097 else {
1941 int plen=((struct tnode *)n)->pos; 2098 int plen = ((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen); 2099 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n", 2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); 2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 } 2102 }
1946 if (IS_LEAF(n)) { 2103 if (IS_LEAF(n)) {
@@ -1948,14 +2105,14 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1948 struct fib_alias *fa; 2105 struct fib_alias *fa;
1949 int i; 2106 int i;
1950 for (i=32; i>=0; i--) 2107 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) { 2108 if (find_leaf_info(&l->list, i)) {
1952 2109
1953 struct list_head *fa_head = get_fa_head(l, i); 2110 struct list_head *fa_head = get_fa_head(l, i);
1954 2111
1955 if(!fa_head) 2112 if (!fa_head)
1956 continue; 2113 continue;
1957 2114
1958 if(list_empty(fa_head)) 2115 if (list_empty(fa_head))
1959 continue; 2116 continue;
1960 2117
1961 putspace_seq(seq, indent+2); 2118 putspace_seq(seq, indent+2);
@@ -1981,7 +2138,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1981 } 2138 }
1982 } 2139 }
1983 else if (IS_TNODE(n)) { 2140 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n; 2141 struct tnode *tn = (struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| "); 2142 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); 2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); 2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
@@ -1997,7 +2154,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1997 2154
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2155static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{ 2156{
2000 struct node *n=t->trie; 2157 struct node *n = t->trie;
2001 int cindex=0; 2158 int cindex=0;
2002 int indent=1; 2159 int indent=1;
2003 int pend=0; 2160 int pend=0;
@@ -2009,7 +2166,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2009 if (n) { 2166 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0); 2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) { 2168 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n; 2169 struct tnode *tn = (struct tnode *)n;
2013 pend = tn->pos+tn->bits; 2170 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3; 2172 indent += 3;
@@ -2017,42 +2174,42 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2017 2174
2018 while (tn && cindex < (1 << tn->bits)) { 2175 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) { 2176 if (tn->child[cindex]) {
2020 2177
2021 /* Got a child */ 2178 /* Got a child */
2022 2179
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); 2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) { 2181 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++; 2182 cindex++;
2026 2183
2027 } 2184 }
2028 else { 2185 else {
2029 /* 2186 /*
 2030 * New tnode. Descend one level 2187 * New tnode. Descend one level
2031 */ 2188 */
2032 2189
2033 depth++; 2190 depth++;
2034 n=tn->child[cindex]; 2191 n = tn->child[cindex];
2035 tn=(struct tnode *)n; 2192 tn = (struct tnode *)n;
2036 pend=tn->pos+tn->bits; 2193 pend = tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); 2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3; 2195 indent+=3;
2039 cindex=0; 2196 cindex=0;
2040 } 2197 }
2041 } 2198 }
2042 else 2199 else
2043 cindex++; 2200 cindex++;
2044 2201
2045 /* 2202 /*
2046 * Test if we are done 2203 * Test if we are done
2047 */ 2204 */
2048 2205
2049 while (cindex >= (1 << tn->bits)) { 2206 while (cindex >= (1 << tn->bits)) {
2050 2207
2051 /* 2208 /*
2052 * Move upwards and test for root 2209 * Move upwards and test for root
2053 * pop off all traversed nodes 2210 * pop off all traversed nodes
2054 */ 2211 */
2055 2212
2056 if (NODE_PARENT(tn) == NULL) { 2213 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL; 2214 tn = NULL;
2058 n = NULL; 2215 n = NULL;
@@ -2062,8 +2219,8 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn); 2220 tn = NODE_PARENT(tn);
2064 cindex++; 2221 cindex++;
2065 n=(struct node *)tn; 2222 n = (struct node *)tn;
2066 pend=tn->pos+tn->bits; 2223 pend = tn->pos+tn->bits;
2067 indent-=3; 2224 indent-=3;
2068 depth--; 2225 depth--;
2069 } 2226 }
@@ -2081,36 +2238,36 @@ static struct trie_stat *trie_stat_new(void)
2081{ 2238{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); 2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i; 2240 int i;
2084 2241
2085 if(s) { 2242 if (s) {
2086 s->totdepth = 0; 2243 s->totdepth = 0;
2087 s->maxdepth = 0; 2244 s->maxdepth = 0;
2088 s->tnodes = 0; 2245 s->tnodes = 0;
2089 s->leaves = 0; 2246 s->leaves = 0;
2090 s->nullpointers = 0; 2247 s->nullpointers = 0;
2091 2248
2092 for(i=0; i< MAX_CHILDS; i++) 2249 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0; 2250 s->nodesizes[i] = 0;
2094 } 2251 }
2095 return s; 2252 return s;
2096} 2253}
2097 2254
2098static struct trie_stat *trie_collect_stats(struct trie *t) 2255static struct trie_stat *trie_collect_stats(struct trie *t)
2099{ 2256{
2100 struct node *n=t->trie; 2257 struct node *n = t->trie;
2101 struct trie_stat *s = trie_stat_new(); 2258 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0; 2259 int cindex = 0;
2103 int indent = 1; 2260 int indent = 1;
2104 int pend = 0; 2261 int pend = 0;
2105 int depth = 0; 2262 int depth = 0;
2106 2263
2107 read_lock(&fib_lock); 2264 read_lock(&fib_lock);
2108 2265
2109 if (s) { 2266 if (s) {
2110 if (n) { 2267 if (n) {
2111 if (IS_TNODE(n)) { 2268 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n; 2269 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits; 2270 pend = tn->pos+tn->bits;
2114 indent += 3; 2271 indent += 3;
2115 s->nodesizes[tn->bits]++; 2272 s->nodesizes[tn->bits]++;
2116 depth++; 2273 depth++;
@@ -2118,26 +2275,26 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2118 while (tn && cindex < (1 << tn->bits)) { 2275 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) { 2276 if (tn->child[cindex]) {
2120 /* Got a child */ 2277 /* Got a child */
2121 2278
2122 if (IS_LEAF(tn->child[cindex])) { 2279 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++; 2280 cindex++;
2124 2281
2125 /* stats */ 2282 /* stats */
2126 if (depth > s->maxdepth) 2283 if (depth > s->maxdepth)
2127 s->maxdepth = depth; 2284 s->maxdepth = depth;
2128 s->totdepth += depth; 2285 s->totdepth += depth;
2129 s->leaves++; 2286 s->leaves++;
2130 } 2287 }
2131 2288
2132 else { 2289 else {
2133 /* 2290 /*
 2134 * New tnode. Descend one level 2291 * New tnode. Descend one level
2135 */ 2292 */
2136 2293
2137 s->tnodes++; 2294 s->tnodes++;
2138 s->nodesizes[tn->bits]++; 2295 s->nodesizes[tn->bits]++;
2139 depth++; 2296 depth++;
2140 2297
2141 n = tn->child[cindex]; 2298 n = tn->child[cindex];
2142 tn = (struct tnode *)n; 2299 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits; 2300 pend = tn->pos+tn->bits;
@@ -2148,13 +2305,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2148 } 2305 }
2149 else { 2306 else {
2150 cindex++; 2307 cindex++;
2151 s->nullpointers++; 2308 s->nullpointers++;
2152 } 2309 }
2153 2310
2154 /* 2311 /*
2155 * Test if we are done 2312 * Test if we are done
2156 */ 2313 */
2157 2314
2158 while (cindex >= (1 << tn->bits)) { 2315 while (cindex >= (1 << tn->bits)) {
2159 2316
2160 /* 2317 /*
@@ -2162,7 +2319,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2162 * pop off all traversed nodes 2319 * pop off all traversed nodes
2163 */ 2320 */
2164 2321
2165 2322
2166 if (NODE_PARENT(tn) == NULL) { 2323 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL; 2324 tn = NULL;
2168 n = NULL; 2325 n = NULL;
@@ -2171,9 +2328,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2171 else { 2328 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); 2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn); 2330 tn = NODE_PARENT(tn);
2174 cindex++; 2331 cindex++;
2175 n = (struct node *)tn; 2332 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits; 2333 pend = tn->pos+tn->bits;
2177 indent -= 3; 2334 indent -= 3;
2178 depth--; 2335 depth--;
2179 } 2336 }
@@ -2184,7 +2341,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t)
2184 } 2341 }
2185 } 2342 }
2186 2343
2187 read_unlock(&fib_lock); 2344 read_unlock(&fib_lock);
2188 return s; 2345 return s;
2189} 2346}
2190 2347
@@ -2220,7 +2377,7 @@ static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2220 2377
2221} 2378}
2222 2379
2223/* 2380/*
2224 * This outputs /proc/net/fib_triestats 2381 * This outputs /proc/net/fib_triestats
2225 * 2382 *
2226 * It always works in backward compatibility mode. 2383 * It always works in backward compatibility mode.
@@ -2246,7 +2403,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2246 avdepth=0; 2403 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); 2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249 2406
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves); 2407 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves; 2408 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); 2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2258,7 +2415,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2258 max--; 2415 max--;
2259 pointers = 0; 2416 pointers = 0;
2260 2417
2261 for (i = 1; i <= max; i++) 2418 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) { 2419 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); 2420 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i]; 2421 pointers += (1<<i) * stat->nodesizes[i];
@@ -2279,6 +2436,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2436 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2437 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2438 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2439 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2440#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2441 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2442#endif
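The statistics output above prints the average depth with a %d.%02d fixed-point trick, keeping the value scaled by 100. A stand-alone illustration follows; the hunk does not show how avdepth is actually computed, so the arithmetic below is an assumption.

#include <stdio.h>

int main(void)
{
	/* e.g. totdepth=731 over 250 leaves -> "Aver depth: 2.92" */
	unsigned int totdepth = 731, leaves = 250;
	unsigned int avdepth = leaves ? totdepth * 100 / leaves : 0;

	printf("Aver depth: %u.%02u\n", avdepth / 100, avdepth % 100);
	return 0;
}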
@@ -2288,30 +2446,30 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2446static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{ 2447{
2290 char bf[128]; 2448 char bf[128];
2291 2449
2292 if (v == SEQ_START_TOKEN) { 2450 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2451 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode)); 2452 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local) 2453 if (trie_local)
2296 collect_and_show(trie_local, seq); 2454 collect_and_show(trie_local, seq);
2297 2455
2298 if (trie_main) 2456 if (trie_main)
2299 collect_and_show(trie_main, seq); 2457 collect_and_show(trie_main, seq);
2300 } 2458 }
2301 else { 2459 else {
2302 snprintf(bf, sizeof(bf), 2460 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400); 2461 "*\t%08X\t%08X", 200, 400);
2304 2462
2305 seq_printf(seq, "%-127s\n", bf); 2463 seq_printf(seq, "%-127s\n", bf);
2306 } 2464 }
2307 return 0; 2465 return 0;
2308} 2466}
2309 2467
2310static struct seq_operations fib_triestat_seq_ops = { 2468static struct seq_operations fib_triestat_seq_ops = {
2311 .start = fib_triestat_seq_start, 2469 .start = fib_triestat_seq_start,
2312 .next = fib_triestat_seq_next, 2470 .next = fib_triestat_seq_next,
2313 .stop = fib_triestat_seq_stop, 2471 .stop = fib_triestat_seq_stop,
2314 .show = fib_triestat_seq_show, 2472 .show = fib_triestat_seq_show,
2315}; 2473};
2316 2474
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file) 2475static int fib_triestat_seq_open(struct inode *inode, struct file *file)
@@ -2323,7 +2481,7 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2323 if (rc) 2481 if (rc)
2324 goto out_kfree; 2482 goto out_kfree;
2325 2483
2326 seq = file->private_data; 2484 seq = file->private_data;
2327out: 2485out:
2328 return rc; 2486 return rc;
2329out_kfree: 2487out_kfree:
@@ -2331,11 +2489,11 @@ out_kfree:
2331} 2489}
2332 2490
2333static struct file_operations fib_triestat_seq_fops = { 2491static struct file_operations fib_triestat_seq_fops = {
2334 .owner = THIS_MODULE, 2492 .owner = THIS_MODULE,
2335 .open = fib_triestat_seq_open, 2493 .open = fib_triestat_seq_open,
2336 .read = seq_read, 2494 .read = seq_read,
2337 .llseek = seq_lseek, 2495 .llseek = seq_lseek,
2338 .release = seq_release_private, 2496 .release = seq_release_private,
2339}; 2497};
2340 2498
2341int __init fib_stat_proc_init(void) 2499int __init fib_stat_proc_init(void)
@@ -2380,7 +2538,7 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2380 2538
2381} 2539}
2382 2540
2383/* 2541/*
2384 * This outputs /proc/net/fib_trie. 2542 * This outputs /proc/net/fib_trie.
2385 * 2543 *
2386 * It always works in backward compatibility mode. 2544 * It always works in backward compatibility mode.
@@ -2392,10 +2550,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2392 char bf[128]; 2550 char bf[128];
2393 2551
2394 if (v == SEQ_START_TOKEN) { 2552 if (v == SEQ_START_TOKEN) {
2395 if (trie_local) 2553 if (trie_local)
2396 trie_dump_seq(seq, trie_local); 2554 trie_dump_seq(seq, trie_local);
2397 2555
2398 if (trie_main) 2556 if (trie_main)
2399 trie_dump_seq(seq, trie_main); 2557 trie_dump_seq(seq, trie_main);
2400 } 2558 }
2401 2559
@@ -2409,10 +2567,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2409} 2567}
2410 2568
2411static struct seq_operations fib_trie_seq_ops = { 2569static struct seq_operations fib_trie_seq_ops = {
2412 .start = fib_trie_seq_start, 2570 .start = fib_trie_seq_start,
2413 .next = fib_trie_seq_next, 2571 .next = fib_trie_seq_next,
2414 .stop = fib_trie_seq_stop, 2572 .stop = fib_trie_seq_stop,
2415 .show = fib_trie_seq_show, 2573 .show = fib_trie_seq_show,
2416}; 2574};
2417 2575
2418static int fib_trie_seq_open(struct inode *inode, struct file *file) 2576static int fib_trie_seq_open(struct inode *inode, struct file *file)
@@ -2424,7 +2582,7 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file)
2424 if (rc) 2582 if (rc)
2425 goto out_kfree; 2583 goto out_kfree;
2426 2584
2427 seq = file->private_data; 2585 seq = file->private_data;
2428out: 2586out:
2429 return rc; 2587 return rc;
2430out_kfree: 2588out_kfree:
@@ -2432,11 +2590,11 @@ out_kfree:
2432} 2590}
2433 2591
2434static struct file_operations fib_trie_seq_fops = { 2592static struct file_operations fib_trie_seq_fops = {
2435 .owner = THIS_MODULE, 2593 .owner = THIS_MODULE,
2436 .open = fib_trie_seq_open, 2594 .open = fib_trie_seq_open,
2437 .read = seq_read, 2595 .read = seq_read,
2438 .llseek = seq_lseek, 2596 .llseek = seq_lseek,
2439 .release = seq_release_private, 2597 .release= seq_release_private,
2440}; 2598};
2441 2599
2442int __init fib_proc_init(void) 2600int __init fib_proc_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb75948497..badfc58499 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -349,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
349{ 349{
350 struct sk_buff *skb; 350 struct sk_buff *skb;
351 351
352 ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, 352 if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
353 icmp_param->data_len+icmp_param->head_len, 353 icmp_param->data_len+icmp_param->head_len,
354 icmp_param->head_len, 354 icmp_param->head_len,
355 ipc, rt, MSG_DONTWAIT); 355 ipc, rt, MSG_DONTWAIT) < 0)
356 356 ip_flush_pending_frames(icmp_socket->sk);
357 if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { 357 else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
358 struct icmphdr *icmph = skb->h.icmph; 358 struct icmphdr *icmph = skb->h.icmph;
359 unsigned int csum = 0; 359 unsigned int csum = 0;
360 struct sk_buff *skb1; 360 struct sk_buff *skb1;
@@ -936,8 +936,7 @@ int icmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_HW: 936 case CHECKSUM_HW:
937 if (!(u16)csum_fold(skb->csum)) 937 if (!(u16)csum_fold(skb->csum))
938 break; 938 break;
939 NETDEBUG(if (net_ratelimit()) 939 LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
940 printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
941 case CHECKSUM_NONE: 940 case CHECKSUM_NONE:
942 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) 941 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
943 goto error; 942 goto error;
@@ -970,7 +969,8 @@ int icmp_rcv(struct sk_buff *skb)
970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently 969 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
971 * discarded if to broadcast/multicast. 970 * discarded if to broadcast/multicast.
972 */ 971 */
973 if (icmph->type == ICMP_ECHO && 972 if ((icmph->type == ICMP_ECHO ||
973 icmph->type == ICMP_TIMESTAMP) &&
974 sysctl_icmp_echo_ignore_broadcasts) { 974 sysctl_icmp_echo_ignore_broadcasts) {
975 goto error; 975 goto error;
976 } 976 }
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a..5088f90835 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{ 1615{
1616 int err; 1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr; 1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i; 1618 struct ip_mc_socklist *iml=NULL, *i;
1619 struct in_device *in_dev; 1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk); 1620 struct inet_sock *inet = inet_sk(sk);
1621 int ifindex;
1621 int count = 0; 1622 int count = 0;
1622 1623
1623 if (!MULTICAST(addr)) 1624 if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1633 goto done; 1634 goto done;
1634 } 1635 }
1635 1636
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE; 1637 err = -EADDRINUSE;
1638 ifindex = imr->imr_ifindex;
1639 for (i = inet->mc_list; i; i = i->next) { 1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { 1640 if (i->multi.imr_multiaddr.s_addr == addr &&
1641 /* New style additions are reference counted */ 1641 i->multi.imr_ifindex == ifindex)
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done; 1642 goto done;
1647 }
1648 count++; 1643 count++;
1649 } 1644 }
1650 err = -ENOBUFS; 1645 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships) 1646 if (count >= sysctl_igmp_max_memberships)
1647 goto done;
1648 iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
1649 if (iml == NULL)
1652 goto done; 1650 goto done;
1651
1653 memcpy(&iml->multi, imr, sizeof(*imr)); 1652 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list; 1653 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL; 1654 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE; 1655 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml; 1656 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr); 1657 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0; 1658 err = 0;
1662
1663done: 1659done:
1664 rtnl_shunlock(); 1660 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err; 1661 return err;
1668} 1662}
1669 1663
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{ 1687{
1694 struct inet_sock *inet = inet_sk(sk); 1688 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp; 1689 struct ip_mc_socklist *iml, **imlp;
1690 struct in_device *in_dev;
1691 u32 group = imr->imr_multiaddr.s_addr;
1692 u32 ifindex;
1696 1693
1697 rtnl_lock(); 1694 rtnl_lock();
1695 in_dev = ip_mc_find_dev(imr);
1696 if (!in_dev) {
1697 rtnl_unlock();
1698 return -ENODEV;
1699 }
1700 ifindex = imr->imr_ifindex;
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1701 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && 1702 if (iml->multi.imr_multiaddr.s_addr == group &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr && 1703 iml->multi.imr_ifindex == ifindex) {
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { 1704 (void) ip_mc_leave_src(sk, iml, in_dev);
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713 1705
1714 *imlp = iml->next; 1706 *imlp = iml->next;
1715 1707
1716 if (in_dev) { 1708 ip_mc_dec_group(in_dev, group);
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock(); 1709 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml)); 1710 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0; 1711 return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1736 struct in_device *in_dev = NULL; 1725 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk); 1726 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl; 1727 struct ip_sf_socklist *psl;
1728 int leavegroup = 0;
1739 int i, j, rv; 1729 int i, j, rv;
1740 1730
1741 if (!MULTICAST(addr)) 1731 if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1755 err = -EADDRNOTAVAIL; 1745 err = -EADDRNOTAVAIL;
1756 1746
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1747 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) 1748 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
1749 && pmc->multi.imr_ifindex == imr.imr_ifindex)
1759 break; 1750 break;
1760 } 1751 }
1761 if (!pmc) /* must have a prior join */ 1752 if (!pmc) { /* must have a prior join */
1753 err = -EINVAL;
1762 goto done; 1754 goto done;
1755 }
1763 /* if a source filter was set, must be the same mode as before */ 1756 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) { 1757 if (pmc->sflist) {
1765 if (pmc->sfmode != omode) 1758 if (pmc->sfmode != omode) {
1759 err = -EINVAL;
1766 goto done; 1760 goto done;
1761 }
1767 } else if (pmc->sfmode != omode) { 1762 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */ 1763 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); 1764 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1775 psl = pmc->sflist; 1770 psl = pmc->sflist;
1776 if (!add) { 1771 if (!add) {
1777 if (!psl) 1772 if (!psl)
1778 goto done; 1773 goto done; /* err = -EADDRNOTAVAIL */
1779 rv = !0; 1774 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) { 1775 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, 1776 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1784 break; 1779 break;
1785 } 1780 }
1786 if (rv) /* source not found */ 1781 if (rv) /* source not found */
1782 goto done; /* err = -EADDRNOTAVAIL */
1783
1784 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1785 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
1786 leavegroup = 1;
1787 goto done; 1787 goto done;
1788 }
1788 1789
1789 /* update the interface filter */ 1790 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 1791 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1872 err = -ENODEV; 1876 err = -ENODEV;
1873 goto done; 1877 goto done;
1874 } 1878 }
1875 err = -EADDRNOTAVAIL; 1879
1880 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1881 if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
1882 leavegroup = 1;
1883 goto done;
1884 }
1876 1885
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1886 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 1887 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex) 1888 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break; 1889 break;
1881 } 1890 }
1882 if (!pmc) /* must have a prior join */ 1891 if (!pmc) { /* must have a prior join */
1892 err = -EINVAL;
1883 goto done; 1893 goto done;
1894 }
1884 if (msf->imsf_numsrc) { 1895 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, 1896 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); 1897 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1909 0, NULL, 0); 1920 0, NULL, 0);
1910 pmc->sflist = newpsl; 1921 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode; 1922 pmc->sfmode = msf->imsf_fmode;
1923 err = 0;
1912done: 1924done:
1913 rtnl_shunlock(); 1925 rtnl_shunlock();
1926 if (leavegroup)
1927 err = ip_mc_leave_group(sk, &imr);
1914 return err; 1928 return err;
1915} 1929}
1916 1930
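Two behavioural points in the igmp.c hunks above are easy to miss: a socket's memberships are now matched on the (multicast group, interface index) pair instead of a raw memcmp of the request, and a source-filter request that leaves an INCLUDE-mode filter empty is treated as a full group leave (the "(INCLUDE, empty) == LEAVE_GROUP" special case). The userspace sketch below only mirrors that decision logic; the struct and helper names are invented for illustration and are not kernel API.

/* Minimal userspace sketch of the matching and leave-group rules from the
 * igmp.c hunks above.  Types and values are illustrative assumptions. */
#include <stdio.h>
#include <stdint.h>

#define MCAST_INCLUDE 1
#define MCAST_EXCLUDE 2

struct membership {
	uint32_t group;    /* multicast address (network order in the kernel) */
	int      ifindex;  /* interface the join was made on */
	int      fmode;    /* MCAST_INCLUDE or MCAST_EXCLUDE */
	int      numsrc;   /* number of sources in the filter */
};

/* A membership matches a request only if both group and ifindex agree. */
static int matches(const struct membership *m, uint32_t group, int ifindex)
{
	return m->group == group && m->ifindex == ifindex;
}

/* An INCLUDE filter with no sources admits nothing, so it is a leave. */
static int is_effectively_leave(int fmode, int numsrc)
{
	return fmode == MCAST_INCLUDE && numsrc == 0;
}

int main(void)
{
	struct membership m = { 0xe0000101, 2, MCAST_INCLUDE, 1 };

	printf("matches on (group, ifindex): %d\n", matches(&m, 0xe0000101, 2));
	printf("INCLUDE with empty source list -> leave: %d\n",
	       is_effectively_leave(MCAST_INCLUDE, 0));
	return 0;
}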
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c4..ab18a853d7 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -450,10 +450,13 @@ static void peer_check_expire(unsigned long dummy)
450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 450 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
451 * interval depending on the total number of entries (more entries, 451 * interval depending on the total number of entries (more entries,
452 * less interval). */ 452 * less interval). */
453 peer_periodic_timer.expires = jiffies 453 if (peer_total >= inet_peer_threshold)
454 + inet_peer_gc_maxtime 454 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
455 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 455 else
456 peer_total / inet_peer_threshold * HZ; 456 peer_periodic_timer.expires = jiffies
457 + inet_peer_gc_maxtime
458 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
459 peer_total / inet_peer_threshold * HZ;
457 add_timer(&peer_periodic_timer); 460 add_timer(&peer_periodic_timer);
458} 461}
459 462
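The inetpeer.c hunk changes how the garbage-collection timer is rearmed: once the pool reaches inet_peer_threshold the timer fires again after the minimum interval, otherwise the delay is interpolated between the maximum and minimum according to how full the pool is. A standalone sketch of that arithmetic follows; HZ and the tunable values are made-up examples, not the kernel defaults.

/* Standalone sketch of the rearm logic in peer_check_expire() above. */
#include <stdio.h>

#define HZ 1000
static const unsigned long inet_peer_gc_mintime = 10 * HZ;   /* jiffies */
static const unsigned long inet_peer_gc_maxtime = 120 * HZ;  /* jiffies */
static const int inet_peer_threshold = 65536;

/* Returns the delay (in jiffies) before the next GC pass. */
static unsigned long next_gc_delay(int peer_total)
{
	if (peer_total >= inet_peer_threshold)
		return inet_peer_gc_mintime;

	/* Same shape as the kernel expression: divide by HZ first, then
	 * scale back up, so the intermediate product stays small. */
	return inet_peer_gc_maxtime
	       - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
	         peer_total / inet_peer_threshold * HZ;
}

int main(void)
{
	int totals[] = { 0, 16384, 65536, 100000 };

	for (int i = 0; i < 4; i++)
		printf("peer_total=%6d -> delay=%lu jiffies\n",
		       totals[i], next_gc_delay(totals[i]));
	return 0;
}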
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27eb4..eb377ae153 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
377 return ip_frag_intern(hash, qp); 377 return ip_frag_intern(hash, qp);
378 378
379out_nomem: 379out_nomem:
380 NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); 380 LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
381 return NULL; 381 return NULL;
382} 382}
383 383
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
625 return head; 625 return head;
626 626
627out_nomem: 627out_nomem:
628 NETDEBUG(if (net_ratelimit()) 628 LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
629 printk(KERN_ERR 629 "queue %p\n", qp));
630 "IP: queue_glue: no memory for gluing queue %p\n",
631 qp));
632 goto out_fail; 630 goto out_fail;
633out_oversize: 631out_oversize:
634 if (net_ratelimit()) 632 if (net_ratelimit())
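The ip_fragment.c hunks replace the open-coded "NETDEBUG(if (net_ratelimit()) printk(...))" pattern with a single LIMIT_NETDEBUG() wrapper. The sketch below shows what such a rate-limited debug macro can look like in plain C; the one-message-per-second limiter is a stand-in assumption, not a copy of the kernel's net_ratelimit().

/* Sketch of a rate-limited debug macro in the spirit of LIMIT_NETDEBUG. */
#include <stdio.h>
#include <time.h>

static int debug_ratelimit(void)
{
	static time_t last;
	time_t now = time(NULL);

	if (now == last)
		return 0;	/* already printed something this second */
	last = now;
	return 1;
}

#define LIMIT_DEBUG(fmt, ...)						\
	do {								\
		if (debug_ratelimit())					\
			fprintf(stderr, fmt, __VA_ARGS__);		\
	} while (0)

int main(void)
{
	for (int i = 0; i < 1000; i++)
		LIMIT_DEBUG("ip_frag_create: no memory left (iteration %d)\n", i);
	return 0;
}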
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8848355222..f0d5740d7e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
290 290
291 dev_hold(dev); 291 dev_hold(dev);
292 ipgre_tunnel_link(nt); 292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt; 293 return nt;
295 294
296failed: 295failed:
@@ -1277,12 +1276,28 @@ err1:
1277 goto out; 1276 goto out;
1278} 1277}
1279 1278
1280static void ipgre_fini(void) 1279static void __exit ipgre_destroy_tunnels(void)
1280{
1281 int prio;
1282
1283 for (prio = 0; prio < 4; prio++) {
1284 int h;
1285 for (h = 0; h < HASH_SIZE; h++) {
1286 struct ip_tunnel *t;
1287 while ((t = tunnels[prio][h]) != NULL)
1288 unregister_netdevice(t->dev);
1289 }
1290 }
1291}
1292
1293static void __exit ipgre_fini(void)
1281{ 1294{
1282 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1295 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1296 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1284 1297
1285 unregister_netdev(ipgre_fb_tunnel_dev); 1298 rtnl_lock();
1299 ipgre_destroy_tunnels();
1300 rtnl_unlock();
1286} 1301}
1287 1302
1288module_init(ipgre_init); 1303module_init(ipgre_init);
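ipgre_destroy_tunnels() above walks every hash chain and repeatedly unregisters whatever device is currently at the head; that terminates because unregistering a tunnel unlinks it from the chain, so the head pointer advances each iteration. A minimal userspace analogue of this "pop the head until empty" teardown, with invented list types, is:

/* Userspace analogue of the teardown loop in ipgre_destroy_tunnels(). */
#include <stdio.h>
#include <stdlib.h>

#define HASH_SIZE 4

struct node {
	int id;
	struct node *next;
};

static struct node *chains[HASH_SIZE];

static void unregister_node(struct node *n)
{
	chains[n->id % HASH_SIZE] = n->next;	/* unlink the current head */
	printf("unregistered node %d\n", n->id);
	free(n);
}

static void destroy_all(void)
{
	for (int h = 0; h < HASH_SIZE; h++) {
		struct node *n;

		while ((n = chains[h]) != NULL)
			unregister_node(n);
	}
}

int main(void)
{
	for (int i = 0; i < 8; i++) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			return 1;
		n->id = i;
		n->next = chains[i % HASH_SIZE];
		chains[i % HASH_SIZE] = n;
	}
	destroy_all();
	return 0;
}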
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb..c703528e0b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215..80d13103b2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 110 netif_rx(newskb);
112 return 0; 111 return 0;
113} 112}
@@ -188,8 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 187 skb = skb2;
189 } 188 }
190 189
191 nf_reset(skb);
192
193 if (hh) { 190 if (hh) {
194 int hh_alen; 191 int hh_alen;
195 192
@@ -383,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 380 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 381 to->priority = from->priority;
385 to->protocol = from->protocol; 382 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 383 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 384 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 385 to->dev = from->dev;
@@ -1323,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1319 ip_rt_put(rt);
1324} 1320}
1325 1321
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1322void __init ip_init(void)
1340{ 1323{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1324 ip_rt_init();
1344 inet_initpeers(); 1325 inet_initpeers();
1345 1326
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f898..ff4bd067b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
@@ -848,6 +848,9 @@ mc_msf_out:
848 848
849 case IP_IPSEC_POLICY: 849 case IP_IPSEC_POLICY:
850 case IP_XFRM_POLICY: 850 case IP_XFRM_POLICY:
851 err = -EPERM;
852 if (!capable(CAP_NET_ADMIN))
853 break;
851 err = xfrm_user_policy(sk, optname, optval, optlen); 854 err = xfrm_user_policy(sk, optname, optval, optlen);
852 break; 855 break;
853 856
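In the ip_sockglue.c hunks, joining the group before adding a source filter now tolerates -EADDRINUSE: if the socket is already a member, that is not a failure, only other errors abort the operation. A tiny sketch of that error-tolerant join pattern, with a stub join_group() standing in for the kernel helper:

/* "Already joined is not an error" pattern from the hunks above. */
#include <errno.h>
#include <stdio.h>

static int join_group(int already_member)
{
	return already_member ? -EADDRINUSE : 0;
}

static int join_then_add_source(int already_member)
{
	int err = join_group(already_member);

	if (err && err != -EADDRINUSE)
		return err;		/* a real failure */
	/* ... continue and install the source filter ... */
	return 0;
}

int main(void)
{
	printf("fresh join:     %d\n", join_then_add_source(0));
	printf("already member: %d\n", join_then_add_source(1));
	return 0;
}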
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2065944fd9..7ded6e60f4 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -358,7 +358,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
358 int cpu; 358 int cpu;
359 359
360 /* This can be any valid CPU ID so we don't need locking. */ 360 /* This can be any valid CPU ID so we don't need locking. */
361 cpu = smp_processor_id(); 361 cpu = raw_smp_processor_id();
362 362
363 list_for_each_entry(pos, &ipcomp_tfms_list, list) { 363 list_for_each_entry(pos, &ipcomp_tfms_list, list) {
364 struct crypto_tfm *tfm; 364 struct crypto_tfm *tfm;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce..d2bf8e1930 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 68a78731f7..c05c1df0bb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
255 255
256 dev_hold(dev); 256 dev_hold(dev);
257 ipip_tunnel_link(nt); 257 ipip_tunnel_link(nt);
258 /* Do not decrement MOD_USE_COUNT here. */
259 return nt; 258 return nt;
260 259
261failed: 260failed:
@@ -273,7 +272,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
273 dev_put(dev); 272 dev_put(dev);
274} 273}
275 274
276static void ipip_err(struct sk_buff *skb, void *__unused) 275static void ipip_err(struct sk_buff *skb, u32 info)
277{ 276{
278#ifndef I_WISH_WORLD_WERE_PERFECT 277#ifndef I_WISH_WORLD_WERE_PERFECT
279 278
@@ -852,11 +851,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev)
852 return 0; 851 return 0;
853} 852}
854 853
854#ifdef CONFIG_INET_TUNNEL
855static struct xfrm_tunnel ipip_handler = { 855static struct xfrm_tunnel ipip_handler = {
856 .handler = ipip_rcv, 856 .handler = ipip_rcv,
857 .err_handler = ipip_err, 857 .err_handler = ipip_err,
858}; 858};
859 859
860static inline int ipip_register(void)
861{
862 return xfrm4_tunnel_register(&ipip_handler);
863}
864
865static inline int ipip_unregister(void)
866{
867 return xfrm4_tunnel_deregister(&ipip_handler);
868}
869#else
870static struct net_protocol ipip_protocol = {
871 .handler = ipip_rcv,
872 .err_handler = ipip_err,
873 .no_policy = 1,
874};
875
876static inline int ipip_register(void)
877{
878 return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP);
879}
880
881static inline int ipip_unregister(void)
882{
883 return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP);
884}
885#endif
886
860static char banner[] __initdata = 887static char banner[] __initdata =
861 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 888 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
862 889
@@ -866,7 +893,7 @@ static int __init ipip_init(void)
866 893
867 printk(banner); 894 printk(banner);
868 895
869 if (xfrm4_tunnel_register(&ipip_handler) < 0) { 896 if (ipip_register() < 0) {
870 printk(KERN_INFO "ipip init: can't register tunnel\n"); 897 printk(KERN_INFO "ipip init: can't register tunnel\n");
871 return -EAGAIN; 898 return -EAGAIN;
872 } 899 }
@@ -888,16 +915,33 @@ static int __init ipip_init(void)
888 err2: 915 err2:
889 free_netdev(ipip_fb_tunnel_dev); 916 free_netdev(ipip_fb_tunnel_dev);
890 err1: 917 err1:
891 xfrm4_tunnel_deregister(&ipip_handler); 918 ipip_unregister();
892 goto out; 919 goto out;
893} 920}
894 921
922static void __exit ipip_destroy_tunnels(void)
923{
924 int prio;
925
926 for (prio = 1; prio < 4; prio++) {
927 int h;
928 for (h = 0; h < HASH_SIZE; h++) {
929 struct ip_tunnel *t;
930 while ((t = tunnels[prio][h]) != NULL)
931 unregister_netdevice(t->dev);
932 }
933 }
934}
935
895static void __exit ipip_fini(void) 936static void __exit ipip_fini(void)
896{ 937{
897 if (xfrm4_tunnel_deregister(&ipip_handler) < 0) 938 if (ipip_unregister() < 0)
898 printk(KERN_INFO "ipip close: can't deregister tunnel\n"); 939 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
899 940
900 unregister_netdev(ipip_fb_tunnel_dev); 941 rtnl_lock();
942 ipip_destroy_tunnels();
943 unregister_netdevice(ipip_fb_tunnel_dev);
944 rtnl_unlock();
901} 945}
902 946
903module_init(ipip_init); 947module_init(ipip_init);
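The ipip.c change hides protocol registration behind ipip_register()/ipip_unregister(): with CONFIG_INET_TUNNEL the driver attaches as an xfrm4 tunnel handler, otherwise it registers a plain inet protocol for IPPROTO_IPIP. The sketch below shows the same compile-time selection shape with printf stubs in place of the kernel interfaces; the macro name is an invented example.

/* Compile-time selection between two registration back ends. */
#include <stdio.h>

#ifdef USE_XFRM_TUNNEL
static inline int backend_register(void)
{
	printf("registering as xfrm tunnel handler\n");
	return 0;
}
#else
static inline int backend_register(void)
{
	printf("registering as plain IPPROTO_IPIP protocol handler\n");
	return 0;
}
#endif

int main(void)
{
	/* Build with -DUSE_XFRM_TUNNEL to take the other branch. */
	return backend_register();
}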
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f..dc806b5784 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -359,7 +362,7 @@ out:
359 362
360/* Fill oifs list. It is called under write locked mrt_lock. */ 363/* Fill oifs list. It is called under write locked mrt_lock. */
361 364
362static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) 365static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
363{ 366{
364 int vifi; 367 int vifi;
365 368
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
@@ -721,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
721 if (c != NULL) { 727 if (c != NULL) {
722 write_lock_bh(&mrt_lock); 728 write_lock_bh(&mrt_lock);
723 c->mfc_parent = mfc->mfcc_parent; 729 c->mfc_parent = mfc->mfcc_parent;
724 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 730 ipmr_update_thresholds(c, mfc->mfcc_ttls);
725 if (!mrtsock) 731 if (!mrtsock)
726 c->mfc_flags |= MFC_STATIC; 732 c->mfc_flags |= MFC_STATIC;
727 write_unlock_bh(&mrt_lock); 733 write_unlock_bh(&mrt_lock);
@@ -738,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
738 c->mfc_origin=mfc->mfcc_origin.s_addr; 744 c->mfc_origin=mfc->mfcc_origin.s_addr;
739 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; 745 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 c->mfc_parent=mfc->mfcc_parent; 746 c->mfc_parent=mfc->mfcc_parent;
741 ipmr_update_threshoulds(c, mfc->mfcc_ttls); 747 ipmr_update_thresholds(c, mfc->mfcc_ttls);
742 if (!mrtsock) 748 if (!mrtsock)
743 c->mfc_flags |= MFC_STATIC; 749 c->mfc_flags |= MFC_STATIC;
744 750
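When ipmr.c answers queued requests with a NLMSG_ERROR, the embedded copy of the request header is now explicitly zeroed (the added memset of e->msg), so no stale skb bytes travel back to userspace. A simplified sketch of building such an error payload, using stand-in structs rather than struct nlmsghdr/struct nlmsgerr:

/* Sketch: error code plus a zeroed embedded header, so nothing stale leaks. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct msg_hdr { unsigned int len, type, flags, seq, pid; };
struct msg_err { int error; struct msg_hdr msg; };

static void fill_error(struct msg_err *e, int error)
{
	e->error = error;
	memset(&e->msg, 0, sizeof(e->msg));	/* no uninitialised bytes */
}

int main(void)
{
	struct msg_err e;

	fill_error(&e, -ETIMEDOUT);
	printf("error=%d type=%u seq=%u\n", e.error, e.msg.type, e.msg.seq);
	return 0;
}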
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64..c9820bfc49 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on cluster of two or more real servers. This 12 virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499..d0145a8b15 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
759 return 1; 758 return 1;
760} 759}
761 760
762 761/* Called from keventd and must protect itself from softirqs */
763void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
775 /* 773 /*
776 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
777 */ 775 */
778 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
779 777
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
821 } 810 }
822} 811}
823 812
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d970103..7d99ede2ef 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2062 dst->addr = src->addr;
2060 dst->port = src->port; 2063 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2064 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2066 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2067 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2068 dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2084 if (count >= get->num_services)
2082 goto out; 2085 goto out;
2086 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2087 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2088 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2089 &entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2099 if (count >= get->num_services)
2096 goto out; 2100 goto out;
2101 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2102 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2103 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2104 &entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2309 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2311 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2312 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2313 d[0].syncid = ip_vs_master_syncid;
2309 } 2314 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2316 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2317 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2318 d[1].syncid = ip_vs_backup_syncid;
2314 } 2319 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
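The ip_vs_ctl.c hunks carry two hardening patterns: strings copied into fixed-size fields now use strlcpy() so a long scheduler or interface name cannot overrun the destination, and each service entry is memset() before being filled and copied to userspace so padding bytes never expose kernel stack contents. A userspace sketch of both ideas follows; snprintf() is used as a portable stand-in for strlcpy(), and the struct is an invented example.

/* Bounded copy into a fixed field, and zeroing before "copy out". */
#include <stdio.h>
#include <string.h>

struct service_entry {
	char sched_name[16];
	unsigned int port;
	/* implicit padding may exist here on some ABIs */
	unsigned long timeout;
};

static void copy_service(struct service_entry *dst, const char *sched,
			 unsigned int port, unsigned long timeout)
{
	memset(dst, 0, sizeof(*dst));		/* clears padding as well */
	snprintf(dst->sched_name, sizeof(dst->sched_name), "%s", sched);
	dst->port = port;
	dst->timeout = timeout;
}

int main(void)
{
	struct service_entry e;

	copy_service(&e, "weighted-round-robin-with-a-long-name", 80, 300);
	printf("sched=%s port=%u timeout=%lu\n", e.sched_name, e.port, e.timeout);
	return 0;
}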
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a..574d1f509b 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a78a320eee..01e1b58322 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -101,14 +101,13 @@ static int help(struct sk_buff **pskb,
101 if (port == 0 || len > 5) 101 if (port == 0 || len > 5)
102 break; 102 break;
103 103
104 exp = ip_conntrack_expect_alloc(); 104 exp = ip_conntrack_expect_alloc(ct);
105 if (exp == NULL) { 105 if (exp == NULL) {
106 ret = NF_DROP; 106 ret = NF_DROP;
107 goto out; 107 goto out;
108 } 108 }
109 109
110 exp->expectfn = NULL; 110 exp->expectfn = NULL;
111 exp->master = ct;
112 111
113 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; 112 exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
114 exp->tuple.src.u.tcp.port = 0; 113 exp->tuple.src.u.tcp.port = 0;
@@ -126,10 +125,9 @@ static int help(struct sk_buff **pskb,
126 ret = ip_nat_amanda_hook(pskb, ctinfo, 125 ret = ip_nat_amanda_hook(pskb, ctinfo,
127 tmp - amanda_buffer, 126 tmp - amanda_buffer,
128 len, exp); 127 len, exp);
129 else if (ip_conntrack_expect_related(exp) != 0) { 128 else if (ip_conntrack_expect_related(exp) != 0)
130 ip_conntrack_expect_free(exp);
131 ret = NF_DROP; 129 ret = NF_DROP;
132 } 130 ip_conntrack_expect_put(exp);
133 } 131 }
134 132
135out: 133out:
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4b78ebeb66..a7f0c821a9 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -137,19 +137,12 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
137 137
138 138
139/* ip_conntrack_expect helper functions */ 139/* ip_conntrack_expect helper functions */
140static void destroy_expect(struct ip_conntrack_expect *exp)
141{
142 ip_conntrack_put(exp->master);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 kmem_cache_free(ip_conntrack_expect_cachep, exp);
145 CONNTRACK_STAT_INC(expect_delete);
146}
147
148static void unlink_expect(struct ip_conntrack_expect *exp) 140static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 141{
150 ASSERT_WRITE_LOCK(&ip_conntrack_lock); 142 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
151 list_del(&exp->list); 144 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 145 CONNTRACK_STAT_INC(expect_delete);
153 exp->master->expecting--; 146 exp->master->expecting--;
154} 147}
155 148
@@ -160,7 +153,7 @@ static void expectation_timed_out(unsigned long ul_expect)
160 write_lock_bh(&ip_conntrack_lock); 153 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 154 unlink_expect(exp);
162 write_unlock_bh(&ip_conntrack_lock); 155 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 156 ip_conntrack_expect_put(exp);
164} 157}
165 158
166/* If an expectation for this connection is found, it gets delete from 159/* If an expectation for this connection is found, it gets delete from
@@ -198,7 +191,7 @@ static void remove_expectations(struct ip_conntrack *ct)
198 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { 191 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
199 if (i->master == ct && del_timer(&i->timeout)) { 192 if (i->master == ct && del_timer(&i->timeout)) {
200 unlink_expect(i); 193 unlink_expect(i);
201 destroy_expect(i); 194 ip_conntrack_expect_put(i);
202 } 195 }
203 } 196 }
204} 197}
@@ -517,9 +510,14 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
517 /* Welcome, Mr. Bond. We've been expecting you... */ 510 /* Welcome, Mr. Bond. We've been expecting you... */
518 __set_bit(IPS_EXPECTED_BIT, &conntrack->status); 511 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
519 conntrack->master = exp->master; 512 conntrack->master = exp->master;
520#if CONFIG_IP_NF_CONNTRACK_MARK 513#ifdef CONFIG_IP_NF_CONNTRACK_MARK
521 conntrack->mark = exp->master->mark; 514 conntrack->mark = exp->master->mark;
522#endif 515#endif
516#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
517 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
518 /* this is ugly, but there is no other place where to put it */
519 conntrack->nat.masq_index = exp->master->nat.masq_index;
520#endif
523 nf_conntrack_get(&conntrack->master->ct_general); 521 nf_conntrack_get(&conntrack->master->ct_general);
524 CONNTRACK_STAT_INC(expect_new); 522 CONNTRACK_STAT_INC(expect_new);
525 } else { 523 } else {
@@ -537,7 +535,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
537 if (exp) { 535 if (exp) {
538 if (exp->expectfn) 536 if (exp->expectfn)
539 exp->expectfn(conntrack, exp); 537 exp->expectfn(conntrack, exp);
540 destroy_expect(exp); 538 ip_conntrack_expect_put(exp);
541 } 539 }
542 540
543 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; 541 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -729,14 +727,14 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 727 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 728 unlink_expect(i);
731 write_unlock_bh(&ip_conntrack_lock); 729 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 730 ip_conntrack_expect_put(i);
733 return; 731 return;
734 } 732 }
735 } 733 }
736 write_unlock_bh(&ip_conntrack_lock); 734 write_unlock_bh(&ip_conntrack_lock);
737} 735}
738 736
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 737struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
740{ 738{
741 struct ip_conntrack_expect *new; 739 struct ip_conntrack_expect *new;
742 740
@@ -745,18 +743,23 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
745 DEBUGP("expect_related: OOM allocating expect\n"); 743 DEBUGP("expect_related: OOM allocating expect\n");
746 return NULL; 744 return NULL;
747 } 745 }
748 new->master = NULL; 746 new->master = me;
747 atomic_inc(&new->master->ct_general.use);
748 atomic_set(&new->use, 1);
749 return new; 749 return new;
750} 750}
751 751
752void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) 752void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
753{ 753{
754 kmem_cache_free(ip_conntrack_expect_cachep, expect); 754 if (atomic_dec_and_test(&exp->use)) {
755 ip_conntrack_put(exp->master);
756 kmem_cache_free(ip_conntrack_expect_cachep, exp);
757 }
755} 758}
756 759
757static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) 760static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
758{ 761{
759 atomic_inc(&exp->master->ct_general.use); 762 atomic_inc(&exp->use);
760 exp->master->expecting++; 763 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 764 list_add(&exp->list, &ip_conntrack_expect_list);
762 765
@@ -778,7 +781,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
778 if (i->master == master) { 781 if (i->master == master) {
779 if (del_timer(&i->timeout)) { 782 if (del_timer(&i->timeout)) {
780 unlink_expect(i); 783 unlink_expect(i);
781 destroy_expect(i); 784 ip_conntrack_expect_put(i);
782 } 785 }
783 break; 786 break;
784 } 787 }
@@ -810,8 +813,6 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
810 /* Refresh timer: if it's dying, ignore.. */ 813 /* Refresh timer: if it's dying, ignore.. */
811 if (refresh_timer(i)) { 814 if (refresh_timer(i)) {
812 ret = 0; 815 ret = 0;
813 /* We don't need the one they've given us. */
814 ip_conntrack_expect_free(expect);
815 goto out; 816 goto out;
816 } 817 }
817 } else if (expect_clash(i, expect)) { 818 } else if (expect_clash(i, expect)) {
@@ -881,7 +882,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
881 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { 882 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
882 if (exp->master->helper == me && del_timer(&exp->timeout)) { 883 if (exp->master->helper == me && del_timer(&exp->timeout)) {
883 unlink_expect(exp); 884 unlink_expect(exp);
884 destroy_expect(exp); 885 ip_conntrack_expect_put(exp);
885 } 886 }
886 } 887 }
887 /* Get rid of expecteds, set helpers to NULL. */ 888 /* Get rid of expecteds, set helpers to NULL. */
@@ -1111,6 +1112,9 @@ void ip_conntrack_cleanup(void)
1111 schedule(); 1112 schedule();
1112 goto i_see_dead_people; 1113 goto i_see_dead_people;
1113 } 1114 }
1115 /* wait until all references to ip_conntrack_untracked are dropped */
1116 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1117 schedule();
1114 1118
1115 kmem_cache_destroy(ip_conntrack_cachep); 1119 kmem_cache_destroy(ip_conntrack_cachep);
1116 kmem_cache_destroy(ip_conntrack_expect_cachep); 1120 kmem_cache_destroy(ip_conntrack_expect_cachep);
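The ip_conntrack_core.c hunks move expectations to reference counting: ip_conntrack_expect_alloc() pins the master connection and starts the object at use == 1, insertion takes another reference, and ip_conntrack_expect_put() frees the expectation (and drops the master) only on the final put, which is why the helpers no longer need a separate ip_conntrack_expect_free(). A compact userspace sketch of that get/put scheme, with invented names and C11 atomics:

/* Userspace sketch of the refcounting pattern adopted above. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct expect {
	atomic_int use;
	int id;
};

static struct expect *expect_alloc(int id)
{
	struct expect *e = malloc(sizeof(*e));

	if (!e)
		return NULL;
	atomic_init(&e->use, 1);	/* creator's reference */
	e->id = id;
	return e;
}

static void expect_get(struct expect *e)
{
	atomic_fetch_add(&e->use, 1);
}

static void expect_put(struct expect *e)
{
	if (atomic_fetch_sub(&e->use, 1) == 1) {	/* that was the last ref */
		printf("freeing expectation %d\n", e->id);
		free(e);
	}
}

int main(void)
{
	struct expect *e = expect_alloc(42);

	if (!e)
		return 1;
	expect_get(e);	/* e.g. inserted on a list */
	expect_put(e);	/* creator drops its reference */
	expect_put(e);	/* list removal drops the last one: freed here */
	return 0;
}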
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index fea6dd2a00..7a3b773be3 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -376,7 +376,7 @@ static int help(struct sk_buff **pskb,
376 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); 376 fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
377 377
378 /* Allocate expectation which will be inserted */ 378 /* Allocate expectation which will be inserted */
379 exp = ip_conntrack_expect_alloc(); 379 exp = ip_conntrack_expect_alloc(ct);
380 if (exp == NULL) { 380 if (exp == NULL) {
381 ret = NF_DROP; 381 ret = NF_DROP;
382 goto out; 382 goto out;
@@ -403,8 +403,7 @@ static int help(struct sk_buff **pskb,
403 networks, or the packet filter itself). */ 403 networks, or the packet filter itself). */
404 if (!loose) { 404 if (!loose) {
405 ret = NF_ACCEPT; 405 ret = NF_ACCEPT;
406 ip_conntrack_expect_free(exp); 406 goto out_put_expect;
407 goto out_update_nl;
408 } 407 }
409 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) 408 exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
410 | (array[2] << 8) | array[3]); 409 | (array[2] << 8) | array[3]);
@@ -419,7 +418,6 @@ static int help(struct sk_buff **pskb,
419 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); 418 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
420 419
421 exp->expectfn = NULL; 420 exp->expectfn = NULL;
422 exp->master = ct;
423 421
424 /* Now, NAT might want to mangle the packet, and register the 422 /* Now, NAT might want to mangle the packet, and register the
425 * (possibly changed) expectation itself. */ 423 * (possibly changed) expectation itself. */
@@ -428,13 +426,15 @@ static int help(struct sk_buff **pskb,
428 matchoff, matchlen, exp, &seq); 426 matchoff, matchlen, exp, &seq);
429 else { 427 else {
430 /* Can't expect this? Best to drop packet now. */ 428 /* Can't expect this? Best to drop packet now. */
431 if (ip_conntrack_expect_related(exp) != 0) { 429 if (ip_conntrack_expect_related(exp) != 0)
432 ip_conntrack_expect_free(exp);
433 ret = NF_DROP; 430 ret = NF_DROP;
434 } else 431 else
435 ret = NF_ACCEPT; 432 ret = NF_ACCEPT;
436 } 433 }
437 434
435out_put_expect:
436 ip_conntrack_expect_put(exp);
437
438out_update_nl: 438out_update_nl:
439 /* Now if this ends in \n, update ftp info. Seq may have been 439 /* Now if this ends in \n, update ftp info. Seq may have been
440 * adjusted by NAT code. */ 440 * adjusted by NAT code. */
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index cd98772cc3..4a28f297d5 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -197,7 +197,7 @@ static int help(struct sk_buff **pskb,
197 continue; 197 continue;
198 } 198 }
199 199
200 exp = ip_conntrack_expect_alloc(); 200 exp = ip_conntrack_expect_alloc(ct);
201 if (exp == NULL) { 201 if (exp == NULL) {
202 ret = NF_DROP; 202 ret = NF_DROP;
203 goto out; 203 goto out;
@@ -221,16 +221,14 @@ static int help(struct sk_buff **pskb,
221 { { 0, { 0 } }, 221 { { 0, { 0 } },
222 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); 222 { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
223 exp->expectfn = NULL; 223 exp->expectfn = NULL;
224 exp->master = ct;
225 if (ip_nat_irc_hook) 224 if (ip_nat_irc_hook)
226 ret = ip_nat_irc_hook(pskb, ctinfo, 225 ret = ip_nat_irc_hook(pskb, ctinfo,
227 addr_beg_p - ib_ptr, 226 addr_beg_p - ib_ptr,
228 addr_end_p - addr_beg_p, 227 addr_end_p - addr_beg_p,
229 exp); 228 exp);
230 else if (ip_conntrack_expect_related(exp) != 0) { 229 else if (ip_conntrack_expect_related(exp) != 0)
231 ip_conntrack_expect_free(exp);
232 ret = NF_DROP; 230 ret = NF_DROP;
233 } 231 ip_conntrack_expect_put(exp);
234 goto out; 232 goto out;
235 } /* for .. NUM_DCCPROTO */ 233 } /* for .. NUM_DCCPROTO */
236 } /* while data < ... */ 234 } /* while data < ... */
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc951028..61798c46e9 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
432 const struct net_device *out, 432 const struct net_device *out,
433 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
434{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
435 /* Gather fragments. */ 442 /* Gather fragments. */
436 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
437 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
@@ -978,7 +985,7 @@ EXPORT_SYMBOL(ip_ct_refresh_acct);
978EXPORT_SYMBOL(ip_ct_protos); 985EXPORT_SYMBOL(ip_ct_protos);
979EXPORT_SYMBOL(ip_ct_find_proto); 986EXPORT_SYMBOL(ip_ct_find_proto);
980EXPORT_SYMBOL(ip_conntrack_expect_alloc); 987EXPORT_SYMBOL(ip_conntrack_expect_alloc);
981EXPORT_SYMBOL(ip_conntrack_expect_free); 988EXPORT_SYMBOL(ip_conntrack_expect_put);
982EXPORT_SYMBOL(ip_conntrack_expect_related); 989EXPORT_SYMBOL(ip_conntrack_expect_related);
983EXPORT_SYMBOL(ip_conntrack_unexpect_related); 990EXPORT_SYMBOL(ip_conntrack_unexpect_related);
984EXPORT_SYMBOL(ip_conntrack_tuple_taken); 991EXPORT_SYMBOL(ip_conntrack_tuple_taken);
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3e36..f8ff170f39 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 65 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 66 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
67 67
68 exp = ip_conntrack_expect_alloc(); 68 exp = ip_conntrack_expect_alloc(ct);
69 if (exp == NULL) 69 if (exp == NULL)
70 return NF_DROP; 70 return NF_DROP;
71 71
@@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb,
75 exp->mask.dst.u.udp.port = 0xffff; 75 exp->mask.dst.u.udp.port = 0xffff;
76 exp->mask.dst.protonum = 0xff; 76 exp->mask.dst.protonum = 0xff;
77 exp->expectfn = NULL; 77 exp->expectfn = NULL;
78 exp->master = ct;
79 78
80 DEBUGP("expect: "); 79 DEBUGP("expect: ");
81 DUMP_TUPLE(&exp->tuple); 80 DUMP_TUPLE(&exp->tuple);
82 DUMP_TUPLE(&exp->mask); 81 DUMP_TUPLE(&exp->mask);
83 if (ip_nat_tftp_hook) 82 if (ip_nat_tftp_hook)
84 ret = ip_nat_tftp_hook(pskb, ctinfo, exp); 83 ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
85 else if (ip_conntrack_expect_related(exp) != 0) { 84 else if (ip_conntrack_expect_related(exp) != 0)
86 ip_conntrack_expect_free(exp);
87 ret = NF_DROP; 85 ret = NF_DROP;
88 } 86 ip_conntrack_expect_put(exp);
89 break; 87 break;
90 case TFTP_OPCODE_DATA: 88 case TFTP_OPCODE_DATA:
91 case TFTP_OPCODE_ACK: 89 case TFTP_OPCODE_ACK:
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412583..706c8074f4 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
56 break; 56 break;
57 } 57 }
58 58
59 if (port == 0) { 59 if (port == 0)
60 ip_conntrack_expect_free(exp);
61 return NF_DROP; 60 return NF_DROP;
62 }
63 61
64 sprintf(buffer, "%u", port); 62 sprintf(buffer, "%u", port);
65 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, 63 ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e794a..d83757a70d 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
143 break; 143 break;
144 } 144 }
145 145
146 if (port == 0) { 146 if (port == 0)
147 ip_conntrack_expect_free(exp);
148 return NF_DROP; 147 return NF_DROP;
149 }
150 148
151 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, 149 if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
152 seq)) { 150 seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca3381d..de31942bab 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
65 break; 65 break;
66 } 66 }
67 67
68 if (port == 0) { 68 if (port == 0)
69 ip_conntrack_expect_free(exp);
70 return NF_DROP; 69 return NF_DROP;
71 }
72 70
73 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 71 /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
74 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 72 * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index a558cf0eee..6596c9ee16 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
35 const struct ip_conntrack *conntrack) 35 const struct ip_conntrack *conntrack)
36{ 36{
37 static u_int16_t id; 37 static u_int16_t id;
38 unsigned int range_size 38 unsigned int range_size;
39 = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
40 unsigned int i; 39 unsigned int i;
41 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */ 42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 44 range_size = 0xFFFF;
45 45
46 for (i = 0; i < range_size; i++, id++) { 46 for (i = 0; i < range_size; i++, id++) {
47 tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); 47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
48 (id % range_size));
48 if (!ip_nat_used_tuple(tuple, conntrack)) 49 if (!ip_nat_used_tuple(tuple, conntrack))
49 return 1; 50 return 1;
50 } 51 }
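The ip_nat_proto_icmp.c fix above is purely about byte order: the range bounds are kept in network order, so the range size has to be computed on ntohs()-converted values and the chosen ID converted back with htons() before it is stored in the tuple. A standalone sketch of that conversion discipline, with arbitrary example values:

/* Do the arithmetic in host order, store the result in network order. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t min_id = htons(1000);	/* range bounds in network order */
	uint16_t max_id = htons(1099);
	unsigned int range_size = ntohs(max_id) - ntohs(min_id) + 1;
	static uint16_t next;		/* rolling counter, as in the kernel code */

	for (int i = 0; i < 3; i++, next++) {
		uint16_t id = htons(ntohs(min_id) + (next % range_size));

		printf("picked ICMP id %u (variable holds the network-order form)\n",
		       ntohs(id));
	}
	return 0;
}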
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a91cfceff2..a98e36d2b3 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -40,7 +40,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
40 enum ip_nat_manip_type maniptype, 40 enum ip_nat_manip_type maniptype,
41 const struct ip_conntrack *conntrack) 41 const struct ip_conntrack *conntrack)
42{ 42{
43 static u_int16_t port, *portptr; 43 static u_int16_t port;
44 u_int16_t *portptr;
44 unsigned int range_size, min, i; 45 unsigned int range_size, min, i;
45 46
46 if (maniptype == IP_NAT_MANIP_SRC) 47 if (maniptype == IP_NAT_MANIP_SRC)
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c669e3b5f5..9f66e56256 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
41 enum ip_nat_manip_type maniptype, 41 enum ip_nat_manip_type maniptype,
42 const struct ip_conntrack *conntrack) 42 const struct ip_conntrack *conntrack)
43{ 43{
44 static u_int16_t port, *portptr; 44 static u_int16_t port;
45 u_int16_t *portptr;
45 unsigned int range_size, min, i; 46 unsigned int range_size, min, i;
46 47
47 if (maniptype == IP_NAT_MANIP_SRC) 48 if (maniptype == IP_NAT_MANIP_SRC)
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index bc59d0d6e8..91d5ea1dbb 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -102,6 +102,10 @@ ip_nat_fn(unsigned int hooknum,
102 return NF_ACCEPT; 102 return NF_ACCEPT;
103 } 103 }
104 104
105 /* Don't try to NAT if this packet is not conntracked */
106 if (ct == &ip_conntrack_untracked)
107 return NF_ACCEPT;
108
105 switch (ctinfo) { 109 switch (ctinfo) {
106 case IP_CT_RELATED: 110 case IP_CT_RELATED:
107 case IP_CT_RELATED+IP_CT_IS_REPLY: 111 case IP_CT_RELATED+IP_CT_IS_REPLY:
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d646..2215317c76 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; 45 exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
46 exp->dir = IP_CT_DIR_REPLY; 46 exp->dir = IP_CT_DIR_REPLY;
47 exp->expectfn = ip_nat_follow_master; 47 exp->expectfn = ip_nat_follow_master;
48 if (ip_conntrack_expect_related(exp) != 0) { 48 if (ip_conntrack_expect_related(exp) != 0)
49 ip_conntrack_expect_free(exp);
50 return NF_DROP; 49 return NF_DROP;
51 }
52 return NF_ACCEPT; 50 return NF_ACCEPT;
53} 51}
54 52
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431..c6baa81743 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -214,6 +214,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
214 break; 214 break;
215 215
216 case IPQ_COPY_PACKET: 216 case IPQ_COPY_PACKET:
217 if (entry->skb->ip_summed == CHECKSUM_HW &&
218 (*errp = skb_checksum_help(entry->skb,
219 entry->info->outdev == NULL))) {
220 read_unlock_bh(&queue_lock);
221 return NULL;
222 }
217 if (copy_range == 0 || copy_range > entry->skb->len) 223 if (copy_range == 0 || copy_range > entry->skb->len)
218 data_len = entry->skb->len; 224 data_len = entry->skb->len;
219 else 225 else
@@ -385,6 +391,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
385 if (!skb_ip_make_writable(&e->skb, v->data_len)) 391 if (!skb_ip_make_writable(&e->skb, v->data_len))
386 return -ENOMEM; 392 return -ENOMEM;
387 memcpy(e->skb->data, v->payload, v->data_len); 393 memcpy(e->skb->data, v->payload, v->data_len);
394 e->skb->ip_summed = CHECKSUM_NONE;
388 e->skb->nfcache |= NFC_ALTERED; 395 e->skb->nfcache |= NFC_ALTERED;
389 396
390 /* 397 /*
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index dc4362b57c..6706d3a1bc 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -339,7 +339,7 @@ target(struct sk_buff **pskb,
339 * error messages (RELATED) and information requests (see below) */ 339 * error messages (RELATED) and information requests (see below) */
340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
341 && (ctinfo == IP_CT_RELATED 341 && (ctinfo == IP_CT_RELATED
342 || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) 342 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
343 return IPT_CONTINUE; 343 return IPT_CONTINUE;
344 344
345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911118..94a0ce1c1c 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -61,16 +61,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
61 if (!tcph) 61 if (!tcph)
62 return 0; 62 return 0;
63 63
64 if (!(einfo->operation & IPT_ECN_OP_SET_ECE 64 if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
65 || tcph->ece == einfo->proto.tcp.ece) 65 tcph->ece == einfo->proto.tcp.ece) &&
66 && (!(einfo->operation & IPT_ECN_OP_SET_CWR 66 ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
67 || tcph->cwr == einfo->proto.tcp.cwr))) 67 tcph->cwr == einfo->proto.tcp.cwr)))
68 return 1; 68 return 1;
69 69
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0; 71 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; 72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73 73
74 if ((*pskb)->ip_summed == CHECKSUM_HW &&
75 skb_checksum_help(*pskb, inward))
76 return 0;
77
74 diffs[0] = ((u_int16_t *)tcph)[6]; 78 diffs[0] = ((u_int16_t *)tcph)[6];
75 if (einfo->operation & IPT_ECN_OP_SET_ECE) 79 if (einfo->operation & IPT_ECN_OP_SET_ECE)
76 tcph->ece = einfo->proto.tcp.ece; 80 tcph->ece = einfo->proto.tcp.ece;
@@ -79,13 +83,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
79 diffs[1] = ((u_int16_t *)tcph)[6]; 83 diffs[1] = ((u_int16_t *)tcph)[6];
80 diffs[0] = diffs[0] ^ 0xFFFF; 84 diffs[0] = diffs[0] ^ 0xFFFF;
81 85
82 if ((*pskb)->ip_summed != CHECKSUM_HW) 86 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
83 tcph->check = csum_fold(csum_partial((char *)diffs, 87 tcph->check = csum_fold(csum_partial((char *)diffs,
84 sizeof(diffs), 88 sizeof(diffs),
85 tcph->check^0xFFFF)); 89 tcph->check^0xFFFF));
86 else
87 if (skb_checksum_help(*pskb, inward))
88 return 0;
89 (*pskb)->nfcache |= NFC_ALTERED; 90 (*pskb)->nfcache |= NFC_ALTERED;
90 return 1; 91 return 1;
91} 92}
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050b2b..7b84a25444 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -61,6 +61,10 @@ ipt_tcpmss_target(struct sk_buff **pskb,
61 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 61 if (!skb_ip_make_writable(pskb, (*pskb)->len))
62 return NF_DROP; 62 return NF_DROP;
63 63
64 if ((*pskb)->ip_summed == CHECKSUM_HW &&
65 skb_checksum_help(*pskb, out == NULL))
66 return NF_DROP;
67
64 iph = (*pskb)->nh.iph; 68 iph = (*pskb)->nh.iph;
65 tcplen = (*pskb)->len - iph->ihl*4; 69 tcplen = (*pskb)->len - iph->ihl*4;
66 70
@@ -186,9 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
186 newmss); 190 newmss);
187 191
188 retmodified: 192 retmodified:
189 /* We never hw checksum SYN packets. */
190 BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
191
192 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; 193 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
193 return IPT_CONTINUE; 194 return IPT_CONTINUE;
194} 195}
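
Both checksum-related hunks above (ipt_ECN and ipt_TCPMSS) apply the same ordering: a packet whose checksum was deferred to hardware (CHECKSUM_HW) must have it folded in software via skb_checksum_help() before any header field is rewritten, otherwise the later incremental checksum update operates on a stale value. A minimal sketch of that ordering, reusing only the helpers that already appear in the hunks (skb_ip_make_writable(), skb_checksum_help()); the surrounding function is hypothetical and assumes the same includes as the target modules above:

/* Sketch: mangle a TCP header field safely w.r.t. hardware checksums,
 * mirroring the ordering used by the ipt_ECN and ipt_TCPMSS hunks above. */
static int mangle_tcp_header(struct sk_buff **pskb, int inward)
{
	struct tcphdr *tcph;

	/* 1. Make the headers writable (may reallocate the skb data). */
	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl * 4 + sizeof(*tcph)))
		return 0;

	/* 2. If the device left the checksum to hardware, fold it in
	 *    software first, so later incremental updates stay valid. */
	if ((*pskb)->ip_summed == CHECKSUM_HW &&
	    skb_checksum_help(*pskb, inward))
		return 0;

	/* 3. Now it is safe to rewrite header bits and adjust
	 *    tcph->check incrementally, as the ECN target does. */
	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl * 4;
	/* ... modify tcph and update tcph->check ... */

	(*pskb)->nfcache |= NFC_ALTERED;
	return 1;
}
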
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f4d53c9198..d675ff80b0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -1767,7 +1805,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
1767 struct in_device *in_dev, 1805 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos) 1806 u32 daddr, u32 saddr, u32 tos)
1769{ 1807{
1770 struct rtable* rth; 1808 struct rtable* rth = NULL;
1771 int err; 1809 int err;
1772 unsigned hash; 1810 unsigned hash;
1773 1811
@@ -1794,7 +1832,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
1794 u32 daddr, u32 saddr, u32 tos) 1832 u32 daddr, u32 saddr, u32 tos)
1795{ 1833{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1834#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797 struct rtable* rth; 1835 struct rtable* rth = NULL;
1798 unsigned char hop, hopcount, lasthop; 1836 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL; 1837 int err = -EINVAL;
1800 unsigned int hash; 1838 unsigned int hash;
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -2239,7 +2282,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
2239 struct net_device *dev_out, 2282 struct net_device *dev_out,
2240 unsigned flags) 2283 unsigned flags)
2241{ 2284{
2242 struct rtable *rth; 2285 struct rtable *rth = NULL;
2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244 unsigned hash; 2287 unsigned hash;
2245 if (err == 0) { 2288 if (err == 0) {
@@ -2267,7 +2310,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
2267 unsigned char hop; 2310 unsigned char hop;
2268 unsigned hash; 2311 unsigned hash;
2269 int err = -EINVAL; 2312 int err = -EINVAL;
2270 struct rtable *rth; 2313 struct rtable *rth = NULL;
2271 2314
2272 if (res->fi && res->fi->fib_nhs > 1) { 2315 if (res->fi && res->fi->fib_nhs > 1) {
2273 unsigned char hopcount = res->fi->fib_nhs; 2316 unsigned char hopcount = res->fi->fib_nhs;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
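
The rewritten rt_check_expire() above replaces the fixed per-run budget with goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, so each timer run scans just enough buckets for the whole hash table to be covered roughly once per ip_rt_gc_timeout. A small userspace sketch of the arithmetic, under assumed defaults (HZ = 1000, gc_interval = 60 s, gc_timeout = 300 s, rt_hash_log = 17, i.e. 131072 buckets); the kernel computes the same expression with do_div():

/* Worked example for the per-run scan budget in rt_check_expire().
 * The values below are assumptions (typical defaults of the era). */
#include <stdio.h>

int main(void)
{
	unsigned long long HZ = 1000;            /* assumed tick rate */
	unsigned long long gc_interval = 60 * HZ;
	unsigned long long gc_timeout  = 300 * HZ;
	unsigned int rt_hash_log = 17;           /* 131072 buckets */

	unsigned long long goal = (gc_interval << rt_hash_log) / gc_timeout;

	/* goal = 131072 / 5 = 26214 buckets per timer run; with the timer
	 * firing every gc_interval (60 s), the full table is covered in
	 * about five runs, i.e. roughly once per gc_timeout (300 s). */
	printf("buckets scanned per run: %llu\n", goal);
	return 0;
}
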
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf..e328945324 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
118 return 1; 118 return 1;
119} 119}
120 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
121ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
122 { 161 {
123 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
612 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
613 }, 652 },
614 { 653 {
615 .ctl_name = NET_TCP_WESTWOOD,
616 .procname = "tcp_westwood",
617 .data = &sysctl_tcp_westwood,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = &proc_dointvec,
621 },
622 {
623 .ctl_name = NET_TCP_VEGAS,
624 .procname = "tcp_vegas_cong_avoid",
625 .data = &sysctl_tcp_vegas_cong_avoid,
626 .maxlen = sizeof(int),
627 .mode = 0644,
628 .proc_handler = &proc_dointvec,
629 },
630 {
631 .ctl_name = NET_TCP_VEGAS_ALPHA,
632 .procname = "tcp_vegas_alpha",
633 .data = &sysctl_tcp_vegas_alpha,
634 .maxlen = sizeof(int),
635 .mode = 0644,
636 .proc_handler = &proc_dointvec,
637 },
638 {
639 .ctl_name = NET_TCP_VEGAS_BETA,
640 .procname = "tcp_vegas_beta",
641 .data = &sysctl_tcp_vegas_beta,
642 .maxlen = sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
646 {
647 .ctl_name = NET_TCP_VEGAS_GAMMA,
648 .procname = "tcp_vegas_gamma",
649 .data = &sysctl_tcp_vegas_gamma,
650 .maxlen = sizeof(int),
651 .mode = 0644,
652 .proc_handler = &proc_dointvec,
653 },
654 {
655 .ctl_name = NET_TCP_BIC,
656 .procname = "tcp_bic",
657 .data = &sysctl_tcp_bic,
658 .maxlen = sizeof(int),
659 .mode = 0644,
660 .proc_handler = &proc_dointvec,
661 },
662 {
663 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
664 .procname = "tcp_bic_fast_convergence",
665 .data = &sysctl_tcp_bic_fast_convergence,
666 .maxlen = sizeof(int),
667 .mode = 0644,
668 .proc_handler = &proc_dointvec,
669 },
670 {
671 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
672 .procname = "tcp_bic_low_window",
673 .data = &sysctl_tcp_bic_low_window,
674 .maxlen = sizeof(int),
675 .mode = 0644,
676 .proc_handler = &proc_dointvec,
677 },
678 {
679 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
680 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
681 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
692 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
693 }, 668 },
694 { 669 {
695 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
696 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
697 .data = &sysctl_tcp_bic_beta,
698 .maxlen = sizeof(int),
699 .mode = 0644, 672 .mode = 0644,
700 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
701 }, 676 },
677
702 { .ctl_name = 0 } 678 { .ctl_name = 0 }
703}; 679};
704 680
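
The new entry replaces the per-algorithm integer sysctls with a single string, tcp_congestion_control, handled by proc_tcp_congestion_control() above. A hypothetical userspace sketch of driving it through procfs (the path follows from the ipv4_table entry; "reno" is the built-in registered in tcp_init()):

/* Sketch: read and change the system-wide default congestion control
 * via the new proc interface.  Error handling kept minimal. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/tcp_congestion_control";
	char cur[16] = "";
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	read(fd, cur, sizeof(cur) - 1);          /* e.g. "reno\n" or "bic\n" */
	printf("current default: %s", cur);

	lseek(fd, 0, SEEK_SET);
	write(fd, "reno", 4);                    /* fall back to the built-in */
	close(fd);
	return 0;
}
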
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd..69b1fcf700 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -584,7 +584,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
584 sk_charge_skb(sk, skb); 584 sk_charge_skb(sk, skb);
585 if (!sk->sk_send_head) 585 if (!sk->sk_send_head)
586 sk->sk_send_head = skb; 586 sk->sk_send_head = skb;
587 else if (tp->nonagle&TCP_NAGLE_PUSH) 587 if (tp->nonagle & TCP_NAGLE_PUSH)
588 tp->nonagle &= ~TCP_NAGLE_PUSH; 588 tp->nonagle &= ~TCP_NAGLE_PUSH;
589} 589}
590 590
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1101 struct sk_buff *skb; 1105 struct sk_buff *skb;
1102 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1103 1107
1104 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1105 1109
1106 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1107 * necessary */ 1111 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1365 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1366 * unfortunately. 1370 * unfortunately.
1367 */ 1371 */
1368 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1369 goto do_prequeue; 1373 goto do_prequeue;
1370 1374
1371 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1390 } 1394 }
1391 1395
1392 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1393 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1394do_prequeue: 1398do_prequeue:
1395 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1396 1400
@@ -1472,7 +1476,7 @@ skip_copy:
1472 } while (len > 0); 1476 } while (len > 0);
1473 1477
1474 if (user_recv) { 1478 if (user_recv) {
1475 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1476 int chunk; 1480 int chunk;
1477 1481
1478 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -1927,6 +1931,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1927 return tp->af_specific->setsockopt(sk, level, optname, 1931 return tp->af_specific->setsockopt(sk, level, optname,
1928 optval, optlen); 1932 optval, optlen);
1929 1933
1934 /* This is a string value; all the others are ints */
1935 if (optname == TCP_CONGESTION) {
1936 char name[TCP_CA_NAME_MAX];
1937
1938 if (optlen < 1)
1939 return -EINVAL;
1940
1941 val = strncpy_from_user(name, optval,
1942 min(TCP_CA_NAME_MAX-1, optlen));
1943 if (val < 0)
1944 return -EFAULT;
1945 name[val] = 0;
1946
1947 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name);
1949 release_sock(sk);
1950 return err;
1951 }
1952
1930 if (optlen < sizeof(int)) 1953 if (optlen < sizeof(int))
1931 return -EINVAL; 1954 return -EINVAL;
1932 1955
@@ -2109,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2109 2132
2110 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2113 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114 2137
2115 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2159,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2159 2182
2160 switch (optname) { 2183 switch (optname) {
2161 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2162 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2163 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2165 break; 2188 break;
@@ -2211,6 +2234,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2211 case TCP_QUICKACK: 2234 case TCP_QUICKACK:
2212 val = !tp->ack.pingpong; 2235 val = !tp->ack.pingpong;
2213 break; 2236 break;
2237
2238 case TCP_CONGESTION:
2239 if (get_user(len, optlen))
2240 return -EFAULT;
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen))
2243 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len))
2245 return -EFAULT;
2246 return 0;
2214 default: 2247 default:
2215 return -ENOPROTOOPT; 2248 return -ENOPROTOOPT;
2216 }; 2249 };
@@ -2224,7 +2257,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 2257
2225 2258
2226extern void __skb_cb_too_small_for_tcp(int, int); 2259extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void); 2260extern struct tcp_congestion_ops tcp_reno;
2228 2261
2229static __initdata unsigned long thash_entries; 2262static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str) 2263static int __init set_thash_entries(char *str)
@@ -2333,6 +2366,8 @@ void __init tcp_init(void)
2333 printk(KERN_INFO "TCP: Hash tables configured " 2366 printk(KERN_INFO "TCP: Hash tables configured "
2334 "(established %d bind %d)\n", 2367 "(established %d bind %d)\n",
2335 tcp_ehash_size << 1, tcp_bhash_size); 2368 tcp_ehash_size << 1, tcp_bhash_size);
2369
2370 tcp_register_congestion_control(&tcp_reno);
2336} 2371}
2337 2372
2338EXPORT_SYMBOL(tcp_accept); 2373EXPORT_SYMBOL(tcp_accept);
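
Beyond the system-wide default, the TCP_CONGESTION option added to tcp_setsockopt()/tcp_getsockopt() above selects an algorithm per socket. A minimal userspace sketch; the numeric value 13 for TCP_CONGESTION matches the define added elsewhere in the same series (an assumption here, since that header change is outside net/ipv4) and is guarded for libc headers that do not carry it yet, and "reno" is the built-in fallback:

/* Sketch: select a congestion control algorithm on a single socket. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13	/* assumed value matching this patch series */
#endif

int main(void)
{
	char name[16];		/* TCP_CA_NAME_MAX in the kernel */
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	/* Request the built-in "reno"; an unknown name returns ENOENT,
	 * per tcp_set_congestion_control() above. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", 4) < 0)
		perror("setsockopt");
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("socket is using %s\n", name);
	close(fd);
	return 0;
}
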
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 0000000000..ec38d45d66
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
5 * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
6 * "Binary Increase Congestion Control for Fast, Long Distance
7 * Networks" in InfoComm 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and the congestion window is large,
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor for beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "seconds of sustained low utilization before entering low utilization mode");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
58 u32 cnt; /* increase cwnd by 1 after this many ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132 /* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139 / (cwnd - ca->last_max_cwnd);
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168 /* these delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low if avg delay < dist*threshold
189 for low_utilization_period seconds */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 0000000000..4970d10a77
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
2 * Pluggable TCP congestion control support and NewReno
3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is special case used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN).
227 * Really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control()
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
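
Since tcp_register_congestion_control() only insists on the ssthresh, cong_avoid and min_cwnd hooks, a minimal out-of-tree module is essentially Reno registered under another name. A hypothetical skeleton (the "noop" name and module are made up for illustration; it reuses the Reno helpers exported above):

/* Hypothetical minimal congestion control module against the new ops API. */
#include <linux/config.h>
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_noop = {
	.name		= "noop",
	.owner		= THIS_MODULE,
	/* The three mandatory hooks; reuse the exported Reno versions. */
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	/* .init, .set_state, .undo_cwnd, .pkts_acked, .get_info are optional. */
};

static int __init tcp_noop_register(void)
{
	return tcp_register_congestion_control(&tcp_noop);
}

static void __exit tcp_noop_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_noop);
}

module_init(tcp_noop_register);
module_exit(tcp_noop_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op congestion control (illustrative only)");
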
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc079..f66945cb15 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 0000000000..36c51f8136
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127 ca->ai < HSTCP_AIMD_MAX)
128 ca->ai++;
129 } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) {
130 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
131 ca->ai > 0)
132 ca->ai--;
133 }
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 0000000000..40168275ac
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
 29 u8 modeswitch; /* Delay modeswitch until we have seen at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
 99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
 152 factor = 1 + (10*diff + ((diff/2)*(diff/2)/HZ))/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
 168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
 169 * rtt before we adjust our beta to ensure we are working from consistent
 170 * data.
 171 *
 172 * This function should be called when we hit a congestion event, since only at
 173 * that point do we really have a real sense of maxRTT (the queues en route
 174 * have just become too full).
 175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
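The fixed-point arithmetic above is easy to lose in the shifts: beta is the ratio minRTT/maxRTT kept <<7 and clamped to [0.5, 0.8], and alpha is 2*factor*(1-beta), where factor grows roughly as 1 + 10*d + (d/2)^2 for d seconds since the last backoff. The following user-space sketch uses made-up RTT and ccount inputs and assumes HZ is 1000; only the beta/alpha formulas mirror htcp_beta_update() and htcp_alpha_update() above (the bandwidth-switch and RTT-scaling branches are left out).

/* User-space sketch of the <<7 fixed-point arithmetic in
 * htcp_beta_update()/htcp_alpha_update() above.  HZ, the RTTs and the
 * RTT count are made-up inputs. */
#include <stdio.h>

#define HZ		1000		/* assumed ticks per second */
#define BETA_MIN	(1 << 6)	/* 0.5 << 7 */
#define BETA_MAX	102		/* 0.8 << 7 */

int main(void)
{
	unsigned int minRTT = 20, maxRTT = 100;	/* jiffies: 20 ms and 100 ms */
	unsigned int ccount = 200;		/* RTTs since the last backoff */

	/* beta = minRTT/maxRTT, kept << 7 and clamped to [0.5, 0.8] */
	unsigned int beta = (minRTT << 7) / maxRTT;	/* 25, i.e. ~0.20 */
	if (beta < BETA_MIN)
		beta = BETA_MIN;			/* clamped up to 64 */
	if (beta > BETA_MAX)
		beta = BETA_MAX;

	/* factor ~ 1 + 10*d + (d/2)^2 for the d seconds past the first one */
	unsigned int diff = ccount * minRTT;		/* 4000 jiffies */
	unsigned int factor = 1;
	if (diff > HZ) {
		diff -= HZ;				/* 3000 */
		factor = 1 + (10*diff + ((diff/2)*(diff/2)/HZ))/HZ;	/* 33 */
	}

	/* alpha = 2 * factor * (1 - beta), still << 7 */
	unsigned int alpha = 2 * factor * ((1 << 7) - beta);	/* 4224 */

	printf("beta = %u/128 (~%.2f), alpha = %u/128 (~%.0f segments per RTT)\n",
	       beta, beta / 128.0, alpha, alpha / 128.0);
	return 0;
}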
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 0000000000..13a66342c3
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
5 * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
6 * for Heterogeneous Networks",
 7 * International Journal of Satellite Communications,
8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
 28/* Hybla reference round trip time (default = 1/40 sec = 25 ms),
 29 expressed in milliseconds */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
 32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
 43 ca->rho2 = ca->rho2_7ls >> 7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
 119 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
 123 * 2^int is computed directly as 1<<int,
 124 * while we use hybla_fraction() to
125 * calculate 2^fract in a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * as long as increment is estimated as (rho<<7)/window
135 * it already is <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
 146 /* check when the fraction part reaches 128 and increase cwnd by 1. */
147 while(ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
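The Hybla bookkeeping above is all fixed point (<<3 for rho, <<7 for increments): rho is the RTT measured in units of rtt0 and never less than 1, the slow-start increment is 2^rho - 1 segments per ACK, the congestion-avoidance increment is rho^2/cwnd, and sub-segment remainders accumulate in snd_cwnd_cents. A user-space sketch with made-up srtt/rtt0/cwnd values follows; only the rho computation, the two increment formulas and the cents accumulation mirror hybla_recalc_param() and hybla_cong_avoid() above.

/* User-space sketch of the <<7 increment bookkeeping in
 * hybla_cong_avoid() above.  rtt0, srtt and cwnd are made-up inputs. */
#include <stdio.h>

static const unsigned int fractions[] = {	/* 2^(i/8) << 7, as above */
	128, 139, 152, 165, 181, 197, 215, 234,
};

int main(void)
{
	unsigned int rtt0 = 25, srtt = 300;	/* ms: a satellite-like RTT */
	unsigned int cwnd = 40, cents = 0;

	/* rho = srtt/rtt0, kept << 3 and floored at 1.0 (i.e. 8) */
	unsigned int rho_3ls = srtt * 8 / rtt0;		/* 96 -> rho = 12 */
	if (rho_3ls < 8)
		rho_3ls = 8;
	unsigned int rho  = rho_3ls >> 3;
	unsigned int frac = rho_3ls - (rho << 3);	/* fractional eighths */

	/* slow start: INC = (2^rho - 1) << 7 */
	unsigned int inc_ss = (1u << rho) * fractions[frac] - 128;

	/* congestion avoidance: INC = (rho^2 / cwnd) << 7 */
	unsigned int rho2_7ls = (rho_3ls * rho_3ls) << 1;	/* rho^2 << 7 */
	unsigned int inc_ca = rho2_7ls / cwnd;

	/* the integer part grows cwnd now, the remainder accumulates */
	cwnd  += inc_ca >> 7;
	cents += inc_ca % 128;
	while (cents >= 128) {		/* a full segment's worth of cents */
		cwnd++;
		cents -= 128;
	}

	printf("rho=%u  ss inc=%u/128  ca inc=%u/128  cwnd=%u cents=%u\n",
	       rho, inc_ss, inc_ca, cwnd, cents);
	return 0;
}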
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630..53a8a5399f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -805,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806 741
807 if (!cwnd) { 742 if (!cwnd) {
808 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
809 cwnd = 2; 744 cwnd = 2;
810 else 745 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
812 } 747 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814} 749}
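As a worked example of the initial-window rule above: a standard Ethernet MSS of 1460 is not greater than 1460, so it gets cwnd = 3; only larger (e.g. jumbo-frame) MSS values drop to 2, and an MSS of 1095 or less gets 4, which keeps the initial window roughly constant in bytes, in the spirit of RFC 3390.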
@@ -979,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
979 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
983 } 918 }
984 919
985 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1142,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1142 (IsFack(tp) || 1077 (IsFack(tp) ||
1143 !before(lost_retrans, 1078 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1148 1083
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
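The ca_ops->min_cwnd() hook that replaces the hard-coded floor above lets each module choose the cwnd-reduction lower bound used by tcp_cwnd_down() itself: for example, the H-TCP module earlier in this patch returns snd_ssthresh from htcp_min_cwnd(), whereas the old code floored at snd_ssthresh/2 unless the Westwood bandwidth*RTTmin estimate was available.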
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
 1964 * with this code. (Supersedes RFC1323) 1882 * with this code. (Supersedes RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start amd linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083} 1938}
2084 1939
2085/* This is Jacobson's slow start and congestion avoidance. 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2086 * SIGCOMM '88, p. 328. 1941 u32 in_flight, int good)
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
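With the per-algorithm branches above gone, every cwnd decision is a single indirect call through tp->ca_ops, and a congestion module is just a table of function pointers (as the three new files earlier in this diff show). Below is a stripped-down user-space sketch of that dispatch pattern; the mock socket and ops layout are invented for illustration, and the increase rule simply mirrors the removed reno_cong_avoid() above.

/* User-space sketch of the ops-table dispatch replacing the hard-coded
 * Reno/BIC/Vegas branches.  The sock fields and ops names are mocked. */
#include <stdio.h>

struct mock_sock { unsigned int cwnd, cwnd_cnt, ssthresh, clamp; };

struct cong_ops {
	const char *name;
	void (*cong_avoid)(struct mock_sock *tp);
	unsigned int (*ssthresh)(struct mock_sock *tp);
};

static void reno_cong_avoid(struct mock_sock *tp)
{
	if (tp->cwnd <= tp->ssthresh) {			/* slow start */
		if (tp->cwnd < tp->clamp)
			tp->cwnd++;
	} else if (++tp->cwnd_cnt >= tp->cwnd) {	/* ~ +1 segment per RTT */
		if (tp->cwnd < tp->clamp)
			tp->cwnd++;
		tp->cwnd_cnt = 0;
	}
}

static unsigned int reno_ssthresh(struct mock_sock *tp)
{
	return tp->cwnd > 4 ? tp->cwnd / 2 : 2;		/* halve on loss */
}

static const struct cong_ops reno_ops = {
	.name       = "reno",
	.cong_avoid = reno_cong_avoid,
	.ssthresh   = reno_ssthresh,
};

int main(void)
{
	struct mock_sock tp = { .cwnd = 10, .cwnd_cnt = 0,
				.ssthresh = 8, .clamp = 100 };
	const struct cong_ops *ops = &reno_ops;		/* think tp->ca_ops */
	unsigned int i;

	for (i = 0; i < 25; i++)			/* ~25 ACKs arrive */
		ops->cong_avoid(&tp);			/* per-ACK hook */

	printf("%s: cwnd=%u, ssthresh after a loss would be %u\n",
	       ops->name, tp.cwnd, ops->ssthresh(&tp));
	return 0;
}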
@@ -2348,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2348 } 1957 }
2349} 1958}
2350 1959
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
2362{ 1962{
@@ -2415,13 +2015,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2015
2416 2016
2417/* Remove acknowledged frames from the retransmission queue. */ 2017/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2018static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2019{
2420 struct tcp_sock *tp = tcp_sk(sk); 2020 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2021 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2022 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2023 int acked = 0;
2424 __s32 seq_rtt = -1; 2024 __s32 seq_rtt = -1;
2025 struct timeval usnow;
2026 u32 pkts_acked = 0;
2027
2028 if (seq_usrtt)
2029 do_gettimeofday(&usnow);
2425 2030
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2031 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2032 skb != sk->sk_send_head) {
@@ -2433,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2433 * the other end. 2038 * the other end.
2434 */ 2039 */
2435 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2437 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt); 2044 now, &seq_rtt);
2439 break; 2045 break;
@@ -2448,6 +2054,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2054 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2055 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2056 acked |= FLAG_DATA_ACKED;
2057 ++pkts_acked;
2451 } else { 2058 } else {
2452 acked |= FLAG_SYN_ACKED; 2059 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2060 tp->retrans_stamp = 0;
@@ -2461,6 +2068,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2068 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2069 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2070 seq_rtt = now - scb->when;
2071 if (seq_usrtt)
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2073 + (usnow.tv_usec - skb->stamp.tv_usec);
2074
2464 if (sacked & TCPCB_SACKED_ACKED) 2075 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2076 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2077 if (sacked & TCPCB_LOST)
@@ -2479,8 +2090,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2090 }
2480 2091
2481 if (acked&FLAG_ACKED) { 2092 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2094 tcp_ack_packets_out(sk, tp);
2095
2096 if (tp->ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2098 }
2485 2099
2486#if FASTRETRANS_DEBUG > 0 2100#if FASTRETRANS_DEBUG > 0
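To make the new microsecond bookkeeping concrete: with a (hypothetical) skb->stamp of {tv_sec = 1000, tv_usec = 950000} and usnow of {tv_sec = 1001, tv_usec = 20000}, seq_usrtt = (1001 - 1000) * 1000000 + (20000 - 950000) = 70000, i.e. a 70 ms sample. It is only computed when the module provides an rtt_sample() hook, and the pkts_acked count gathered in the same loop is handed to the pkts_acked() hook, as tcp_htcp.c's measure_achieved_throughput() above expects.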
@@ -2624,257 +2238,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2238 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2239}
2626 2240
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get no information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
2636 * since in this way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coeffients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * A RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was choosen since an estimation on small
2730 * time intervals is better to avoid...
2731 * Obvioulsy on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successfull. In such case infact update is
2772 * straight forward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong..even if there could
2858 * be no problems! Infact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2241/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2243{
@@ -2884,6 +2247,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2247 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2248 u32 prior_in_flight;
2886 s32 seq_rtt; 2249 s32 seq_rtt;
2250 s32 seq_usrtt = 0;
2887 int prior_packets; 2251 int prior_packets;
2888 2252
2889 /* If the ack is newer than sent or older than previous acks 2253 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2266,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2266 */
2903 tcp_update_wl(tp, ack, ack_seq); 2267 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2268 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2269 flag |= FLAG_WIN_UPDATE;
2907 2270
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2272
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2274 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2275 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2285,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2286 flag |= FLAG_ECE;
2922 2287
2923 tcp_westwood_slow_bw(sk,skb); 2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2289 }
2925 2290
2926 /* We passed data and got it acked, remove any soft error 2291 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2300,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2300 prior_in_flight = tcp_packets_in_flight(tp);
2936 2301
2937 /* See if we can take anything off of the retransmit queue. */ 2302 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2305
2940 if (tp->frto_counter) 2306 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2307 tcp_process_frto(sk, prior_snd_una);
2942 2308
2943 if (tcp_ack_is_dubious(tp, flag)) { 2309 if (tcp_ack_is_dubious(tp, flag)) {
 2944 /* Advance CWND, if state allows this. */ 2310 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2314 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2315 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2317 }
2955 2318
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -3439,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3439 int this_sack; 2802 int this_sack;
3440 2803
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
3443 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return; 2808 return;
@@ -3572,13 +2935,13 @@ queue_and_out:
3572 if(th->fin) 2935 if(th->fin)
3573 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
3574 2937
3575 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3576 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
3577 2940
3578 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
3579 * gap in queue is filled. 2942 * gap in queue is filled.
3580 */ 2943 */
3581 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
3582 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
3583 } 2946 }
3584 2947
@@ -3886,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3886 * This must not ever occur. */ 3249 * This must not ever occur. */
3887 3250
3888 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3889 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3890 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3891 skb_queue_len(&tp->out_of_order_queue));
3892 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3893 3255
3894 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
@@ -3937,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3937 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3938} 3300}
3939 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3940 3324
3941/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3942 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3948{ 3332{
3949 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3950 3334
3951 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3952 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3953 !tcp_memory_pressure &&
3954 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3955 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3956 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3957 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3958 tp->reordering + 1); 3339 tp->reordering + 1);
@@ -3975,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3975 } 3356 }
3976} 3357}
3977 3358
3978static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3979{ 3360{
3980 struct tcp_sock *tp = tcp_sk(sk); 3361 tcp_push_pending_frames(sk, tp);
3981
3982 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3983 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3984 tcp_write_xmit(sk, tp->nonagle))
3985 tcp_check_probe_timer(sk, tp);
3986}
3987
3988static __inline__ void tcp_data_snd_check(struct sock *sk)
3989{
3990 struct sk_buff *skb = sk->sk_send_head;
3991
3992 if (skb != NULL)
3993 __tcp_data_snd_check(sk, skb);
3994 tcp_check_space(sk); 3362 tcp_check_space(sk);
3995} 3363}
3996 3364
@@ -4284,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4284 */ 3652 */
4285 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
4286 __kfree_skb(skb); 3654 __kfree_skb(skb);
4287 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
4288 return 0; 3656 return 0;
4289 } else { /* Header too small */ 3657 } else { /* Header too small */
4290 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4350,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4350 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4351 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
4352 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
4353 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
4354 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
4355 goto no_ack; 3723 goto no_ack;
4356 } 3724 }
@@ -4428,7 +3796,7 @@ step5:
4428 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
4429 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
4430 3798
4431 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
4432 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
4433 return 0; 3801 return 0;
4434 3802
@@ -4552,6 +3920,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3920
4553 tcp_init_metrics(sk); 3921 tcp_init_metrics(sk);
4554 3922
3923 tcp_init_congestion_control(tp);
3924
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3925 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3926 * packet.
4557 */ 3927 */
@@ -4708,9 +4078,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4078 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4079 return 1;
4710 4080
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4081 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4082 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4083 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4099,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4099 goto discard;
4733 4100
4734 case TCP_SYN_SENT: 4101 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4102 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4103 if (queued >= 0)
4740 return queued; 4104 return queued;
@@ -4742,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4742 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4743 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4744 __kfree_skb(skb); 4108 __kfree_skb(skb);
4745 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4746 return 0; 4110 return 0;
4747 } 4111 }
4748 4112
@@ -4816,7 +4180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4180 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4182 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4183 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4184
4821 if (tp->rx_opt.tstamp_ok) 4185 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4192,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4192
4829 tcp_init_metrics(sk); 4193 tcp_init_metrics(sk);
4830 4194
4195 tcp_init_congestion_control(tp);
4196
4831 /* Prevent spurious tcp_cwnd_restart() on 4197 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4198 * first data packet.
4833 */ 4199 */
@@ -4931,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4931 4297
4932 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4933 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4934 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4935 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4936 } 4302 }
4937 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad..67c670886c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -242,9 +242,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
242 tcp_port_rover = rover; 242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock); 243 spin_unlock(&tcp_portalloc_lock);
244 244
245 /* Exhausted local port range during search? */ 245 /* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
250 */
246 ret = 1; 251 ret = 1;
247 if (remaining <= 0) 252 if (unlikely(remaining <= 0))
248 goto fail; 253 goto fail;
249 254
250 /* OK, here is the one we will use. HEAD is 255 /* OK, here is the one we will use. HEAD is
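The expanded comment above leans on the shape of the surrounding search loop: `remaining` can only reach zero when the outer do/while has walked the whole local port range, and on that path no bind-hash bucket lock is held. A minimal userspace sketch of that loop shape, assuming simplified types and a hypothetical port_in_use() predicate (this is not the kernel code, just the control flow):

/* Sketch of a rover-based local port search; port_in_use() and the lock
 * comments are hypothetical stand-ins for the bind-hash bucket logic. */
#include <stdio.h>

static int port_in_use(int port) { return port % 7 == 0; }   /* fake predicate */

static int pick_local_port(int low, int high, int *rover)
{
        int remaining = (high - low) + 1;
        int port = *rover;

        do {
                port++;
                if (port < low || port > high)
                        port = low;
                /* the kernel takes the bucket lock here; a hit breaks out
                 * of the loop still holding it */
                if (!port_in_use(port))
                        goto found;
                /* the lock is dropped here on the miss path */
        } while (--remaining > 0);

        /* remaining == 0: whole range walked, no lock held -> fail */
        return -1;
found:
        *rover = port;
        return port;
}

int main(void)
{
        int rover = 32767;
        printf("picked %d\n", pick_local_port(32768, 61000, &rover));
        return 0;
}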
@@ -1494,12 +1499,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1494 * to destinations, already remembered 1499 * to destinations, already remembered
1495 * to the moment of synflood. 1500 * to the moment of synflood.
1496 */ 1501 */
1497 NETDEBUG(if (net_ratelimit()) \ 1502 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1498 printk(KERN_DEBUG "TCP: drop open " 1503 "request from %u.%u."
1499 "request from %u.%u." 1504 "%u.%u/%u\n",
1500 "%u.%u/%u\n", \ 1505 NIPQUAD(saddr),
1501 NIPQUAD(saddr), 1506 ntohs(skb->h.th->source)));
1502 ntohs(skb->h.th->source)));
1503 dst_release(dst); 1507 dst_release(dst);
1504 goto drop_and_free; 1508 goto drop_and_free;
1505 } 1509 }
@@ -1627,8 +1631,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
1627 skb->nh.iph->daddr, skb->csum)) 1631 skb->nh.iph->daddr, skb->csum))
1628 return 0; 1632 return 0;
1629 1633
1630 NETDEBUG(if (net_ratelimit()) 1634 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1631 printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1632 skb->ip_summed = CHECKSUM_NONE; 1635 skb->ip_summed = CHECKSUM_NONE;
1633 } 1636 }
1634 if (skb->len <= 76) { 1637 if (skb->len <= 76) {
@@ -2045,9 +2048,10 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2048 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2049 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2050 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2051 tp->mss_cache = 536;
2049 2052
2050 tp->reordering = sysctl_tcp_reordering; 2053 tp->reordering = sysctl_tcp_reordering;
2054 tp->ca_ops = &tcp_init_congestion_ops;
2051 2055
2052 sk->sk_state = TCP_CLOSE; 2056 sk->sk_state = TCP_CLOSE;
2053 2057
@@ -2070,6 +2074,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2070 2074
2071 tcp_clear_xmit_timers(sk); 2075 tcp_clear_xmit_timers(sk);
2072 2076
2077 tcp_cleanup_congestion_control(tp);
2078
2073 /* Cleanup up the write buffer. */ 2079 /* Cleanup up the write buffer. */
2074 sk_stream_writequeue_purge(sk); 2080 sk_stream_writequeue_purge(sk);
2075 2081
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562..f42a284164 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
776 776
777 newtp->ca_ops = &tcp_reno;
778
777 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
778 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
779 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e3..dd30dd137b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{
427 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{ 407{
445 struct tcp_sock *tp = tcp_sk(sk); 408 if (skb->len <= mss_now ||
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 409 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 410 /* Avoid the costly divide in the normal
450 * non-TSO case. 411 * non-TSO case.
@@ -454,10 +415,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 415 } else {
455 unsigned int factor; 416 unsigned int factor;
456 417
457 factor = skb->len + (tp->mss_cache_std - 1); 418 factor = skb->len + (mss_now - 1);
458 factor /= tp->mss_cache_std; 419 factor /= mss_now;
459 skb_shinfo(skb)->tso_segs = factor; 420 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 421 skb_shinfo(skb)->tso_size = mss_now;
461 } 422 }
462} 423}
463 424
@@ -466,7 +427,7 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
466 * packet to the list. This won't be called frequently, I hope. 427 * packet to the list. This won't be called frequently, I hope.
467 * Remember, these are still headerless SKBs at this point. 428 * Remember, these are still headerless SKBs at this point.
468 */ 429 */
469static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) 430static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
470{ 431{
471 struct tcp_sock *tp = tcp_sk(sk); 432 struct tcp_sock *tp = tcp_sk(sk);
472 struct sk_buff *buff; 433 struct sk_buff *buff;
@@ -521,6 +482,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 482 * skbs, which it never sent before. --ANK
522 */ 483 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 484 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
485 buff->stamp = skb->stamp;
524 486
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 487 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 488 tp->lost_out -= tcp_skb_pcount(skb);
@@ -528,8 +490,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
528 } 490 }
529 491
530 /* Fix up tso_factor for both original and new SKB. */ 492 /* Fix up tso_factor for both original and new SKB. */
531 tcp_set_skb_tso_segs(sk, skb); 493 tcp_set_skb_tso_segs(sk, skb, mss_now);
532 tcp_set_skb_tso_segs(sk, buff); 494 tcp_set_skb_tso_segs(sk, buff, mss_now);
533 495
534 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 496 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
535 tp->lost_out += tcp_skb_pcount(skb); 497 tp->lost_out += tcp_skb_pcount(skb);
@@ -542,6 +504,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 504 }
543 505
544 /* Link BUFF into the send queue. */ 506 /* Link BUFF into the send queue. */
507 skb_header_release(buff);
545 __skb_append(skb, buff); 508 __skb_append(skb, buff);
546 509
547 return 0; 510 return 0;
@@ -604,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
604 * factor and mss. 567 * factor and mss.
605 */ 568 */
606 if (tcp_skb_pcount(skb) > 1) 569 if (tcp_skb_pcount(skb) > 1)
607 tcp_set_skb_tso_segs(sk, skb); 570 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
608 571
609 return 0; 572 return 0;
610} 573}
@@ -662,7 +625,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 625
663 /* And store cached results */ 626 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 627 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 628 tp->mss_cache = mss_now;
666 629
667 return mss_now; 630 return mss_now;
668} 631}
@@ -674,57 +637,315 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 637 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 638 * is not a big flaw.
676 */ 639 */
677 640unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 641{
680 struct tcp_sock *tp = tcp_sk(sk); 642 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 643 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 644 u32 mss_now;
645 u16 xmit_size_goal;
646 int doing_tso = 0;
647
648 mss_now = tp->mss_cache;
649
650 if (large_allowed &&
651 (sk->sk_route_caps & NETIF_F_TSO) &&
652 !tp->urg_mode)
653 doing_tso = 1;
683 654
684 mss_now = tp->mss_cache_std;
685 if (dst) { 655 if (dst) {
686 u32 mtu = dst_mtu(dst); 656 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 657 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 658 mss_now = tcp_sync_mss(sk, mtu);
689 } 659 }
690 660
691 do_large = (large && 661 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 662 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 663 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 664
695 if (do_large) { 665 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 666
698 large_mss = 65535 - tp->af_specific->net_header_len - 667 if (doing_tso) {
668 xmit_size_goal = 65535 -
669 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 670 tp->ext_header_len - tp->tcp_header_len;
700 671
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 672 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 673 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 674 xmit_size_goal = max((tp->max_window >> 1),
675 68U - tp->tcp_header_len);
704 676
705 factor = large_mss / mss_now; 677 xmit_size_goal -= (xmit_size_goal % mss_now);
678 }
679 tp->xmit_size_goal = xmit_size_goal;
706 680
707 /* Always keep large mss multiple of real mss, but 681 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 682}
709 * so we can keep the ACK clock ticking and minimize 683
710 * bursting. 684/* Congestion window validation. (RFC2861) */
711 */
712 limit = tp->snd_cwnd;
713 if (sysctl_tcp_tso_win_divisor)
714 limit /= sysctl_tcp_tso_win_divisor;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 685
719 tp->mss_cache = mss_now * factor; 686static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
687{
688 __u32 packets_out = tp->packets_out;
689
690 if (packets_out >= tp->snd_cwnd) {
 691 /* Network is fed fully. */
692 tp->snd_cwnd_used = 0;
693 tp->snd_cwnd_stamp = tcp_time_stamp;
694 } else {
695 /* Network starves. */
696 if (tp->packets_out > tp->snd_cwnd_used)
697 tp->snd_cwnd_used = tp->packets_out;
720 698
721 mss_now = tp->mss_cache; 699 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
700 tcp_cwnd_application_limited(sk);
722 } 701 }
702}
723 703
724 if (tp->rx_opt.eff_sacks) 704static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 705{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 706 u32 window, cwnd_len;
727 return mss_now; 707
708 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
709 cwnd_len = mss_now * cwnd;
710 return min(window, cwnd_len);
711}
712
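tcp_window_allows() above caps a TSO send at whichever is smaller: the bytes still open in the receiver window past this skb's sequence, or the congestion-window quota expressed in bytes. A minimal standalone sketch of that arithmetic, assuming plain integer arguments in place of the tcp_sock/skb fields (hypothetical names, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/* Bytes we may put on the wire for one skb:
 * min(receive-window space, cwnd quota in bytes). */
static uint32_t window_allows(uint32_t snd_una, uint32_t snd_wnd,
                              uint32_t skb_seq, uint32_t mss, uint32_t cwnd_quota)
{
        uint32_t window = snd_una + snd_wnd - skb_seq;  /* space left in peer window */
        uint32_t cwnd_len = mss * cwnd_quota;           /* congestion-window budget */
        return window < cwnd_len ? window : cwnd_len;
}

int main(void)
{
        /* 64 KB window, skb starts 20000 bytes past snd_una, mss 1448, quota 10:
         * window space = 45536, cwnd budget = 14480 -> 14480 bytes allowed. */
        printf("%u\n", window_allows(1000, 65536, 21000, 1448, 10));
        return 0;
}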
713/* Can at least one segment of SKB be sent right now, according to the
714 * congestion window rules? If so, return how many segments are allowed.
715 */
716static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
717{
718 u32 in_flight, cwnd;
719
720 /* Don't be strict about the congestion window for the final FIN. */
721 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
722 return 1;
723
724 in_flight = tcp_packets_in_flight(tp);
725 cwnd = tp->snd_cwnd;
726 if (in_flight < cwnd)
727 return (cwnd - in_flight);
728
729 return 0;
730}
731
732/* This must be invoked the first time we consider transmitting
733 * SKB onto the wire.
734 */
735static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
736{
737 int tso_segs = tcp_skb_pcount(skb);
738
739 if (!tso_segs ||
740 (tso_segs > 1 &&
741 skb_shinfo(skb)->tso_size != mss_now)) {
742 tcp_set_skb_tso_segs(sk, skb, mss_now);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
 754/* Return 0, if packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
 779 /* Nagle rule does not apply to frames that sit in the middle of the
 780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
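The comments above spell out when Nagle, with Minshall's modification, lets a sub-MSS segment go out. A compact userspace predicate restating those rules, assuming plain flags and values (hypothetical names; seq_after() mimics the kernel's after() on 32-bit sequence space) rather than the kernel structures:

#include <stdint.h>
#include <stdio.h>

static int seq_after(uint32_t a, uint32_t b) { return (int32_t)(a - b) > 0; }

/* Returns 1 if a segment of skb_len bytes may be sent now under
 * Nagle + Minshall; mirrors tcp_nagle_test()/tcp_nagle_check(). */
static int nagle_allows(uint32_t skb_len, uint32_t mss,
                        int push, int urg_or_fin, int cork, int nodelay,
                        uint32_t packets_out,
                        uint32_t snd_sml, uint32_t snd_una, uint32_t snd_nxt)
{
        int small_unacked;

        if (push || urg_or_fin)
                return 1;              /* forced push, URG data, or final FIN */
        if (skb_len >= mss)
                return 1;              /* rule 1: full sized */
        if (cork)
                return 0;              /* TCP_CORK always holds small segments */
        if (nodelay)
                return 1;              /* rule 3: TCP_NODELAY */
        /* Minshall: block only if a previously sent small segment is unacked */
        small_unacked = seq_after(snd_sml, snd_una) && !seq_after(snd_sml, snd_nxt);
        return !(packets_out && small_unacked);
}

int main(void)
{
        /* 100-byte write, no socket options, earlier runt still in flight:
         * Nagle holds it back (prints 0). */
        printf("%d\n", nagle_allows(100, 1448, 0, 0, 0, 0, 3, 2000, 1500, 2500));
        return 0;
}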
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb, cur_mss);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 if (skb->len != skb->data_len)
865 return tcp_fragment(sk, skb, len, mss_now);
866
867 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
868 if (unlikely(buff == NULL))
869 return -ENOMEM;
870
871 buff->truesize = nlen;
872 skb->truesize -= nlen;
873
874 /* Correct the sequence numbers. */
875 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
876 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
877 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
878
879 /* PSH and FIN should only be set in the second packet. */
880 flags = TCP_SKB_CB(skb)->flags;
881 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
882 TCP_SKB_CB(buff)->flags = flags;
883
884 /* This packet was never sent out yet, so no SACK bits. */
885 TCP_SKB_CB(buff)->sacked = 0;
886
887 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
888 skb_split(skb, buff, len);
889
890 /* Fix up tso_factor for both original and new SKB. */
891 tcp_set_skb_tso_segs(sk, skb, mss_now);
892 tcp_set_skb_tso_segs(sk, buff, mss_now);
893
894 /* Link BUFF into the send queue. */
895 skb_header_release(buff);
896 __skb_append(skb, buff);
897
898 return 0;
899}
900
901/* Try to defer sending, if possible, in order to minimize the amount
902 * of TSO splitting we do. View it as a kind of TSO Nagle test.
903 *
904 * This algorithm is from John Heffner.
905 */
906static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
907{
908 u32 send_win, cong_win, limit, in_flight;
909
910 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
911 return 0;
912
913 if (tp->ca_state != TCP_CA_Open)
914 return 0;
915
916 in_flight = tcp_packets_in_flight(tp);
917
918 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
919 (tp->snd_cwnd <= in_flight));
920
921 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
922
923 /* From in_flight test above, we know that cwnd > in_flight. */
924 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
925
926 limit = min(send_win, cong_win);
927
928 if (sysctl_tcp_tso_win_divisor) {
929 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
930
931 /* If at least some fraction of a window is available,
932 * just use it.
933 */
934 chunk /= sysctl_tcp_tso_win_divisor;
935 if (limit >= chunk)
936 return 0;
937 } else {
938 /* Different approach, try not to defer past a single
939 * ACK. Receiver should ACK every other full sized
940 * frame, so if we have space for more than 3 frames
941 * then send now.
942 */
943 if (limit > tcp_max_burst(tp) * tp->mss_cache)
944 return 0;
945 }
946
947 /* Ok, it looks like it is advisable to defer. */
948 return 1;
728} 949}
729 950
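tcp_tso_should_defer() above weighs how much could go out right now (the smaller of the send-window space and the free congestion window, in bytes) against either a fraction of the window (tcp_tso_win_divisor) or a small burst allowance. A standalone sketch of that decision, assuming plain integers, an explicit max_burst argument, and a slightly simplified chunk computation (hypothetical names, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/* Return 1 to defer a partially-filled TSO frame, 0 to send now. */
static int tso_should_defer(uint32_t send_win, uint32_t cwnd, uint32_t in_flight,
                            uint32_t mss, uint32_t win_divisor, uint32_t max_burst)
{
        uint32_t cong_win, limit;

        if (cwnd <= in_flight)
                return 0;                       /* caller guarantees cwnd > in_flight */

        cong_win = (cwnd - in_flight) * mss;    /* free congestion window, bytes */
        limit = send_win < cong_win ? send_win : cong_win;

        if (win_divisor) {
                uint32_t chunk = (cwnd * mss) / win_divisor;
                if (limit >= chunk)
                        return 0;               /* enough of the window is free: send */
        } else {
                if (limit > max_burst * mss)
                        return 0;               /* room for several full frames: send */
        }
        return 1;                               /* too little room: wait for more data/ACKs */
}

int main(void)
{
        /* cwnd 20, 18 segments in flight, divisor 3: the free space (2*1448 bytes)
         * is well below a third of the window, so the sketch defers (prints 1). */
        printf("%d\n", tso_should_defer(60000, 20, 18, 1448, 3, 3));
        return 0;
}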
730/* This routine writes packets to the network. It advances the 951/* This routine writes packets to the network. It advances the
@@ -734,57 +955,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 955 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 956 * cannot send anything now because of SWS or another problem.
736 */ 957 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 958static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 959{
739 struct tcp_sock *tp = tcp_sk(sk); 960 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 961 struct sk_buff *skb;
962 unsigned int tso_segs, sent_pkts;
963 int cwnd_quota;
741 964
742 /* If we are closed, the bytes will have to remain here. 965 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 966 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 967 * will be happy.
745 */ 968 */
746 if (sk->sk_state != TCP_CLOSE) { 969 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 970 return 0;
748 int sent_pkts = 0;
749 971
750 /* Account for SACKS, we may need to fragment due to this. 972 sent_pkts = 0;
751 * It is just like the real MSS changing on us midstream. 973 while ((skb = sk->sk_send_head)) {
752 * We also handle things correctly when the user adds some 974 unsigned int limit;
753 * IP options mid-stream. Silly to do, but cover it.
754 */
755 mss_now = tcp_current_mss(sk, 1);
756
757 while ((skb = sk->sk_send_head) &&
758 tcp_snd_test(sk, skb, mss_now,
759 tcp_skb_is_last(sk, skb) ? nonagle :
760 TCP_NAGLE_PUSH)) {
761 if (skb->len > mss_now) {
762 if (tcp_fragment(sk, skb, mss_now))
763 break;
764 }
765 975
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 976 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
767 tcp_tso_set_push(skb); 977 BUG_ON(!tso_segs);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) 978
979 cwnd_quota = tcp_cwnd_test(tp, skb);
980 if (!cwnd_quota)
981 break;
982
983 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
984 break;
985
986 if (tso_segs == 1) {
987 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
988 (tcp_skb_is_last(sk, skb) ?
989 nonagle : TCP_NAGLE_PUSH))))
769 break; 990 break;
991 } else {
992 if (tcp_tso_should_defer(sk, tp, skb))
993 break;
994 }
770 995
771 /* Advance the send_head. This one is sent out. 996 limit = mss_now;
772 * This call will increment packets_out. 997 if (tso_segs > 1) {
773 */ 998 limit = tcp_window_allows(tp, skb,
774 update_send_head(sk, tp, skb); 999 mss_now, cwnd_quota);
1000
1001 if (skb->len < limit) {
1002 unsigned int trim = skb->len % mss_now;
775 1003
776 tcp_minshall_update(tp, mss_now, skb); 1004 if (trim)
777 sent_pkts = 1; 1005 limit = skb->len - trim;
1006 }
778 } 1007 }
779 1008
780 if (sent_pkts) { 1009 if (skb->len > limit &&
781 tcp_cwnd_validate(sk, tp); 1010 unlikely(tso_fragment(sk, skb, limit, mss_now)))
782 return 0; 1011 break;
1012
1013 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1014
1015 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
1016 break;
1017
1018 /* Advance the send_head. This one is sent out.
1019 * This call will increment packets_out.
1020 */
1021 update_send_head(sk, tp, skb);
1022
1023 tcp_minshall_update(tp, mss_now, skb);
1024 sent_pkts++;
1025 }
1026
1027 if (likely(sent_pkts)) {
1028 tcp_cwnd_validate(sk, tp);
1029 return 0;
1030 }
1031 return !tp->packets_out && sk->sk_send_head;
1032}
1033
1034/* Push out any pending frames which were held back due to
1035 * TCP_CORK or attempt at coalescing tiny packets.
1036 * The socket must be locked by the caller.
1037 */
1038void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1039 unsigned int cur_mss, int nonagle)
1040{
1041 struct sk_buff *skb = sk->sk_send_head;
1042
1043 if (skb) {
1044 if (tcp_write_xmit(sk, cur_mss, nonagle))
1045 tcp_check_probe_timer(sk, tp);
1046 }
1047}
1048
1049/* Send _single_ skb sitting at the send head. This function requires
1050 * true push pending frames to setup probe timer etc.
1051 */
1052void tcp_push_one(struct sock *sk, unsigned int mss_now)
1053{
1054 struct tcp_sock *tp = tcp_sk(sk);
1055 struct sk_buff *skb = sk->sk_send_head;
1056 unsigned int tso_segs, cwnd_quota;
1057
1058 BUG_ON(!skb || skb->len < mss_now);
1059
1060 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1061 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1062
1063 if (likely(cwnd_quota)) {
1064 unsigned int limit;
1065
1066 BUG_ON(!tso_segs);
1067
1068 limit = mss_now;
1069 if (tso_segs > 1) {
1070 limit = tcp_window_allows(tp, skb,
1071 mss_now, cwnd_quota);
1072
1073 if (skb->len < limit) {
1074 unsigned int trim = skb->len % mss_now;
1075
1076 if (trim)
1077 limit = skb->len - trim;
1078 }
783 } 1079 }
784 1080
785 return !tp->packets_out && sk->sk_send_head; 1081 if (skb->len > limit &&
1082 unlikely(tso_fragment(sk, skb, limit, mss_now)))
1083 return;
1084
1085 /* Send it out now. */
1086 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1087
1088 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1089 update_send_head(sk, tp, skb);
1090 tcp_cwnd_validate(sk, tp);
1091 return;
1092 }
786 } 1093 }
787 return 0;
788} 1094}
789 1095
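Inside the rewritten tcp_write_xmit()/tcp_push_one() loop above, a TSO skb is sent only up to `limit` bytes: the window/cwnd allowance, trimmed down to a whole number of MSS when the skb itself is shorter than that allowance, with anything beyond `limit` split off via tso_fragment(). A tiny sketch of just the limit computation, assuming plain integers and hypothetical names:

#include <stdint.h>
#include <stdio.h>

/* How many bytes of this skb to transmit now.  'allowed' stands in for the
 * value tcp_window_allows() would return; the skb is split at the result
 * when it is longer than that. */
static uint32_t xmit_limit(uint32_t skb_len, uint32_t mss, uint32_t allowed)
{
        uint32_t limit = allowed;

        if (skb_len < limit) {
                /* Whole skb fits: trim to a multiple of mss so every TSO
                 * sub-segment is full sized; the tail waits for more data. */
                uint32_t trim = skb_len % mss;
                limit = trim ? skb_len - trim : skb_len;
        }
        return limit;
}

int main(void)
{
        /* 10000-byte skb, mss 1448, 20000 bytes allowed: trim 10000 % 1448 = 1312,
         * so 8688 bytes (6 full segments) go out now (prints 8688). */
        printf("%u\n", xmit_limit(10000, 1448, 20000));
        return 0;
}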
790/* This function returns the amount that we can raise the 1096/* This function returns the amount that we can raise the
@@ -1044,7 +1350,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1350 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1351 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1352 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1353 }
1049 1354
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1355 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1062,15 +1367,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1062 1367
1063 if (skb->len > cur_mss) { 1368 if (skb->len > cur_mss) {
1064 int old_factor = tcp_skb_pcount(skb); 1369 int old_factor = tcp_skb_pcount(skb);
1065 int new_factor; 1370 int diff;
1066 1371
1067 if (tcp_fragment(sk, skb, cur_mss)) 1372 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1068 return -ENOMEM; /* We'll try again later. */ 1373 return -ENOMEM; /* We'll try again later. */
1069 1374
1070 /* New SKB created, account for it. */ 1375 /* New SKB created, account for it. */
1071 new_factor = tcp_skb_pcount(skb); 1376 diff = old_factor - tcp_skb_pcount(skb) -
1072 tp->packets_out -= old_factor - new_factor; 1377 tcp_skb_pcount(skb->next);
1073 tp->packets_out += tcp_skb_pcount(skb->next); 1378 tp->packets_out -= diff;
1379
1380 if (diff > 0) {
1381 tp->fackets_out -= diff;
1382 if ((int)tp->fackets_out < 0)
1383 tp->fackets_out = 0;
1384 }
1074 } 1385 }
1075 1386
1076 /* Collapse two adjacent packets if worthwhile and we can. */ 1387 /* Collapse two adjacent packets if worthwhile and we can. */
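The new accounting above handles the case where re-segmenting a retransmitted skb at the current MSS changes the total packet count: `diff` is the number of segments the skb pair lost relative to the original (negative if it gained some), and packets_out/fackets_out follow it. A short arithmetic sketch of that bookkeeping, assuming plain integers and hypothetical names:

#include <stdio.h>

/* Re-count a TSO skb of skb_len bytes split at 'split' bytes, both halves
 * segmented at mss, and report the old-minus-new segment count. */
static int refragment_diff(int skb_len, int split, int mss, int old_factor)
{
        int head = (split + mss - 1) / mss;              /* segments in the kept part */
        int tail = (skb_len - split + mss - 1) / mss;    /* segments in the new part */
        return old_factor - head - tail;
}

int main(void)
{
        /* 7240 bytes originally counted as 5 x 1448.  Retransmitting with a smaller
         * mss of 1200 and splitting at 1200 gives 1 + 6 = 7 segments, so diff = -2:
         * packets_out grows by 2, and fackets_out is reduced only when diff > 0. */
        printf("%d\n", refragment_diff(7240, 1200, 1200, 5));
        return 0;
}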
@@ -1106,7 +1417,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1417 * is still in somebody's hands, else make a clone.
1107 */ 1418 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1419 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1420
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1421 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1422 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1600,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1600 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1601 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1602 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1603void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1604{
1295 struct tcp_sock *tp = tcp_sk(sk); 1605 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1606 struct sk_buff *skb;
@@ -1449,7 +1759,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1759 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1760 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1761 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1762
1454 tcp_select_initial_window(tcp_full_space(sk), 1763 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1764 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1812,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1812 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1813 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1814 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1815
1508 /* Send it off. */ 1816 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1817 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1670,21 +1978,19 @@ int tcp_write_wakeup(struct sock *sk)
1670 skb->len > mss) { 1978 skb->len > mss) {
1671 seg_size = min(seg_size, mss); 1979 seg_size = min(seg_size, mss);
1672 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1980 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1673 if (tcp_fragment(sk, skb, seg_size)) 1981 if (tcp_fragment(sk, skb, seg_size, mss))
1674 return -1; 1982 return -1;
1675 /* SWS override triggered forced fragmentation. 1983 /* SWS override triggered forced fragmentation.
1676 * Disable TSO, the connection is too sick. */ 1984 * Disable TSO, the connection is too sick. */
1677 if (sk->sk_route_caps & NETIF_F_TSO) { 1985 if (sk->sk_route_caps & NETIF_F_TSO) {
1678 sock_set_flag(sk, SOCK_NO_LARGESEND); 1986 sock_set_flag(sk, SOCK_NO_LARGESEND);
1679 sk->sk_route_caps &= ~NETIF_F_TSO; 1987 sk->sk_route_caps &= ~NETIF_F_TSO;
1680 tp->mss_cache = tp->mss_cache_std;
1681 } 1988 }
1682 } else if (!tcp_skb_pcount(skb)) 1989 } else if (!tcp_skb_pcount(skb))
1683 tcp_set_skb_tso_segs(sk, skb); 1990 tcp_set_skb_tso_segs(sk, skb, mss);
1684 1991
1685 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 1992 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1686 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1993 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1687 tcp_tso_set_push(skb);
1688 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 1994 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1689 if (!err) { 1995 if (!err) {
1690 update_send_head(sk, tp, skb); 1996 update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 0000000000..70e108e15c
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
 3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@sc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
 12/* These factors are derived from the recommended values in the paper:
 13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
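As a quick illustration of the constants above: the additive phase raises cwnd by one segment roughly every min(cwnd, 50) ACKed segments, and a loss event cuts the threshold by 1/8 of cwnd. A self-contained sketch of those two updates, assuming plain integers and a simplified loss path (the cwnd = ssthresh assignment stands in for what the core recovery code does; these are not the kernel hooks):

#include <stdio.h>
#include <stdint.h>

#define AI_CNT   50u   /* TCP_SCALABLE_AI_CNT */
#define MD_SCALE 3     /* TCP_SCALABLE_MD_SCALE: reduce by cwnd >> 3 on loss */

struct st { uint32_t cwnd, ssthresh, cnt; };

/* One ACK worth of congestion avoidance, as in tcp_scalable_cong_avoid(). */
static void st_on_ack(struct st *s)
{
        if (s->cwnd <= s->ssthresh) {
                s->cwnd++;                              /* slow start */
        } else if (++s->cnt > (s->cwnd < AI_CNT ? s->cwnd : AI_CNT)) {
                s->cwnd++;                              /* scalable AI: ~1/50 per ACK */
                s->cnt = 0;
        }
}

/* Loss: ssthresh = cwnd - cwnd/8, floored at 2 (tcp_scalable_ssthresh()). */
static void st_on_loss(struct st *s)
{
        uint32_t t = s->cwnd - (s->cwnd >> MD_SCALE);
        s->ssthresh = t > 2 ? t : 2;
        s->cwnd = s->ssthresh;                          /* simplification, see above */
}

int main(void)
{
        struct st s = { .cwnd = 100, .ssthresh = 80, .cnt = 0 };
        for (int i = 0; i < 1000; i++)
                st_on_ack(&s);
        printf("after 1000 acks: cwnd=%u\n", s.cwnd);   /* grows by roughly 1000/50 */
        st_on_loss(&s);
        printf("after loss:      cwnd=%u\n", s.cwnd);   /* cut by about 1/8 */
        return 0;
}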
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b44985..0084227438 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 0000000000..9bd443db51
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract info for Tcp socket info provided via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
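
For context on tcp_vegas_cong_avoid() above: the window math is done in fixed point with V_PARAM_SHIFT fractional bits, and the estimated queue backlog ("Diff") is compared against the alpha/beta/gamma thresholds. Below is a minimal user-space sketch of that arithmetic; V_PARAM_SHIFT and the alpha/beta values are assumptions chosen for illustration (the real definitions sit earlier in tcp_vegas.c), and the sample numbers are made up.

#include <stdio.h>
#include <stdint.h>

#define V_PARAM_SHIFT 1                             /* assumed fractional bits */
static const uint32_t alpha = 1 << V_PARAM_SHIFT;   /* ~1 extra segment queued */
static const uint32_t beta  = 3 << V_PARAM_SHIFT;   /* ~3 extra segments queued */

int main(void)
{
	uint32_t old_wnd = 20;    /* segments in flight over the last RTT */
	uint32_t baseRTT = 100;   /* propagation-only RTT, in ms */
	uint32_t rtt     = 125;   /* minimum RTT observed during the last RTT */

	/* cwnd we "should" have: (actual rate) * baseRTT, kept in fixed point */
	uint32_t target_cwnd = ((old_wnd * baseRTT) << V_PARAM_SHIFT) / rtt;

	/* extra segments sitting in queues -- the "Diff" of the Vegas papers */
	int32_t diff = (int32_t)((old_wnd << V_PARAM_SHIFT) - target_cwnd);

	/* 20*100*2/125 = 32 -> target 16.0 segs; diff = 40-32 = 8 -> 4.0 segs.
	 * 4.0 > beta (3 segments), so congestion avoidance would drop cwnd by 1. */
	printf("target_cwnd = %u.%u segs, diff = %d.%d segs (alpha=%u beta=%u)\n",
	       target_cwnd >> V_PARAM_SHIFT, (target_cwnd & 1) * 5,
	       diff >> V_PARAM_SHIFT, (diff & 1) * 5,
	       alpha >> V_PARAM_SHIFT, beta >> V_PARAM_SHIFT);
	return 0;
}
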
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 0000000000..ef827242c9
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first-stage (lightly smoothed) bandwidth estimate */
17 u32 bw_est; /* smoothed bandwidth estimate */
18 u32 rtt_win_sx; /* start (jiffies) of the current RTT evaluation window */
19 u32 bk; /* bytes acked during the current window */
20 u32 snd_una; /* snapshot of snd_una, for counting newly acked bytes */
21 u32 cumul_ack; /* bytes credited by the last ack (see westwood_acked_count) */
22 u32 accounted; /* bytes already credited for duplicate acks */
23 u32 rtt; /* last RTT sample (srtt >> 3, in jiffies) */
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes the fields used by TCP Westwood+.
35 * It is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin yet, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value is deliberately conservative,
39 * so that it is sure to be replaced by a real sample as soon as
40 * possible, which will reasonably happen within the first RTT
41 * of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * Updates the RTT evaluation window when enough time has passed,
87 * and in that case runs the filter to update the bandwidth estimate.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if an RTT window has passed.
96 * If less than max(RTT, 50ms) has elapsed we do not
97 * filter but keep 'building the sample', because a
98 * bandwidth estimate over such a short interval
99 * would be too noisy to be useful.
100 * On a LAN the window will therefore usually be
101 * right_bound = left_bound + WESTWOOD_RTT_MIN
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * Called on the fast path, i.e. when header prediction
114 * succeeds. In that case the update is straightforward
115 * and needs no particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function computes cumul_ack, used to update bk, handling
131 * delayed and partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * The limit is the bandwidth estimate times RTTmin, converted to
172 * packets via mss_cache. The result is clamped to >= 2, so this
173 * never returns 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Fill in Westwood state for the tcp_diag netlink interface (reuses the Vegas TLV). */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
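
For context on the file above: westwood_do_filter() is a constant-coefficient low-pass filter (a 7/8 EWMA), westwood_filter() cascades it twice over the raw bk/delta sample, and westwood_bw_rttmin() converts bandwidth * RTTmin into packets with a floor of 2. A small user-space sketch of that arithmetic follows; all sample values (and the steady-state filter seed) are invented for illustration.

#include <stdio.h>
#include <stdint.h>

/* Same shape as westwood_do_filter(): new = (7*old + sample) / 8 */
static uint32_t do_filter(uint32_t a, uint32_t b)
{
	return ((7 * a) + b) >> 3;
}

int main(void)
{
	/* Seed the filters at a steady state instead of 0 so the numbers
	 * are readable; the kernel warms them up from 0 over time. */
	uint32_t bw_ns_est = 1200, bw_est = 1200;   /* bytes per jiffy */
	uint32_t bk      = 60000;   /* bytes acked in the last window */
	uint32_t delta   = 50;      /* window length in jiffies (>= RTT_MIN) */
	uint32_t rtt_min = 40;      /* minimum observed RTT, in jiffies */
	uint32_t mss     = 1460;    /* bytes */

	/* westwood_filter(): two cascaded low-pass stages over bk/delta */
	bw_ns_est = do_filter(bw_ns_est, bk / delta);
	bw_est    = do_filter(bw_est, bw_ns_est);

	/* westwood_bw_rttmin(): BWE * RTTmin in packets, never below 2.
	 * This is what CA_EVENT_COMPLETE_CWR copies into cwnd/ssthresh. */
	uint32_t win = (bw_est * rtt_min) / mss;
	if (win < 2)
		win = 2;

	printf("bw_est = %u bytes/jiffy, post-loss window = %u packets\n",
	       bw_est, win);
	return 0;
}
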
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64b44..dc4d07357e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -628,7 +628,7 @@ back_from_confirm:
628 /* ... which is an evident application bug. --ANK */ 628 /* ... which is an evident application bug. --ANK */
629 release_sock(sk); 629 release_sock(sk);
630 630
631 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); 631 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
632 err = -EINVAL; 632 err = -EINVAL;
633 goto out; 633 goto out;
634 } 634 }
@@ -693,7 +693,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
693 if (unlikely(!up->pending)) { 693 if (unlikely(!up->pending)) {
694 release_sock(sk); 694 release_sock(sk);
695 695
696 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); 696 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n"));
697 return -EINVAL; 697 return -EINVAL;
698 } 698 }
699 699
@@ -1102,7 +1102,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1102 skb->ip_summed = CHECKSUM_UNNECESSARY; 1102 skb->ip_summed = CHECKSUM_UNNECESSARY;
1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1104 return 0; 1104 return 0;
1105 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); 1105 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
1106 skb->ip_summed = CHECKSUM_NONE; 1106 skb->ip_summed = CHECKSUM_NONE;
1107 } 1107 }
1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,14 +1181,13 @@ int udp_rcv(struct sk_buff *skb)
1181 return(0); 1181 return(0);
1182 1182
1183short_packet: 1183short_packet:
1184 NETDEBUG(if (net_ratelimit()) 1184 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1185 printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", 1185 NIPQUAD(saddr),
1186 NIPQUAD(saddr), 1186 ntohs(uh->source),
1187 ntohs(uh->source), 1187 ulen,
1188 ulen, 1188 len,
1189 len, 1189 NIPQUAD(daddr),
1190 NIPQUAD(daddr), 1190 ntohs(uh->dest)));
1191 ntohs(uh->dest)));
1192no_header: 1191no_header:
1193 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1192 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1194 kfree_skb(skb); 1193 kfree_skb(skb);
@@ -1199,13 +1198,12 @@ csum_error:
1199 * RFC1122: OK. Discards the bad packet silently (as far as 1198 * RFC1122: OK. Discards the bad packet silently (as far as
1200 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1199 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1201 */ 1200 */
1202 NETDEBUG(if (net_ratelimit()) 1201 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1203 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", 1202 NIPQUAD(saddr),
1204 NIPQUAD(saddr), 1203 ntohs(uh->source),
1205 ntohs(uh->source), 1204 NIPQUAD(daddr),
1206 NIPQUAD(daddr), 1205 ntohs(uh->dest),
1207 ntohs(uh->dest), 1206 ulen));
1208 ulen));
1209drop: 1207drop:
1210 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1208 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1211 kfree_skb(skb); 1209 kfree_skb(skb);
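
The udp.c hunks above replace open-coded NETDEBUG(if (net_ratelimit()) printk(...)) calls with LIMIT_NETDEBUG(printk(...)), folding the rate limiting into the macro. The macro itself lives in include/net/sock.h and is not shown in this diff; the sketch below is the shape implied by these call sites, offered as an assumption rather than the verbatim definition.

/* Assumed shape only -- see include/net/sock.h for the real definition. */
#define LIMIT_NETDEBUG(x) \
	do { if (net_ratelimit()) { x; } } while (0)

Either way, the converted call sites keep the same rate-limited behaviour with less boilerplate.
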
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
deleted file mode 100644
index 6aecd7a435..0000000000
--- a/net/ipv4/utils.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Various kernel-resident INET utility functions; mainly
7 * for format conversion and debugging output.
8 *
9 * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
10 *
11 * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *
13 * Fixes:
14 * Alan Cox : verify_area check.
15 * Alan Cox : removed old debugging.
16 * Andi Kleen : add net_ratelimit()
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/module.h>
25#include <linux/types.h>
26#include <asm/byteorder.h>
27
28/*
29 * Convert an ASCII string to binary IP.
30 */
31
32__u32 in_aton(const char *str)
33{
34 unsigned long l;
35 unsigned int val;
36 int i;
37
38 l = 0;
39 for (i = 0; i < 4; i++)
40 {
41 l <<= 8;
42 if (*str != '\0')
43 {
44 val = 0;
45 while (*str != '\0' && *str != '.')
46 {
47 val *= 10;
48 val += *str - '0';
49 str++;
50 }
51 l |= val;
52 if (*str != '\0')
53 str++;
54 }
55 }
56 return(htonl(l));
57}
58
59EXPORT_SYMBOL(in_aton);
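
The deleted helper above parses a dotted-quad string into a network-byte-order address, with very little validation. For reference, here is a user-space equivalent plus a quick sanity check against inet_addr(); the wrapper is illustrative only and not kernel code.

#include <stdio.h>
#include <arpa/inet.h>          /* htonl(), inet_addr() */

/* User-space copy of the removed in_aton() logic, for illustration. */
static unsigned int my_in_aton(const char *str)
{
	unsigned long l = 0;
	unsigned int val;
	int i;

	for (i = 0; i < 4; i++) {
		l <<= 8;
		if (*str != '\0') {
			val = 0;
			while (*str != '\0' && *str != '.') {
				val *= 10;
				val += *str - '0';
				str++;
			}
			l |= val;
			if (*str != '\0')
				str++;
		}
	}
	return htonl(l);
}

int main(void)
{
	/* For well-formed dotted quads the result matches inet_addr(). */
	printf("%s\n", my_in_aton("192.168.1.1") == inet_addr("192.168.1.1")
	       ? "match" : "mismatch");
	return 0;
}
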
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index e1fe360ed2..afbb0d4cc3 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -78,10 +78,9 @@ static int ipip_rcv(struct sk_buff *skb)
78static void ipip_err(struct sk_buff *skb, u32 info) 78static void ipip_err(struct sk_buff *skb, u32 info)
79{ 79{
80 struct xfrm_tunnel *handler = ipip_handler; 80 struct xfrm_tunnel *handler = ipip_handler;
81 u32 arg = info;
82 81
83 if (handler) 82 if (handler)
84 handler->err_handler(skb, &arg); 83 handler->err_handler(skb, info);
85} 84}
86 85
87static int ipip_init_state(struct xfrm_state *x) 86static int ipip_init_state(struct xfrm_state *x)