Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                              |    8
-rw-r--r--  net/ipv4/Makefile                             |    1
-rw-r--r--  net/ipv4/af_inet.c                            |   19
-rw-r--r--  net/ipv4/ah4.c                                |    1
-rw-r--r--  net/ipv4/arp.c                                |    1
-rw-r--r--  net/ipv4/devinet.c                            |    1
-rw-r--r--  net/ipv4/esp4.c                               |    1
-rw-r--r--  net/ipv4/fib_frontend.c                       |    1
-rw-r--r--  net/ipv4/fib_hash.c                           |    1
-rw-r--r--  net/ipv4/fib_rules.c                          |    1
-rw-r--r--  net/ipv4/fib_semantics.c                      |    2
-rw-r--r--  net/ipv4/fib_trie.c                           |    8
-rw-r--r--  net/ipv4/icmp.c                               |    1
-rw-r--r--  net/ipv4/igmp.c                               |    2
-rw-r--r--  net/ipv4/inet_connection_sock.c               |   25
-rw-r--r--  net/ipv4/inet_diag.c                          |   14
-rw-r--r--  net/ipv4/inet_hashtables.c                    |  178
-rw-r--r--  net/ipv4/inet_timewait_sock.c                 |    5
-rw-r--r--  net/ipv4/inetpeer.c                           |    1
-rw-r--r--  net/ipv4/ip_fragment.c                        |   68
-rw-r--r--  net/ipv4/ip_input.c                           |    1
-rw-r--r--  net/ipv4/ip_options.c                         |    1
-rw-r--r--  net/ipv4/ip_output.c                          |    1
-rw-r--r--  net/ipv4/ip_sockglue.c                        |   14
-rw-r--r--  net/ipv4/ipcomp.c                             |    1
-rw-r--r--  net/ipv4/ipconfig.c                           |    2
-rw-r--r--  net/ipv4/ipmr.c                               |    1
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c                     |   28
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c                    |   21
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c                    |    2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c                     |   10
-rw-r--r--  net/ipv4/ipvs/ip_vs_dh.c                      |    2
-rw-r--r--  net/ipv4/ipvs/ip_vs_est.c                     |    3
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblc.c                    |   29
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c                   |   29
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_ah.c                |    2
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_esp.c               |    2
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_tcp.c               |   24
-rw-r--r--  net/ipv4/ipvs/ip_vs_proto_udp.c               |    3
-rw-r--r--  net/ipv4/ipvs/ip_vs_sh.c                      |    2
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c                    |    2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c               |  175
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_amanda.c      |    2
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_gre.c   |    1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_proto_udp.c   |    1
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c  |    1
-rw-r--r--  net/ipv4/netfilter/ip_nat_snmp_basic.c        |    2
-rw-r--r--  net/ipv4/netfilter/ip_tables.c                |  199
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c           |    2
-rw-r--r--  net/ipv4/netfilter/ipt_physdev.c              |    1
-rw-r--r--  net/ipv4/proc.c                               |    1
-rw-r--r--  net/ipv4/syncookies.c                         |    4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                    |   11
-rw-r--r--  net/ipv4/tcp.c                                |   10
-rw-r--r--  net/ipv4/tcp_bic.c                            |   85
-rw-r--r--  net/ipv4/tcp_cong.c                           |   28
-rw-r--r--  net/ipv4/tcp_cubic.c                          |  411
-rw-r--r--  net/ipv4/tcp_input.c                          |   99
-rw-r--r--  net/ipv4/tcp_ipv4.c                           |  269
-rw-r--r--  net/ipv4/tcp_minisocks.c                      |   16
-rw-r--r--  net/ipv4/tcp_output.c                         |  118
-rw-r--r--  net/ipv4/tcp_vegas.c                          |    4
-rw-r--r--  net/ipv4/udp.c                                |   22
63 files changed, 1330 insertions(+), 651 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f4..011cca7ae02b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
 	  increase provides TCP friendliness.
 	  See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
 
+config TCP_CONG_CUBIC
+	tristate "CUBIC TCP"
+	default m
+	---help---
+	This is version 2.0 of BIC-TCP which uses a cubic growth function
+	among other techniques.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
 config TCP_CONG_WESTWOOD
 	tristate "TCP Westwood+"
 	default m
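
The help text above names CUBIC's cubic growth function without spelling it out. As a rough user-space sketch (illustrative constants; the actual tcp_cubic.c added by this change uses fixed-point arithmetic, not libm), the congestion window t seconds after a loss event follows W(t) = C*(t - K)^3 + W_max, with K = cbrt(W_max*beta/C) so that W(0) equals the post-loss window W_max*(1 - beta):

    #include <math.h>
    #include <stdio.h>

    /* Illustrative values only: C_SCALE scales the curve, BETA is the
     * multiplicative-decrease fraction applied on loss. */
    static const double C_SCALE = 0.4;
    static const double BETA = 0.2;

    /* Target window t seconds after the loss that recorded w_max. */
    static double cubic_window(double t, double w_max)
    {
            double k = cbrt(w_max * BETA / C_SCALE);
            return C_SCALE * pow(t - k, 3.0) + w_max;
    }

    int main(void)
    {
            for (double t = 0.0; t <= 10.0; t += 2.0)
                    printf("t=%.0fs cwnd=%.1f\n", t, cubic_window(t, 100.0));
            return 0;
    }

The curve is concave below W_max and convex above it, which keeps probing gentle near the previous loss point and aggressive far away from it.
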
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6b..c54edd76de09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf249000..966a071a408c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -302,6 +303,7 @@ lookup_protocol:
 	sk->sk_reuse = 1;
 
 	inet = inet_sk(sk);
+	inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
 
 	if (SOCK_RAW == sock->type) {
 		inet->num = protocol;
@@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		err = devinet_ioctl(cmd, (void __user *)arg);
 		break;
 	default:
-		if (!sk->sk_prot->ioctl ||
-		    (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
-		    -ENOIOCTLCMD)
-			err = dev_ioctl(cmd, (void __user *)arg);
+		if (sk->sk_prot->ioctl)
+			err = sk->sk_prot->ioctl(sk, cmd, arg);
+		else
+			err = -ENOIOCTLCMD;
 		break;
 	}
 	return err;
 }
 
-struct proto_ops inet_stream_ops = {
+const struct proto_ops inet_stream_ops = {
 	.family = PF_INET,
 	.owner = THIS_MODULE,
 	.release = inet_release,
@@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = {
 	.sendpage = tcp_sendpage
 };
 
-struct proto_ops inet_dgram_ops = {
+const struct proto_ops inet_dgram_ops = {
 	.family = PF_INET,
 	.owner = THIS_MODULE,
 	.release = inet_release,
@@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = {
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
  * udp_poll
  */
-static struct proto_ops inet_sockraw_ops = {
+static const struct proto_ops inet_sockraw_ops = {
 	.family = PF_INET,
 	.owner = THIS_MODULE,
 	.release = inet_release,
@@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] =
 		.ops = &inet_stream_ops,
 		.capability = -1,
 		.no_check = 0,
-		.flags = INET_PROTOSW_PERMANENT,
+		.flags = INET_PROTOSW_PERMANENT |
+			 INET_PROTOSW_ICSK,
 	},
 
 	{
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1ba..aed537fa2c88 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <asm/scatterlist.h>
 
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d7..37432088fe6d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
 #include <linux/in.h>
 #include <linux/mm.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a2..7b9bb28e2ee9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
 #endif
 #include <linux/kmod.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b7..73bfcae8af9c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
 #include <linux/pfkeyv2.h>
 #include <linux/random.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <net/udp.h>
 
 /* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d687..18f5e509281a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb169..e2890ec8159e 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc1518..0dd4d06e456d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e3..ef4724de7350 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
 #include <linux/netlink.h>
 #include <linux/init.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df9..e320b32373e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <davem@davemloft.net>
+ * Stephen Hemminger <shemminger@osdl.org>
+ * Paul E. McKenney <paulmck@us.ibm.com>
+ * Patrick McHardy <kaber@trash.net>
  */
 
 #define VERSION "0.404"
@@ -59,6 +66,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d2..be5a519cd2f8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/string.h>
 #include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f01..34758118c10c 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a566..ae20281d8deb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  */
 int sysctl_local_port_range[2] = { 1024, 4999 };
 
-static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+int inet_csk_bind_conflict(const struct sock *sk,
+			   const struct inet_bind_bucket *tb)
 {
 	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
 	struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
 	return node != NULL;
 }
 
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  */
 int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-		      struct sock *sk, unsigned short snum)
+		      struct sock *sk, unsigned short snum,
+		      int (*bind_conflict)(const struct sock *sk,
+					   const struct inet_bind_bucket *tb))
 {
 	struct inet_bind_hashbucket *head;
 	struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
 			goto success;
 		} else {
 			ret = 1;
-			if (inet_csk_bind_conflict(sk, tb))
+			if (bind_conflict(sk, tb))
 				goto fail_unlock;
 		}
 	}
@@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
 EXPORT_SYMBOL_GPL(inet_csk_search_req);
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
-				   const unsigned timeout)
+				   unsigned long timeout)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
@@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
 }
 
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	const struct inet_sock *inet = inet_sk(sk);
+
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = inet->daddr;
+	sin->sin_port = inet->dport;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
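
The inet_csk_get_port() change above is a parameterization: the bind-conflict test becomes a caller-supplied callback so other protocol families can reuse the generic port allocator with their own address-comparison rules. A toy model of just that pattern (names and logic made up for illustration, not kernel API):

    #include <stdio.h>

    struct bucket { int port; int in_use; };

    typedef int (*conflict_fn)(const struct bucket *tb);

    /* Stand-in for a family-specific inet_csk_bind_conflict(). */
    static int ipv4_conflict(const struct bucket *tb)
    {
            return tb->in_use;
    }

    /* Generic allocator, analogous to the new inet_csk_get_port()
     * signature that takes a bind_conflict callback. */
    static int get_port(struct bucket *tb, conflict_fn bind_conflict)
    {
            if (bind_conflict(tb))
                    return -1;      /* the real code fails the bind */
            tb->in_use = 1;
            return tb->port;
    }

    int main(void)
    {
            struct bucket b = { .port = 8080, .in_use = 0 };
            printf("first bind:  %d\n", get_port(&b, ipv4_conflict));
            printf("second bind: %d\n", get_port(&b, ipv4_conflict));
            return 0;
    }
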
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cfd..c49908192047 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
 	r->idiag_inode = 0;
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
-		const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &tcp6tw->tw_v6_rcv_saddr);
+			       &tw6->tw_v6_rcv_saddr);
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &tcp6tw->tw_v6_daddr);
+			       &tw6->tw_v6_daddr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &tcp6_rsk(req)->loc_addr);
+			       &inet6_rsk(req)->loc_addr);
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &tcp6_rsk(req)->rmt_addr);
+			       &inet6_rsk(req)->rmt_addr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 		entry.saddr =
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 			(entry.family == AF_INET6) ?
-			tcp6_rsk(req)->loc_addr.s6_addr32 :
+			inet6_rsk(req)->loc_addr.s6_addr32 :
 #endif
 			&ireq->loc_addr;
 		entry.daddr =
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 			(entry.family == AF_INET6) ?
-			tcp6_rsk(req)->rmt_addr.s6_addr32 :
+			inet6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
 			&ireq->rmt_addr;
 		entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d2..33228115cda4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
+#include <net/ip.h>
 
 /*
  * Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
 }
 
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+				    struct sock *sk, __u16 lport,
+				    struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	u32 daddr = inet->rcv_saddr;
+	u32 saddr = inet->daddr;
+	int dif = sk->sk_bound_dev_if;
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+		tw = inet_twsk(sk2);
+
+		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->num = lport;
+	inet->sport = htons(lport);
+	sk->sk_hash = hash;
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
+					  inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+		      struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->num;
+	struct inet_bind_hashbucket *head;
+	struct inet_bind_bucket *tb;
+	int ret;
+
+	if (!snum) {
+		int low = sysctl_local_port_range[0];
+		int high = sysctl_local_port_range[1];
+		int range = high - low;
+		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + inet_sk_port_offset(sk);
+		struct hlist_node *node;
+		struct inet_timewait_sock *tw = NULL;
+
+		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+			spin_lock(&head->lock);
+
+			/* Does not bother with rcv_saddr checks,
+			 * because the established check is already
+			 * unique enough.
+			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+				if (tb->port == port) {
+					BUG_TRAP(!hlist_empty(&tb->owners));
+					if (tb->fastreuse >= 0)
+						goto next_port;
+					if (!__inet_check_established(death_row,
+								      sk, port,
+								      &tw))
+						goto ok;
+					goto next_port;
+				}
+			}
+
+			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				break;
+			}
+			tb->fastreuse = -1;
+			goto ok;
+
+		next_port:
+			spin_unlock(&head->lock);
+		}
+		local_bh_enable();
+
+		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+		/* Head lock still held and bh's disabled */
+		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+			inet_sk(sk)->sport = htons(port);
+			__inet_hash(hinfo, sk, 0);
+		}
+		spin_unlock(&head->lock);
+
+		if (tw) {
+			inet_twsk_deschedule(tw, death_row);
+			inet_twsk_put(tw);
+		}
+
+		ret = 0;
+		goto out;
+	}
+
+	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+	tb = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		__inet_hash(hinfo, sk, 0);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __inet_check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_hash_connect);
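
The ephemeral-port loop in inet_hash_connect() above starts each search at a per-destination secure offset plus a static hint that advances by the number of probes used, so consecutive connects neither scan from the same starting point nor collide predictably. A self-contained toy of just the search-order arithmetic (the "taken" test and hash value here are made up):

    #include <stdio.h>

    static unsigned int hint;   /* advances across calls, like the kernel's */

    static void pick_port(unsigned int dest_hash)
    {
            const int low = 1024, high = 4999;  /* sysctl_local_port_range */
            const int range = high - low;
            unsigned int offset = hint + dest_hash;
            int i;

            for (i = 1; i <= range; i++) {
                    int port = low + (i + offset) % range;
                    if (port % 7 != 0) {    /* pretend multiples of 7 are taken */
                            hint += i;
                            printf("dest %u -> port %d\n", dest_hash, port);
                            return;
                    }
            }
    }

    int main(void)
    {
            pick_port(12345u);
            pick_port(12345u);  /* hint has advanced: a different port */
            return 0;
    }
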
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a68811..417f126c749e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
 {
-	struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
-							 SLAB_ATOMIC);
+	struct inet_timewait_sock *tw =
+		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+				 SLAB_ATOMIC);
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
 		return NULL;
 	n->v4daddr = daddr;
 	atomic_set(&n->refcnt, 1);
+	atomic_set(&n->rid, 0);
 	n->ip_id_count = secure_ip_id(daddr);
 	n->tcp_ts_stamp = 0;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
  * Patrick McHardy : LRU queue of frag heads for evictor.
  */
 
+#include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -38,6 +39,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/checksum.h>
+#include <net/inetpeer.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/inet.h>
@@ -56,6 +58,8 @@
 int sysctl_ipfrag_high_thresh = 256*1024;
 int sysctl_ipfrag_low_thresh = 192*1024;
 
+int sysctl_ipfrag_max_dist = 64;
+
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
  * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
  */
@@ -89,8 +93,10 @@ struct ipq {
 	spinlock_t	lock;
 	atomic_t	refcnt;
 	struct timer_list timer;	/* when will this queue expire? */
-	int		iif;
 	struct timeval	stamp;
+	int		iif;
+	unsigned int	rid;
+	struct inet_peer *peer;
 };
 
 /* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
 	BUG_TRAP(qp->last_in&COMPLETE);
 	BUG_TRAP(del_timer(&qp->timer) == 0);
 
+	if (qp->peer)
+		inet_putpeer(qp->peer);
+
 	/* Release all fragment data. */
 	fp = qp->fragments;
 	while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	qp->meat = 0;
 	qp->fragments = NULL;
 	qp->iif = 0;
+	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
 
 	/* Initialize a timer for this entry. */
 	init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 	return ip_frag_create(hash, iph, user);
 }
 
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);
+	qp->rid = end;
+
+	rc = qp->fragments && (end - start) > max;
+
+	if (rc) {
+		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	}
+
+	return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+	struct sk_buff *fp;
+
+	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+		atomic_inc(&qp->refcnt);
+		return -ETIMEDOUT;
+	}
+
+	fp = qp->fragments;
+	do {
+		struct sk_buff *xp = fp->next;
+		frag_kfree_skb(fp, NULL);
+		fp = xp;
+	} while (fp);
+
+	qp->last_in = 0;
+	qp->len = 0;
+	qp->meat = 0;
+	qp->fragments = NULL;
+	qp->iif = 0;
+
+	return 0;
+}
+
 /* Add new segment to existing queue. */
 static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (qp->last_in & COMPLETE)
 		goto err;
 
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
 	offset = ntohs(skb->nh.iph->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
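
ip_frag_too_far() above measures fragment arrival "distance" as (end - start) > max on unsigned counters, which stays correct even when the per-peer rid counter wraps around; that is why rid is unsigned. A minimal demonstration of the wraparound behaviour:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t start = 0xfffffff0u;   /* rid recorded by this queue */
            uint32_t end = 0x00000010u;     /* peer counter has since wrapped */
            uint32_t max = 64;

            /* modular subtraction recovers the true distance of 32 */
            assert((uint32_t)(end - start) == 0x20u);
            printf("too far? %s\n", (uint32_t)(end - start) > max ? "yes" : "no");
            return 0;
    }
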
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0d..e45846ae570b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
 #include <linux/sockios.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b3..d3f6c468faf4 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
+#include <net/route.h>
 
 /*
  * Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 	hlen = iph->ihl * 4;
 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d87257309..6986e11d65cc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,12 +25,12 @@
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/icmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
-#include <net/tcp.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
 #include <linux/netfilter.h>
@@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			err = ip_options_get_from_user(&opt, optval, optlen);
 			if (err)
 				break;
-			if (sk->sk_type == SOCK_STREAM) {
-				struct tcp_sock *tp = tcp_sk(sk);
+			if (inet->is_icsk) {
+				struct inet_connection_sock *icsk = inet_csk(sk);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				if (sk->sk_family == PF_INET ||
 				    (!((1 << sk->sk_state) &
@@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				     inet->daddr != LOOPBACK4_IPV6)) {
 #endif
 					if (inet->opt)
-						tp->ext_header_len -= inet->opt->optlen;
+						icsk->icsk_ext_hdr_len -= inet->opt->optlen;
 					if (opt)
-						tp->ext_header_len += opt->optlen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+						icsk->icsk_ext_hdr_len += opt->optlen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				}
 #endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b40..d64e2ec8da7b 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
 #include <net/xfrm.h>
 #include <net/icmp.h>
 #include <net/ipcomp.h>
+#include <net/protocol.h>
 
 struct ipcomp_tfms {
 	struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d9..bb3613ec448c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
 #include <linux/in.h>
 #include <linux/if.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
@@ -58,6 +59,7 @@
 #include <net/arp.h>
 #include <net/ip.h>
 #include <net/ipconfig.h>
+#include <net/route.h>
 
 #include <asm/uaccess.h>
 #include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c9..caa3b7d2e48a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <net/route.h>
 #include <net/sock.h>
 #include <net/icmp.h>
 #include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c2..9b176a942ac5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
 }
 
 
-#if 0000
-/*
- *	Get reference to app by name (called from user context)
- */
-struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
-{
-	struct ip_vs_app *app, *a = NULL;
-
-	down(&__ip_vs_app_mutex);
-
-	list_for_each_entry(ent, &ip_vs_app_list, a_list) {
-		if (strcmp(app->name, appname))
-			continue;
-
-		/* softirq may call ip_vs_app_get too, so the caller
-		   must disable softirq on the current CPU */
-		if (ip_vs_app_get(app))
-			a = app;
-		break;
-	}
-
-	up(&__ip_vs_app_mutex);
-
-	return a;
-}
-#endif
-
-
 /*
  *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
  */
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c655..81d90354c928 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,10 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/net.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/proc_fs.h>	/* for proc_net_* */
 #include <linux/seq_file.h>
@@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
 	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
 		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 
-	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
   out:
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
 
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 	cp->flags |= atomic_read(&dest->conn_flags);
 	cp->dest = dest;
 
-	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+	IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 	if (!dest)
 		return;
 
-	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+	IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data)
 		ip_vs_conn_hash(cp);
 
   expire_later:
-	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
 		  atomic_read(&cp->refcnt)-1,
 		  atomic_read(&cp->n_control));
 
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a9..1aca94a9fd8b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 
 	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
-		  "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
 		  ip_vs_fwd_tag(cp),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760e..c935c5086d33 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip.h>
+#include <net/route.h>
 #include <net/sock.h>
 
 #include <asm/uaccess.h>
@@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
   out:
 	read_unlock(&__ip_vs_svc_lock);
 
-	IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+	IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
 		  fwmark, ip_vs_proto_name(protocol),
 		  NIPQUAD(vaddr), ntohs(vport),
 		  svc?"hit":"not hit");
@@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
 	 */
 	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
 		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
-			  "refcnt=%d\n",
+			  "dest->refcnt=%d\n",
 			  dest->vfwmark,
 			  NIPQUAD(dest->addr), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
@@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	dest = ip_vs_trash_get_dest(svc, daddr, dport);
 	if (dest != NULL) {
 		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
-			  "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
 			  NIPQUAD(daddr), ntohs(dport),
 			  atomic_read(&dest->refcnt),
 			  dest->vfwmark,
@@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 		atomic_dec(&dest->svc->refcnt);
 		kfree(dest);
 	} else {
-		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
+			  "dest->refcnt=%d\n",
 			  NIPQUAD(dest->addr), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
 		list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce93..9fee19c4c617 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa1..e7004741ac73 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
  * Changes:
  *
  */
+#include <linux/config.h>
 #include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 #include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa8..6e5cb92a5c83 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
  * me to write this module.
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
 }
 
 
-#if 0000
-/*
- *	Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
- *	returns bool success.
- */
-static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
-			     struct ip_vs_lblc_entry *en)
-{
-	if (list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	/*
-	 * Remove it from the table
-	 */
-	write_lock(&tbl->lock);
-	list_del(&en->list);
-	INIT_LIST_HEAD(&en->list);
-	write_unlock(&tbl->lock);
-
-	return 1;
-}
-#endif
-
-
 /*
  *	Get ip_vs_lblc_entry associated with supplied parameters.
  */
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a5..32ba37ba72d8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
 }
 
 
-#if 0000
-/*
- *	Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
- *	returns bool success.
- */
-static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
-			      struct ip_vs_lblcr_entry *en)
-{
-	if (list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	/*
-	 * Remove it from the table
-	 */
-	write_lock(&tbl->lock);
-	list_del(&en->list);
-	INIT_LIST_HEAD(&en->list);
-	write_unlock(&tbl->lock);
-
-	return 1;
-}
-#endif
-
-
 /*
  *	Get ip_vs_lblcr_entry associated with supplied parameters.
  */
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd7..8b0505b09317 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8e..c36ccf057a19 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215c..bc28b1160a3a 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_LAST]		=	2*HZ,
 };
 
-
-#if 0
-
-/* FIXME: This is going to die */
-
-static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
-	[IP_VS_TCP_S_NONE]		=	2*HZ,
-	[IP_VS_TCP_S_ESTABLISHED]	=	8*60*HZ,
-	[IP_VS_TCP_S_SYN_SENT]		=	60*HZ,
-	[IP_VS_TCP_S_SYN_RECV]		=	10*HZ,
-	[IP_VS_TCP_S_FIN_WAIT]		=	60*HZ,
-	[IP_VS_TCP_S_TIME_WAIT]		=	60*HZ,
-	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
-	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
-	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
-	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
-	[IP_VS_TCP_S_SYNACK]		=	100*HZ,
-	[IP_VS_TCP_S_LAST]		=	2*HZ,
-};
-
-#endif
-
 static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_NONE]		=	"NONE",
 	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
@@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	struct ip_vs_dest *dest = cp->dest;
 
 	IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
-		  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+		  "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
 		  pp->name,
 		  (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
 		  th->syn? 'S' : '.',
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aefa..89d9175d8f28 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a39..7775e6cc68be 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d8062..1bca714bda3d 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/inetdevice.h>
 #include <linux/net.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/skbuff.h>
 #include <linux/in.h>
 #include <linux/igmp.h>	/* for ip_mc_join_group */
+#include <linux/udp.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba6..bba156304695 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
 	unsigned int initial_entries;
 	unsigned int hook_entry[NF_ARP_NUMHOOKS];
 	unsigned int underflow[NF_ARP_NUMHOOKS];
-	char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(arpt_target);
 static LIST_HEAD(arpt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
 static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 				      char *hdr_addr, int len)
 {
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 	outdev = out ? out->name : nulldevname;
 
 	read_lock_bh(&table->lock);
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private,
-			       smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 	back = get_entry(table_base, table->private->underflow[hook]);
 
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
 /* Figures out from what hook each rule can be called: returns 0 if
  * there are loops. Puts hook bitmask in comefrom.
  */
-static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+static int mark_source_chains(struct arpt_table_info *newinfo,
+			      unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
 	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct arpt_entry *e
-			= (struct arpt_entry *)(newinfo->entries + pos);
+			= (struct arpt_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
 					goto next;
 
 				e = (struct arpt_entry *)
-					(newinfo->entries + pos);
+					(entry0 + pos);
 			} while (oldpos == pos + e->next_offset);
 
 			/* Move along one */
 			size = e->next_offset;
 			e = (struct arpt_entry *)
-				(newinfo->entries + pos + size);
+				(entry0 + pos + size);
 			e->counters.pcnt = pos;
 			pos += size;
 		} else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
537 newpos = pos + e->next_offset; 531 newpos = pos + e->next_offset;
538 } 532 }
539 e = (struct arpt_entry *) 533 e = (struct arpt_entry *)
540 (newinfo->entries + newpos); 534 (entry0 + newpos);
541 e->counters.pcnt = pos; 535 e->counters.pcnt = pos;
542 pos = newpos; 536 pos = newpos;
543 } 537 }
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
689static int translate_table(const char *name, 683static int translate_table(const char *name,
690 unsigned int valid_hooks, 684 unsigned int valid_hooks,
691 struct arpt_table_info *newinfo, 685 struct arpt_table_info *newinfo,
686 void *entry0,
692 unsigned int size, 687 unsigned int size,
693 unsigned int number, 688 unsigned int number,
694 const unsigned int *hook_entries, 689 const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
710 i = 0; 705 i = 0;
711 706
712 /* Walk through entries, checking offsets. */ 707 /* Walk through entries, checking offsets. */
713 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 708 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
714 check_entry_size_and_hooks, 709 check_entry_size_and_hooks,
715 newinfo, 710 newinfo,
716 newinfo->entries, 711 entry0,
717 newinfo->entries + size, 712 entry0 + size,
718 hook_entries, underflows, &i); 713 hook_entries, underflows, &i);
719 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); 714 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
720 if (ret != 0) 715 if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
743 } 738 }
744 } 739 }
745 740
746 if (!mark_source_chains(newinfo, valid_hooks)) { 741 if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
747 duprintf("Looping hook\n"); 742 duprintf("Looping hook\n");
748 return -ELOOP; 743 return -ELOOP;
749 } 744 }
750 745
751 /* Finally, each sanity check must pass */ 746 /* Finally, each sanity check must pass */
752 i = 0; 747 i = 0;
753 ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 748 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
754 check_entry, name, size, &i); 749 check_entry, name, size, &i);
755 750
756 if (ret != 0) { 751 if (ret != 0) {
757 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 752 ARPT_ENTRY_ITERATE(entry0, newinfo->size,
758 cleanup_entry, &i); 753 cleanup_entry, &i);
759 return ret; 754 return ret;
760 } 755 }
761 756
762 /* And one copy for every other CPU */ 757 /* And one copy for every other CPU */
763 for_each_cpu(i) { 758 for_each_cpu(i) {
764 if (i == 0) 759 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
765 continue; 760 memcpy(newinfo->entries[i], entry0, newinfo->size);
766 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
767 newinfo->entries,
768 SMP_ALIGN(newinfo->size));
769 } 761 }
770 762
771 return ret; 763 return ret;
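Note the changed replication guard: the translated master copy is entry0, the copy belonging to whichever CPU ran the translation, so the loop can no longer skip "CPU 0" and instead skips whichever slot is entry0 itself. A standalone sketch of that step (plain C, illustrative names):

#include <string.h>

/* Copy the translated rules from entry0 into every other per-CPU slot;
 * entries[] and ncpus mirror the table-info layout above. */
static void replicate_entries(void **entries, int ncpus,
                              const void *entry0, unsigned long size)
{
	int i;

	for (i = 0; i < ncpus; i++)
		if (entries[i] && entries[i] != entry0)
			memcpy(entries[i], entry0, size);
}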
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
807 return 0; 799 return 0;
808} 800}
809 801
802static inline int set_entry_to_counter(const struct arpt_entry *e,
803 struct arpt_counters total[],
804 unsigned int *i)
805{
806 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
807
808 (*i)++;
809 return 0;
810}
811
810static void get_counters(const struct arpt_table_info *t, 812static void get_counters(const struct arpt_table_info *t,
811 struct arpt_counters counters[]) 813 struct arpt_counters counters[])
812{ 814{
813 unsigned int cpu; 815 unsigned int cpu;
814 unsigned int i; 816 unsigned int i;
817 unsigned int curcpu;
818
819 /* Instead of clearing (by a previous call to memset())
820 * the counters and using adds, we set the counters
821	 * with the data from the 'current' CPU.
822	 * We don't care about preemption here.
823 */
824 curcpu = raw_smp_processor_id();
825
826 i = 0;
827 ARPT_ENTRY_ITERATE(t->entries[curcpu],
828 t->size,
829 set_entry_to_counter,
830 counters,
831 &i);
815 832
816 for_each_cpu(cpu) { 833 for_each_cpu(cpu) {
834 if (cpu == curcpu)
835 continue;
817 i = 0; 836 i = 0;
818 ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 837 ARPT_ENTRY_ITERATE(t->entries[cpu],
819 t->size, 838 t->size,
820 add_entry_to_counter, 839 add_entry_to_counter,
821 counters, 840 counters,
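The rewritten get_counters() drops the caller's memset(): the first pass over the current CPU's copy writes the totals with SET_COUNTER(), and only the remaining CPUs are accumulated with ADD_COUNTER(), saving one full write pass over a possibly large counter array. A self-contained sketch of the pattern (toy counter type and flat arrays; the kernel iterates rule entries instead):

struct counter { unsigned long long bcnt, pcnt; };

#define SET_COUNTER(c,b,p) do { (c).bcnt  = (b); (c).pcnt  = (p); } while (0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while (0)

/* total[] needs no prior memset(): the curcpu pass seeds every slot. */
static void snapshot(struct counter *total, unsigned long n,
                     struct counter **percpu, int ncpus, int curcpu)
{
	unsigned long i;
	int cpu;

	for (i = 0; i < n; i++)
		SET_COUNTER(total[i], percpu[curcpu][i].bcnt,
			    percpu[curcpu][i].pcnt);

	for (cpu = 0; cpu < ncpus; cpu++) {
		if (cpu == curcpu)
			continue;
		for (i = 0; i < n; i++)
			ADD_COUNTER(total[i], percpu[cpu][i].bcnt,
				    percpu[cpu][i].pcnt);
	}
}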
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
831 struct arpt_entry *e; 850 struct arpt_entry *e;
832 struct arpt_counters *counters; 851 struct arpt_counters *counters;
833 int ret = 0; 852 int ret = 0;
853 void *loc_cpu_entry;
834 854
835 /* We need atomic snapshot of counters: rest doesn't change 855 /* We need atomic snapshot of counters: rest doesn't change
836 * (other than comefrom, which userspace doesn't care 856 * (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
843 return -ENOMEM; 863 return -ENOMEM;
844 864
845 /* First, sum counters... */ 865 /* First, sum counters... */
846 memset(counters, 0, countersize);
847 write_lock_bh(&table->lock); 866 write_lock_bh(&table->lock);
848 get_counters(table->private, counters); 867 get_counters(table->private, counters);
849 write_unlock_bh(&table->lock); 868 write_unlock_bh(&table->lock);
850 869
851 /* ... then copy entire thing from CPU 0... */ 870 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
852 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 871 /* ... then copy entire thing ... */
872 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
853 ret = -EFAULT; 873 ret = -EFAULT;
854 goto free_counters; 874 goto free_counters;
855 } 875 }
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
859 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 879 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
860 struct arpt_entry_target *t; 880 struct arpt_entry_target *t;
861 881
862 e = (struct arpt_entry *)(table->private->entries + off); 882 e = (struct arpt_entry *)(loc_cpu_entry + off);
863 if (copy_to_user(userptr + off 883 if (copy_to_user(userptr + off
864 + offsetof(struct arpt_entry, counters), 884 + offsetof(struct arpt_entry, counters),
865 &counters[num], 885 &counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
911 return ret; 931 return ret;
912} 932}
913 933
934static void free_table_info(struct arpt_table_info *info)
935{
936 int cpu;
937 for_each_cpu(cpu) {
938 if (info->size <= PAGE_SIZE)
939 kfree(info->entries[cpu]);
940 else
941 vfree(info->entries[cpu]);
942 }
943 kfree(info);
944}
945
946static struct arpt_table_info *alloc_table_info(unsigned int size)
947{
948 struct arpt_table_info *newinfo;
949 int cpu;
950
951 newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
952 if (!newinfo)
953 return NULL;
954
955 newinfo->size = size;
956
957 for_each_cpu(cpu) {
958 if (size <= PAGE_SIZE)
959 newinfo->entries[cpu] = kmalloc_node(size,
960 GFP_KERNEL,
961 cpu_to_node(cpu));
962 else
963 newinfo->entries[cpu] = vmalloc_node(size,
964 cpu_to_node(cpu));
965
966 if (newinfo->entries[cpu] == NULL) {
967 free_table_info(newinfo);
968 return NULL;
969 }
970 }
971
972 return newinfo;
973}
974
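alloc_table_info() places each CPU's copy on that CPU's NUMA node and picks the allocator by size: kmalloc_node() for blobs up to a page, vmalloc_node() above that, with free_table_info() mirroring the same split. A userspace analogue of the ownership and unwind logic only (malloc() stands in for both kernel allocators; the size split and node affinity are kernel-side details not modeled here):

#include <stdlib.h>

#define NCPUS 2

struct toy_info { unsigned long size; void *entries[NCPUS]; };

static void toy_free(struct toy_info *info)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		free(info->entries[cpu]);	/* free(NULL) is a no-op */
	free(info);
}

static struct toy_info *toy_alloc(unsigned long size)
{
	struct toy_info *info = calloc(1, sizeof(*info));
	int cpu;

	if (!info)
		return NULL;
	info->size = size;
	for (cpu = 0; cpu < NCPUS; cpu++) {
		info->entries[cpu] = malloc(size);
		if (!info->entries[cpu]) {
			toy_free(info);		/* unwind partial allocation */
			return NULL;
		}
	}
	return info;
}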
914static int do_replace(void __user *user, unsigned int len) 975static int do_replace(void __user *user, unsigned int len)
915{ 976{
916 int ret; 977 int ret;
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
918 struct arpt_table *t; 979 struct arpt_table *t;
919 struct arpt_table_info *newinfo, *oldinfo; 980 struct arpt_table_info *newinfo, *oldinfo;
920 struct arpt_counters *counters; 981 struct arpt_counters *counters;
982 void *loc_cpu_entry, *loc_cpu_old_entry;
921 983
922 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 984 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
923 return -EFAULT; 985 return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
930 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 992 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
931 return -ENOMEM; 993 return -ENOMEM;
932 994
933 newinfo = vmalloc(sizeof(struct arpt_table_info) 995 newinfo = alloc_table_info(tmp.size);
934 + SMP_ALIGN(tmp.size) *
935 (highest_possible_processor_id()+1));
936 if (!newinfo) 996 if (!newinfo)
937 return -ENOMEM; 997 return -ENOMEM;
938 998
939 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 999 /* choose the copy that is on our node/cpu */
1000 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1001 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
940 tmp.size) != 0) { 1002 tmp.size) != 0) {
941 ret = -EFAULT; 1003 ret = -EFAULT;
942 goto free_newinfo; 1004 goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
947 ret = -ENOMEM; 1009 ret = -ENOMEM;
948 goto free_newinfo; 1010 goto free_newinfo;
949 } 1011 }
950 memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
951 1012
952 ret = translate_table(tmp.name, tmp.valid_hooks, 1013 ret = translate_table(tmp.name, tmp.valid_hooks,
953 newinfo, tmp.size, tmp.num_entries, 1014 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
954 tmp.hook_entry, tmp.underflow); 1015 tmp.hook_entry, tmp.underflow);
955 if (ret != 0) 1016 if (ret != 0)
956 goto free_newinfo_counters; 1017 goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
989 /* Get the old counters. */ 1050 /* Get the old counters. */
990 get_counters(oldinfo, counters); 1051 get_counters(oldinfo, counters);
991 /* Decrease module usage counts and free resource */ 1052 /* Decrease module usage counts and free resource */
992 ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1053 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
993 vfree(oldinfo); 1054 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1055
1056 free_table_info(oldinfo);
994 if (copy_to_user(tmp.counters, counters, 1057 if (copy_to_user(tmp.counters, counters,
995 sizeof(struct arpt_counters) * tmp.num_counters) != 0) 1058 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
996 ret = -EFAULT; 1059 ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
1002 module_put(t->me); 1065 module_put(t->me);
1003 up(&arpt_mutex); 1066 up(&arpt_mutex);
1004 free_newinfo_counters_untrans: 1067 free_newinfo_counters_untrans:
1005 ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); 1068 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
1006 free_newinfo_counters: 1069 free_newinfo_counters:
1007 vfree(counters); 1070 vfree(counters);
1008 free_newinfo: 1071 free_newinfo:
1009 vfree(newinfo); 1072 free_table_info(newinfo);
1010 return ret; 1073 return ret;
1011} 1074}
1012 1075
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
1030 struct arpt_counters_info tmp, *paddc; 1093 struct arpt_counters_info tmp, *paddc;
1031 struct arpt_table *t; 1094 struct arpt_table *t;
1032 int ret = 0; 1095 int ret = 0;
1096 void *loc_cpu_entry;
1033 1097
1034 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1098 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1035 return -EFAULT; 1099 return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
1059 } 1123 }
1060 1124
1061 i = 0; 1125 i = 0;
1062 ARPT_ENTRY_ITERATE(t->private->entries, 1126 /* Choose the copy that is on our node */
1127 loc_cpu_entry = t->private->entries[smp_processor_id()];
1128 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1063 t->private->size, 1129 t->private->size,
1064 add_counter_to_entry, 1130 add_counter_to_entry,
1065 paddc->counters, 1131 paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
1220 struct arpt_table_info *newinfo; 1286 struct arpt_table_info *newinfo;
1221 static struct arpt_table_info bootstrap 1287 static struct arpt_table_info bootstrap
1222 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1288 = { 0, 0, 0, { 0 }, { 0 }, { } };
1289 void *loc_cpu_entry;
1223 1290
1224 newinfo = vmalloc(sizeof(struct arpt_table_info) 1291 newinfo = alloc_table_info(repl->size);
1225 + SMP_ALIGN(repl->size) *
1226 (highest_possible_processor_id()+1));
1227 if (!newinfo) { 1292 if (!newinfo) {
1228 ret = -ENOMEM; 1293 ret = -ENOMEM;
1229 return ret; 1294 return ret;
1230 } 1295 }
1231 memcpy(newinfo->entries, repl->entries, repl->size); 1296
1297 /* choose the copy on our node/cpu */
1298 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1299 memcpy(loc_cpu_entry, repl->entries, repl->size);
1232 1300
1233 ret = translate_table(table->name, table->valid_hooks, 1301 ret = translate_table(table->name, table->valid_hooks,
1234 newinfo, repl->size, 1302 newinfo, loc_cpu_entry, repl->size,
1235 repl->num_entries, 1303 repl->num_entries,
1236 repl->hook_entry, 1304 repl->hook_entry,
1237 repl->underflow); 1305 repl->underflow);
1238 duprintf("arpt_register_table: translate table gives %d\n", ret); 1306 duprintf("arpt_register_table: translate table gives %d\n", ret);
1239 if (ret != 0) { 1307 if (ret != 0) {
1240 vfree(newinfo); 1308 free_table_info(newinfo);
1241 return ret; 1309 return ret;
1242 } 1310 }
1243 1311
1244 ret = down_interruptible(&arpt_mutex); 1312 ret = down_interruptible(&arpt_mutex);
1245 if (ret != 0) { 1313 if (ret != 0) {
1246 vfree(newinfo); 1314 free_table_info(newinfo);
1247 return ret; 1315 return ret;
1248 } 1316 }
1249 1317
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
1272 return ret; 1340 return ret;
1273 1341
1274 free_unlock: 1342 free_unlock:
1275 vfree(newinfo); 1343 free_table_info(newinfo);
1276 goto unlock; 1344 goto unlock;
1277} 1345}
1278 1346
1279void arpt_unregister_table(struct arpt_table *table) 1347void arpt_unregister_table(struct arpt_table *table)
1280{ 1348{
1349 void *loc_cpu_entry;
1350
1281 down(&arpt_mutex); 1351 down(&arpt_mutex);
1282 LIST_DELETE(&arpt_tables, table); 1352 LIST_DELETE(&arpt_tables, table);
1283 up(&arpt_mutex); 1353 up(&arpt_mutex);
1284 1354
1285 /* Decrease module usage counts and free resources */ 1355 /* Decrease module usage counts and free resources */
1286 ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1356 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1357 ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1287 cleanup_entry, NULL); 1358 cleanup_entry, NULL);
1288 vfree(table->private); 1359 free_table_info(table->private);
1289} 1360}
1290 1361
1291/* The built-in targets: standard (NULL) and error. */ 1362/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f5..0366eedb4d70 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
18 * 18 *
19 */ 19 */
20 20
21#include <linux/in.h>
21#include <linux/kernel.h> 22#include <linux/kernel.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/netfilter.h> 24#include <linux/netfilter.h>
24#include <linux/ip.h> 25#include <linux/ip.h>
25#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
27#include <linux/udp.h>
26#include <net/checksum.h> 28#include <net/checksum.h>
27#include <net/udp.h> 29#include <net/udp.h>
28 30
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377a..57956dee60c8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
31#include <linux/ip.h> 31#include <linux/ip.h>
32#include <linux/in.h> 32#include <linux/in.h>
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/seq_file.h>
34 35
35static DEFINE_RWLOCK(ip_ct_gre_lock); 36static DEFINE_RWLOCK(ip_ct_gre_lock);
36#define ASSERT_READ_LOCK(x) 37#define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c7660..46becbe4fe58 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
11#include <linux/timer.h> 11#include <linux/timer.h>
12#include <linux/netfilter.h> 12#include <linux/netfilter.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/ip.h>
14#include <linux/udp.h> 15#include <linux/udp.h>
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <net/checksum.h> 17#include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4b..a88bcc551244 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
27#endif 27#endif
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30#include <net/route.h>
30 31
31#define ASSERT_READ_LOCK(x) 32#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) 33#define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b47..4f95d477805c 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
44 * 44 *
45 */ 45 */
46#include <linux/config.h> 46#include <linux/config.h>
47#include <linux/in.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/types.h> 49#include <linux/types.h>
49#include <linux/kernel.h> 50#include <linux/kernel.h>
@@ -53,6 +54,7 @@
53#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 54#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
54#include <linux/netfilter_ipv4/ip_nat_helper.h> 55#include <linux/netfilter_ipv4/ip_nat_helper.h>
55#include <linux/ip.h> 56#include <linux/ip.h>
57#include <linux/udp.h>
56#include <net/checksum.h> 58#include <net/checksum.h>
57#include <net/udp.h> 59#include <net/udp.h>
58#include <asm/uaccess.h> 60#include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e8..2a26d167e149 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
83 context stops packets coming through and allows user context to read 83 context stops packets coming through and allows user context to read
84 the counters or update the rules. 84 the counters or update the rules.
85 85
86 To be cache friendly on SMP, we arrange them like so:
87 [ n-entries ]
88 ... cache-align padding ...
89 [ n-entries ]
90
91 Hence the start of any table is given by get_table() below. */ 86 Hence the start of any table is given by get_table() below. */
92 87
93/* The table itself */ 88/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
105 unsigned int underflow[NF_IP_NUMHOOKS]; 100 unsigned int underflow[NF_IP_NUMHOOKS];
106 101
107 /* ipt_entry tables: one per CPU */ 102 /* ipt_entry tables: one per CPU */
108 char entries[0] ____cacheline_aligned; 103 void *entries[NR_CPUS];
109}; 104};
110 105
111static LIST_HEAD(ipt_target); 106static LIST_HEAD(ipt_target);
112static LIST_HEAD(ipt_match); 107static LIST_HEAD(ipt_match);
113static LIST_HEAD(ipt_tables); 108static LIST_HEAD(ipt_tables);
109#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
114#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) 110#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
115 111
116#ifdef CONFIG_SMP
117#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
118#else
119#define TABLE_OFFSET(t,p) 0
120#endif
121
122#if 0 112#if 0
123#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) 113#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
124#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) 114#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
290 280
291 read_lock_bh(&table->lock); 281 read_lock_bh(&table->lock);
292 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 282 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
293 table_base = (void *)table->private->entries 283 table_base = (void *)table->private->entries[smp_processor_id()];
294 + TABLE_OFFSET(table->private, smp_processor_id());
295 e = get_entry(table_base, table->private->hook_entry[hook]); 284 e = get_entry(table_base, table->private->hook_entry[hook]);
296 285
297#ifdef CONFIG_NETFILTER_DEBUG 286#ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
563/* Figures out from what hook each rule can be called: returns 0 if 552/* Figures out from what hook each rule can be called: returns 0 if
564 there are loops. Puts hook bitmask in comefrom. */ 553 there are loops. Puts hook bitmask in comefrom. */
565static int 554static int
566mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) 555mark_source_chains(struct ipt_table_info *newinfo,
556 unsigned int valid_hooks, void *entry0)
567{ 557{
568 unsigned int hook; 558 unsigned int hook;
569 559
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
572 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { 562 for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
573 unsigned int pos = newinfo->hook_entry[hook]; 563 unsigned int pos = newinfo->hook_entry[hook];
574 struct ipt_entry *e 564 struct ipt_entry *e
575 = (struct ipt_entry *)(newinfo->entries + pos); 565 = (struct ipt_entry *)(entry0 + pos);
576 566
577 if (!(valid_hooks & (1 << hook))) 567 if (!(valid_hooks & (1 << hook)))
578 continue; 568 continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
622 goto next; 612 goto next;
623 613
624 e = (struct ipt_entry *) 614 e = (struct ipt_entry *)
625 (newinfo->entries + pos); 615 (entry0 + pos);
626 } while (oldpos == pos + e->next_offset); 616 } while (oldpos == pos + e->next_offset);
627 617
628 /* Move along one */ 618 /* Move along one */
629 size = e->next_offset; 619 size = e->next_offset;
630 e = (struct ipt_entry *) 620 e = (struct ipt_entry *)
631 (newinfo->entries + pos + size); 621 (entry0 + pos + size);
632 e->counters.pcnt = pos; 622 e->counters.pcnt = pos;
633 pos += size; 623 pos += size;
634 } else { 624 } else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
645 newpos = pos + e->next_offset; 635 newpos = pos + e->next_offset;
646 } 636 }
647 e = (struct ipt_entry *) 637 e = (struct ipt_entry *)
648 (newinfo->entries + newpos); 638 (entry0 + newpos);
649 e->counters.pcnt = pos; 639 e->counters.pcnt = pos;
650 pos = newpos; 640 pos = newpos;
651 } 641 }
@@ -855,6 +845,7 @@ static int
855translate_table(const char *name, 845translate_table(const char *name,
856 unsigned int valid_hooks, 846 unsigned int valid_hooks,
857 struct ipt_table_info *newinfo, 847 struct ipt_table_info *newinfo,
848 void *entry0,
858 unsigned int size, 849 unsigned int size,
859 unsigned int number, 850 unsigned int number,
860 const unsigned int *hook_entries, 851 const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
875 duprintf("translate_table: size %u\n", newinfo->size); 866 duprintf("translate_table: size %u\n", newinfo->size);
876 i = 0; 867 i = 0;
877 /* Walk through entries, checking offsets. */ 868 /* Walk through entries, checking offsets. */
878 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 869 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
879 check_entry_size_and_hooks, 870 check_entry_size_and_hooks,
880 newinfo, 871 newinfo,
881 newinfo->entries, 872 entry0,
882 newinfo->entries + size, 873 entry0 + size,
883 hook_entries, underflows, &i); 874 hook_entries, underflows, &i);
884 if (ret != 0) 875 if (ret != 0)
885 return ret; 876 return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
907 } 898 }
908 } 899 }
909 900
910 if (!mark_source_chains(newinfo, valid_hooks)) 901 if (!mark_source_chains(newinfo, valid_hooks, entry0))
911 return -ELOOP; 902 return -ELOOP;
912 903
913 /* Finally, each sanity check must pass */ 904 /* Finally, each sanity check must pass */
914 i = 0; 905 i = 0;
915 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 906 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
916 check_entry, name, size, &i); 907 check_entry, name, size, &i);
917 908
918 if (ret != 0) { 909 if (ret != 0) {
919 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, 910 IPT_ENTRY_ITERATE(entry0, newinfo->size,
920 cleanup_entry, &i); 911 cleanup_entry, &i);
921 return ret; 912 return ret;
922 } 913 }
923 914
924 /* And one copy for every other CPU */ 915 /* And one copy for every other CPU */
925 for_each_cpu(i) { 916 for_each_cpu(i) {
926 if (i == 0) 917 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
927 continue; 918 memcpy(newinfo->entries[i], entry0, newinfo->size);
928 memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
929 newinfo->entries,
930 SMP_ALIGN(newinfo->size));
931 } 919 }
932 920
933 return ret; 921 return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
943 931
944#ifdef CONFIG_NETFILTER_DEBUG 932#ifdef CONFIG_NETFILTER_DEBUG
945 { 933 {
946 struct ipt_entry *table_base; 934 int cpu;
947 unsigned int i;
948 935
949 for_each_cpu(i) { 936 for_each_cpu(cpu) {
950 table_base = 937 struct ipt_entry *table_base = newinfo->entries[cpu];
951 (void *)newinfo->entries 938 if (table_base)
952 + TABLE_OFFSET(newinfo, i); 939 table_base->comefrom = 0xdead57ac;
953
954 table_base->comefrom = 0xdead57ac;
955 } 940 }
956 } 941 }
957#endif 942#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
986 return 0; 971 return 0;
987} 972}
988 973
974static inline int
975set_entry_to_counter(const struct ipt_entry *e,
976 struct ipt_counters total[],
977 unsigned int *i)
978{
979 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
980
981 (*i)++;
982 return 0;
983}
984
989static void 985static void
990get_counters(const struct ipt_table_info *t, 986get_counters(const struct ipt_table_info *t,
991 struct ipt_counters counters[]) 987 struct ipt_counters counters[])
992{ 988{
993 unsigned int cpu; 989 unsigned int cpu;
994 unsigned int i; 990 unsigned int i;
991 unsigned int curcpu;
992
993 /* Instead of clearing (by a previous call to memset())
994 * the counters and using adds, we set the counters
995 * with data used by 'current' CPU
996 * We dont care about preemption here.
997 */
998 curcpu = raw_smp_processor_id();
999
1000 i = 0;
1001 IPT_ENTRY_ITERATE(t->entries[curcpu],
1002 t->size,
1003 set_entry_to_counter,
1004 counters,
1005 &i);
995 1006
996 for_each_cpu(cpu) { 1007 for_each_cpu(cpu) {
1008 if (cpu == curcpu)
1009 continue;
997 i = 0; 1010 i = 0;
998 IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), 1011 IPT_ENTRY_ITERATE(t->entries[cpu],
999 t->size, 1012 t->size,
1000 add_entry_to_counter, 1013 add_entry_to_counter,
1001 counters, 1014 counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
1012 struct ipt_entry *e; 1025 struct ipt_entry *e;
1013 struct ipt_counters *counters; 1026 struct ipt_counters *counters;
1014 int ret = 0; 1027 int ret = 0;
1028 void *loc_cpu_entry;
1015 1029
1016 /* We need atomic snapshot of counters: rest doesn't change 1030 /* We need atomic snapshot of counters: rest doesn't change
1017 (other than comefrom, which userspace doesn't care 1031 (other than comefrom, which userspace doesn't care
1018 about). */ 1032 about). */
1019 countersize = sizeof(struct ipt_counters) * table->private->number; 1033 countersize = sizeof(struct ipt_counters) * table->private->number;
1020 counters = vmalloc(countersize); 1034 counters = vmalloc_node(countersize, numa_node_id());
1021 1035
1022 if (counters == NULL) 1036 if (counters == NULL)
1023 return -ENOMEM; 1037 return -ENOMEM;
1024 1038
1025 /* First, sum counters... */ 1039 /* First, sum counters... */
1026 memset(counters, 0, countersize);
1027 write_lock_bh(&table->lock); 1040 write_lock_bh(&table->lock);
1028 get_counters(table->private, counters); 1041 get_counters(table->private, counters);
1029 write_unlock_bh(&table->lock); 1042 write_unlock_bh(&table->lock);
1030 1043
1031 /* ... then copy entire thing from CPU 0... */ 1044 /* choose the copy that is on our node/cpu, ...
1032 if (copy_to_user(userptr, table->private->entries, total_size) != 0) { 1045 * This choice is lazy (because current thread is
1046 * allowed to migrate to another cpu)
1047 */
1048 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1049 /* ... then copy entire thing ... */
1050 if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
1033 ret = -EFAULT; 1051 ret = -EFAULT;
1034 goto free_counters; 1052 goto free_counters;
1035 } 1053 }
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
1041 struct ipt_entry_match *m; 1059 struct ipt_entry_match *m;
1042 struct ipt_entry_target *t; 1060 struct ipt_entry_target *t;
1043 1061
1044 e = (struct ipt_entry *)(table->private->entries + off); 1062 e = (struct ipt_entry *)(loc_cpu_entry + off);
1045 if (copy_to_user(userptr + off 1063 if (copy_to_user(userptr + off
1046 + offsetof(struct ipt_entry, counters), 1064 + offsetof(struct ipt_entry, counters),
1047 &counters[num], 1065 &counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
1110 return ret; 1128 return ret;
1111} 1129}
1112 1130
1131static void free_table_info(struct ipt_table_info *info)
1132{
1133 int cpu;
1134 for_each_cpu(cpu) {
1135 if (info->size <= PAGE_SIZE)
1136 kfree(info->entries[cpu]);
1137 else
1138 vfree(info->entries[cpu]);
1139 }
1140 kfree(info);
1141}
1142
1143static struct ipt_table_info *alloc_table_info(unsigned int size)
1144{
1145 struct ipt_table_info *newinfo;
1146 int cpu;
1147
1148 newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
1149 if (!newinfo)
1150 return NULL;
1151
1152 newinfo->size = size;
1153
1154 for_each_cpu(cpu) {
1155 if (size <= PAGE_SIZE)
1156 newinfo->entries[cpu] = kmalloc_node(size,
1157 GFP_KERNEL,
1158 cpu_to_node(cpu));
1159 else
1160 newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
1161 if (newinfo->entries[cpu] == NULL) {
1162 free_table_info(newinfo);
1163 return NULL;
1164 }
1165 }
1166
1167 return newinfo;
1168}
1169
1113static int 1170static int
1114do_replace(void __user *user, unsigned int len) 1171do_replace(void __user *user, unsigned int len)
1115{ 1172{
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
1118 struct ipt_table *t; 1175 struct ipt_table *t;
1119 struct ipt_table_info *newinfo, *oldinfo; 1176 struct ipt_table_info *newinfo, *oldinfo;
1120 struct ipt_counters *counters; 1177 struct ipt_counters *counters;
1178 void *loc_cpu_entry, *loc_cpu_old_entry;
1121 1179
1122 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1180 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1123 return -EFAULT; 1181 return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
1130 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) 1188 if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
1131 return -ENOMEM; 1189 return -ENOMEM;
1132 1190
1133 newinfo = vmalloc(sizeof(struct ipt_table_info) 1191 newinfo = alloc_table_info(tmp.size);
1134 + SMP_ALIGN(tmp.size) *
1135 (highest_possible_processor_id()+1));
1136 if (!newinfo) 1192 if (!newinfo)
1137 return -ENOMEM; 1193 return -ENOMEM;
1138 1194
1139 if (copy_from_user(newinfo->entries, user + sizeof(tmp), 1195 /* choose the copy that is our node/cpu */
1196 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1197 if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
1140 tmp.size) != 0) { 1198 tmp.size) != 0) {
1141 ret = -EFAULT; 1199 ret = -EFAULT;
1142 goto free_newinfo; 1200 goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
1147 ret = -ENOMEM; 1205 ret = -ENOMEM;
1148 goto free_newinfo; 1206 goto free_newinfo;
1149 } 1207 }
1150 memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
1151 1208
1152 ret = translate_table(tmp.name, tmp.valid_hooks, 1209 ret = translate_table(tmp.name, tmp.valid_hooks,
1153 newinfo, tmp.size, tmp.num_entries, 1210 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1154 tmp.hook_entry, tmp.underflow); 1211 tmp.hook_entry, tmp.underflow);
1155 if (ret != 0) 1212 if (ret != 0)
1156 goto free_newinfo_counters; 1213 goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
1189 /* Get the old counters. */ 1246 /* Get the old counters. */
1190 get_counters(oldinfo, counters); 1247 get_counters(oldinfo, counters);
1191 /* Decrease module usage counts and free resource */ 1248 /* Decrease module usage counts and free resource */
1192 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); 1249 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1193 vfree(oldinfo); 1250 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
1251 free_table_info(oldinfo);
1194 if (copy_to_user(tmp.counters, counters, 1252 if (copy_to_user(tmp.counters, counters,
1195 sizeof(struct ipt_counters) * tmp.num_counters) != 0) 1253 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
1196 ret = -EFAULT; 1254 ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
1202 module_put(t->me); 1260 module_put(t->me);
1203 up(&ipt_mutex); 1261 up(&ipt_mutex);
1204 free_newinfo_counters_untrans: 1262 free_newinfo_counters_untrans:
1205 IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); 1263 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1206 free_newinfo_counters: 1264 free_newinfo_counters:
1207 vfree(counters); 1265 vfree(counters);
1208 free_newinfo: 1266 free_newinfo:
1209 vfree(newinfo); 1267 free_table_info(newinfo);
1210 return ret; 1268 return ret;
1211} 1269}
1212 1270
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
1239 struct ipt_counters_info tmp, *paddc; 1297 struct ipt_counters_info tmp, *paddc;
1240 struct ipt_table *t; 1298 struct ipt_table *t;
1241 int ret = 0; 1299 int ret = 0;
1300 void *loc_cpu_entry;
1242 1301
1243 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1302 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1244 return -EFAULT; 1303 return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
1246 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) 1305 if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
1247 return -EINVAL; 1306 return -EINVAL;
1248 1307
1249 paddc = vmalloc(len); 1308 paddc = vmalloc_node(len, numa_node_id());
1250 if (!paddc) 1309 if (!paddc)
1251 return -ENOMEM; 1310 return -ENOMEM;
1252 1311
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
1268 } 1327 }
1269 1328
1270 i = 0; 1329 i = 0;
1271 IPT_ENTRY_ITERATE(t->private->entries, 1330 /* Choose the copy that is on our node */
1331 loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
1332 IPT_ENTRY_ITERATE(loc_cpu_entry,
1272 t->private->size, 1333 t->private->size,
1273 add_counter_to_entry, 1334 add_counter_to_entry,
1274 paddc->counters, 1335 paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1460 struct ipt_table_info *newinfo; 1521 struct ipt_table_info *newinfo;
1461 static struct ipt_table_info bootstrap 1522 static struct ipt_table_info bootstrap
1462 = { 0, 0, 0, { 0 }, { 0 }, { } }; 1523 = { 0, 0, 0, { 0 }, { 0 }, { } };
1524 void *loc_cpu_entry;
1463 1525
1464 newinfo = vmalloc(sizeof(struct ipt_table_info) 1526 newinfo = alloc_table_info(repl->size);
1465 + SMP_ALIGN(repl->size) *
1466 (highest_possible_processor_id()+1));
1467 if (!newinfo) 1527 if (!newinfo)
1468 return -ENOMEM; 1528 return -ENOMEM;
1469 1529
1470 memcpy(newinfo->entries, repl->entries, repl->size); 1530 /* choose the copy on our node/cpu
1531 * but don't care about preemption
1532 */
1533 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1534 memcpy(loc_cpu_entry, repl->entries, repl->size);
1471 1535
1472 ret = translate_table(table->name, table->valid_hooks, 1536 ret = translate_table(table->name, table->valid_hooks,
1473 newinfo, repl->size, 1537 newinfo, loc_cpu_entry, repl->size,
1474 repl->num_entries, 1538 repl->num_entries,
1475 repl->hook_entry, 1539 repl->hook_entry,
1476 repl->underflow); 1540 repl->underflow);
1477 if (ret != 0) { 1541 if (ret != 0) {
1478 vfree(newinfo); 1542 free_table_info(newinfo);
1479 return ret; 1543 return ret;
1480 } 1544 }
1481 1545
1482 ret = down_interruptible(&ipt_mutex); 1546 ret = down_interruptible(&ipt_mutex);
1483 if (ret != 0) { 1547 if (ret != 0) {
1484 vfree(newinfo); 1548 free_table_info(newinfo);
1485 return ret; 1549 return ret;
1486 } 1550 }
1487 1551
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
1510 return ret; 1574 return ret;
1511 1575
1512 free_unlock: 1576 free_unlock:
1513 vfree(newinfo); 1577 free_table_info(newinfo);
1514 goto unlock; 1578 goto unlock;
1515} 1579}
1516 1580
1517void ipt_unregister_table(struct ipt_table *table) 1581void ipt_unregister_table(struct ipt_table *table)
1518{ 1582{
1583 void *loc_cpu_entry;
1584
1519 down(&ipt_mutex); 1585 down(&ipt_mutex);
1520 LIST_DELETE(&ipt_tables, table); 1586 LIST_DELETE(&ipt_tables, table);
1521 up(&ipt_mutex); 1587 up(&ipt_mutex);
1522 1588
1523 /* Decrease module usage counts and free resources */ 1589 /* Decrease module usage counts and free resources */
1524 IPT_ENTRY_ITERATE(table->private->entries, table->private->size, 1590 loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
1591 IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
1525 cleanup_entry, NULL); 1592 cleanup_entry, NULL);
1526 vfree(table->private); 1593 free_table_info(table->private);
1527} 1594}
1528 1595
1529/* Returns 1 if the port is matched by the range, 0 otherwise */ 1596/* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe6..27860510ca6d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/inetdevice.h>
14#include <linux/ip.h> 15#include <linux/ip.h>
15#include <linux/timer.h> 16#include <linux/timer.h>
16#include <linux/module.h> 17#include <linux/module.h>
@@ -18,6 +19,7 @@
18#include <net/protocol.h> 19#include <net/protocol.h>
19#include <net/ip.h> 20#include <net/ip.h>
20#include <net/checksum.h> 21#include <net/checksum.h>
22#include <net/route.h>
21#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
22#include <linux/netfilter_ipv4/ip_nat_rule.h> 24#include <linux/netfilter_ipv4/ip_nat_rule.h>
23#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041fc..03f554857a4d 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/netdevice.h>
12#include <linux/skbuff.h> 13#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ipt_physdev.h> 14#include <linux/netfilter_ipv4/ipt_physdev.h>
14#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db46..39d49dc333a7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
38#include <net/protocol.h> 38#include <net/protocol.h>
39#include <net/tcp.h> 39#include <net/tcp.h>
40#include <net/udp.h> 40#include <net/udp.h>
41#include <linux/inetdevice.h>
41#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
42#include <linux/seq_file.h> 43#include <linux/seq_file.h>
43#include <net/sock.h> 44#include <net/sock.h>
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a1..e20be3331f67 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
173 struct request_sock *req, 173 struct request_sock *req,
174 struct dst_entry *dst) 174 struct dst_entry *dst)
175{ 175{
176 struct tcp_sock *tp = tcp_sk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
177 struct sock *child; 177 struct sock *child;
178 178
179 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); 179 child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
180 if (child) 180 if (child)
181 inet_csk_reqsk_queue_add(sk, req, child); 181 inet_csk_reqsk_queue_add(sk, req, child);
182 else 182 else
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..16984d4a8a06 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/sysctl.h> 12#include <linux/sysctl.h>
13#include <linux/config.h> 13#include <linux/config.h>
14#include <linux/igmp.h> 14#include <linux/igmp.h>
15#include <linux/inetdevice.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -22,6 +23,7 @@
22extern int sysctl_ip_nonlocal_bind; 23extern int sysctl_ip_nonlocal_bind;
23 24
24#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
26static int zero;
25static int tcp_retr1_max = 255; 27static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 28static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 29static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +616,15 @@ ctl_table ipv4_table[] = {
614 .strategy = &sysctl_jiffies 616 .strategy = &sysctl_jiffies
615 }, 617 },
616 { 618 {
619 .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
620 .procname = "ipfrag_max_dist",
621 .data = &sysctl_ipfrag_max_dist,
622 .maxlen = sizeof(int),
623 .mode = 0644,
624 .proc_handler = &proc_dointvec_minmax,
625 .extra1 = &zero
626 },
627 {
617 .ctl_name = NET_TCP_NO_METRICS_SAVE, 628 .ctl_name = NET_TCP_NO_METRICS_SAVE,
618 .procname = "tcp_no_metrics_save", 629 .procname = "tcp_no_metrics_save",
619 .data = &sysctl_tcp_nometrics_save, 630 .data = &sysctl_tcp_nometrics_save,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56d..00aa80e93243 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1696 int err = 0; 1696 int err = 0;
1697 1697
1698 if (level != SOL_TCP) 1698 if (level != SOL_TCP)
1699 return tp->af_specific->setsockopt(sk, level, optname, 1699 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1700 optval, optlen); 1700 optval, optlen);
1701 1701
1702 /* This is a string value all the others are int's */ 1702 /* This is a string value all the others are int's */
1703 if (optname == TCP_CONGESTION) { 1703 if (optname == TCP_CONGESTION) {
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); 1914 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 1915 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1916 1916
1917 info->tcpi_pmtu = tp->pmtu_cookie; 1917 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; 1918 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; 1919 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; 1920 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1939 int val, len; 1939 int val, len;
1940 1940
1941 if (level != SOL_TCP) 1941 if (level != SOL_TCP)
1942 return tp->af_specific->getsockopt(sk, level, optname, 1942 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
1943 optval, optlen); 1943 optval, optlen);
1944 1944
1945 if (get_user(len, optlen)) 1945 if (get_user(len, optlen))
1946 return -EFAULT; 1946 return -EFAULT;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b1..035f2092d73a 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
30static int max_increment = 16; 30static int max_increment = 16;
31static int low_window = 14; 31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ 32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100; 33static int initial_ssthresh = 100;
36static int smooth_part = 20; 34static int smooth_part = 20;
37 35
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); 41MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644); 42module_param(beta, int, 0644);
45MODULE_PARM_DESC(beta, "beta for multiplicative increase"); 43MODULE_PARM_DESC(beta, "beta for multiplicative increase");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
49MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
50module_param(initial_ssthresh, int, 0644); 44module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); 45MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644); 46module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
60 u32 loss_cwnd; /* congestion window at last loss */ 54 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */ 55 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */ 56 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */ 57 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4 58#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 59 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
77 ca->loss_cwnd = 0; 66 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0; 67 ca->last_cwnd = 0;
79 ca->last_time = 0; 68 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0; 69 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 70 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87} 71}
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
143 } 127 }
144 128
145 /* if in slow start or link utilization is very low */ 129 /* if in slow start or link utilization is very low */
146 if ( ca->loss_cwnd == 0 || 130 if (ca->loss_cwnd == 0) {
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20; 132 ca->cnt = 20;
150 } 133 }
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
154 ca->cnt = 1; 137 ca->cnt = 1;
155} 138}
156 139
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct sock *sk, int flag)
160{
161 const struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
163 u32 dist, delay;
164
165 /* No time stamp */
166 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
167 /* Discard delay samples right after fast recovery */
168 tcp_time_stamp < ca->epoch_start + HZ ||
169 /* this delay samples may not be accurate */
170 flag == 0) {
171 ca->last_delay = 0;
172 goto notlow;
173 }
174
175 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
176 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
177 if (delay == 0) /* no previous delay sample */
178 goto notlow;
179
180 /* first time call or link delay decreases */
181 if (ca->delay_min == 0 || ca->delay_min > delay) {
182 ca->delay_min = ca->delay_max = delay;
183 goto notlow;
184 }
185
186 if (ca->delay_max < delay)
187 ca->delay_max = delay;
188
189 /* utilization is low, if avg delay < dist*threshold
190 for checking_period time */
191 dist = ca->delay_max - ca->delay_min;
192 if (dist <= ca->delay_min>>6 ||
193 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
194 goto notlow;
195
196 if (ca->low_utilization_start == 0) {
197 ca->low_utilization = 0;
198 ca->low_utilization_start = tcp_time_stamp;
199 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
200 > low_utilization_period*HZ) {
201 ca->low_utilization = 1;
202 }
203
204 return;
205
206 notlow:
207 ca->low_utilization = 0;
208 ca->low_utilization_start = 0;
209
210}
211
212static void bictcp_cong_avoid(struct sock *sk, u32 ack, 140static void bictcp_cong_avoid(struct sock *sk, u32 ack,
213 u32 seq_rtt, u32 in_flight, int data_acked) 141 u32 seq_rtt, u32 in_flight, int data_acked)
214{ 142{
215 struct tcp_sock *tp = tcp_sk(sk); 143 struct tcp_sock *tp = tcp_sk(sk);
216 struct bictcp *ca = inet_csk_ca(sk); 144 struct bictcp *ca = inet_csk_ca(sk);
217 145
218 bictcp_low_utilization(sk, data_acked);
219
220 if (!tcp_is_cwnd_limited(sk, in_flight)) 146 if (!tcp_is_cwnd_limited(sk, in_flight))
221 return; 147 return;
222 148
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
249 175
250 ca->epoch_start = 0; /* end of epoch */ 176 ca->epoch_start = 0; /* end of epoch */
251 177
252 /* in case of wrong delay_max*/
253 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
254 ca->delay_max = ca->delay_min
255 + ((ca->delay_max - ca->delay_min)* 90) / 100;
256
257 /* Wmax and fast convergence */ 178 /* Wmax and fast convergence */
258 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) 179 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
259 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) 180 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
@@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
289 bictcp_reset(inet_csk_ca(sk)); 210 bictcp_reset(inet_csk_ca(sk));
290} 211}
291 212
292/* Track delayed acknowledgement ratio using sliding window 213/* Track delayed acknowledgment ratio using sliding window
293 * ratio = (15*ratio + sample) / 16 214 * ratio = (15*ratio + sample) / 16
294 */ 215 */
295static void bictcp_acked(struct sock *sk, u32 cnt) 216static void bictcp_acked(struct sock *sk, u32 cnt)
296{ 217{
297 const struct inet_connection_sock *icsk = inet_csk(sk); 218 const struct inet_connection_sock *icsk = inet_csk(sk);
298 219
299 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { 220 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
300 struct bictcp *ca = inet_csk_ca(sk); 221 struct bictcp *ca = inet_csk_ca(sk);
301 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 222 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
302 ca->delayed_ack += cnt; 223 ca->delayed_ack += cnt;
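The two lines kept at the end of bictcp_acked() are a fixed-point EWMA. With the state stored as ratio << 4, adding (sample - state/16) computes ratio = (15*ratio + sample)/16 without a divide. A minimal restatement (local names, not the kernel's):

#define ACK_RATIO_SHIFT 4

/* state holds ratio << ACK_RATIO_SHIFT; one shift and one add update it:
 * state' = state + sample - state/16  ==>  ratio' = (15*ratio + sample)/16 */
static unsigned int ewma_update(unsigned int state, unsigned int sample)
{
	state += sample - (state >> ACK_RATIO_SHIFT);
	return state;
}

Seeding state with 2 << ACK_RATIO_SHIFT, as bictcp_reset() does, starts the estimate at two packets per ACK, the classic delayed-ACK ratio.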
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc12..e688c687d62d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
174 return err; 174 return err;
175} 175}
176 176
177
178/*
179 * Linear increase during slow start
180 */
181void tcp_slow_start(struct tcp_sock *tp)
182{
183 if (sysctl_tcp_abc) {
184 /* RFC3465: Slow Start
185 * TCP sender SHOULD increase cwnd by the number of
186 * previously unacknowledged bytes ACKed by each incoming
187 * acknowledgment, provided the increase is not more than L
188 */
189 if (tp->bytes_acked < tp->mss_cache)
190 return;
191
192 /* We MAY increase by 2 if a delayed ACK is discovered */
193 if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
194 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
195 tp->snd_cwnd++;
196 }
197 }
198 tp->bytes_acked = 0;
199
200 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
201 tp->snd_cwnd++;
202}
203EXPORT_SYMBOL_GPL(tcp_slow_start);
204
177/* 205/*
178 * TCP Reno congestion control 206 * TCP Reno congestion control
179 * This is special case used for fallback as well. 207 * This is special case used for fallback as well.
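The new tcp_slow_start() gates growth on RFC 3465 Appropriate Byte Counting when sysctl_tcp_abc is set: cwnd grows one segment only after a full MSS of new data has been acked, plus an optional second segment for stretched/delayed ACKs when abc > 1. A standalone restatement of that gate (toy struct and local names; the clamp checks mirror the code above):

struct toy_tp { unsigned int cwnd, clamp, bytes_acked, mss; };

static void toy_slow_start(struct toy_tp *tp, int abc)
{
	if (abc) {
		if (tp->bytes_acked < tp->mss)
			return;				/* not a full MSS yet */
		if (abc > 1 && tp->bytes_acked > 2 * tp->mss &&
		    tp->cwnd < tp->clamp)
			tp->cwnd++;			/* delayed-ACK bonus */
	}
	tp->bytes_acked = 0;
	if (tp->cwnd < tp->clamp)
		tp->cwnd++;
}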
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000000..31a4986dfbf7
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,411 @@
1/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
3 *
4 * This is from the implementation of CUBIC TCP in
5 * Injong Rhee, Lisong Xu.
6 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
7 * in PFLDnet 2005
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
10 *
11 * Unless CUBIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
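In the paper's notation, the growth law this file implements is the following (not stated verbatim in the code; beta is the multiplicative-decrease fraction and C the scaling constant, and right after a loss W_max - cwnd = beta * W_max, which is why bictcp_update() below derives K from last_max_cwnd - cwnd):

W(t) = C\,(t - K)^3 + W_{\max},
\qquad
K = \sqrt[3]{\frac{W_{\max}\,\beta}{C}}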
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19#include <asm/div64.h>
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
29
30static int fast_convergence = 1;
31static int max_increment = 16;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int initial_ssthresh = 100;
34static int bic_scale = 41;
35static int tcp_friendliness = 1;
36
37static u32 cube_rtt_scale;
38static u32 beta_scale;
39static u64 cube_factor;
40
41/* Note parameters that are used for precomputing scale factors are read-only */
42module_param(fast_convergence, int, 0644);
43MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
44module_param(max_increment, int, 0644);
45MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
46module_param(beta, int, 0444);
47MODULE_PARM_DESC(beta, "beta for multiplicative increase");
48module_param(initial_ssthresh, int, 0644);
49MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
50module_param(bic_scale, int, 0444);
51MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
52module_param(tcp_friendliness, int, 0644);
53MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
54
55#include <asm/div64.h>
56
57/* BIC TCP Parameters */
58struct bictcp {
59 u32 cnt; /* increase cwnd by 1 after ACKs */
60 u32 last_max_cwnd; /* last maximum snd_cwnd */
61 u32 loss_cwnd; /* congestion window at last loss */
62 u32 last_cwnd; /* the last snd_cwnd */
63 u32 last_time; /* time when updated last_cwnd */
64 u32 bic_origin_point;/* origin point of bic function */
65 u32 bic_K; /* time to origin point from the beginning of the current epoch */
66 u32 delay_min; /* min delay */
67 u32 epoch_start; /* beginning of an epoch */
68 u32 ack_cnt; /* number of acks */
69 u32 tcp_cwnd; /* estimated tcp cwnd */
70#define ACK_RATIO_SHIFT 4
71 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
72};
73
74static inline void bictcp_reset(struct bictcp *ca)
75{
76 ca->cnt = 0;
77 ca->last_max_cwnd = 0;
78 ca->loss_cwnd = 0;
79 ca->last_cwnd = 0;
80 ca->last_time = 0;
81 ca->bic_origin_point = 0;
82 ca->bic_K = 0;
83 ca->delay_min = 0;
84 ca->epoch_start = 0;
85 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
86 ca->ack_cnt = 0;
87 ca->tcp_cwnd = 0;
88}
89
90static void bictcp_init(struct sock *sk)
91{
92 bictcp_reset(inet_csk_ca(sk));
93 if (initial_ssthresh)
94 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
95}
96
 97/* 64-bit divisor, dividend and result; dynamic precision */
98static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
99{
100 u_int32_t d = divisor;
101
102 if (divisor > 0xffffffffULL) {
103 unsigned int shift = fls(divisor >> 32);
104
105 d = divisor >> shift;
106 dividend >>= shift;
107 }
108
109 /* avoid 64 bit division if possible */
110 if (dividend >> 32)
111 do_div(dividend, d);
112 else
113 dividend = (uint32_t) dividend / d;
114
115 return dividend;
116}
117
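The precision trade-off above can be restated in userspace: when the divisor does not fit in 32 bits, shifting both operands right by the same amount keeps the quotient approximately right while allowing a cheaper division. A hedged sketch (assumes __builtin_clzll from GCC/Clang; the zero-divisor guard is an addition of this sketch, not of the kernel routine):

#include <assert.h>
#include <stdint.h>

static uint64_t approx_div64_64(uint64_t dividend, uint64_t divisor)
{
	uint32_t d = (uint32_t)divisor;

	if (divisor > 0xffffffffULL) {
		/* 64 - clz gives the 1-indexed top bit, like the kernel's fls() */
		unsigned int shift = 64 - __builtin_clzll(divisor >> 32);

		d = (uint32_t)(divisor >> shift);
		dividend >>= shift;
	}
	return d ? dividend / d : dividend;
}

int main(void)
{
	/* Exact when the divisor fits in 32 bits... */
	assert(approx_div64_64(1ULL << 40, 1024) == (1ULL << 30));
	/* ...and also exact here for a power-of-two 64-bit divisor. */
	assert(approx_div64_64(1ULL << 40, 1ULL << 33) == 128);
	return 0;
}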
118/*
119 * calculate the cubic root of x using Newton-Raphson
120 */
121static u32 cubic_root(u64 a)
122{
123 u32 x, x1;
124
125 /* Initial estimate is based on:
126 * cbrt(x) = exp(log(x) / 3)
127 */
128 x = 1u << (fls64(a)/3);
129
 130 /*
 131 * Iteration based on:
 132 *
 133 * x_{k+1} = ( 2 * x_k + a / x_k^2 ) / 3
 134 *
 135 */
136 do {
137 x1 = x;
138 x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
139 } while (abs(x1 - x) > 1);
140
141 return x;
142}
143
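A self-contained userspace rendering of the same iteration, useful for convincing oneself that it converges in a handful of steps (the toy_ name and the GCC builtin are assumptions of this sketch):

#include <assert.h>
#include <stdint.h>

/* Integer Newton-Raphson cube root, mirroring the routine above:
 * start from 2^(bits/3) and iterate x = (2x + a/x^2) / 3 until
 * successive estimates differ by at most 1.
 */
static uint32_t toy_cubic_root(uint64_t a)
{
	uint32_t x, x1;
	unsigned int bits = 64 - __builtin_clzll(a | 1);

	x = 1u << (bits / 3);
	do {
		x1 = x;
		x = (2 * x + (uint32_t)(a / ((uint64_t)x * x))) / 3;
	} while ((x1 > x ? x1 - x : x - x1) > 1);

	return x;
}

int main(void)
{
	assert(toy_cubic_root(27) == 3);
	assert(toy_cubic_root(1000000) == 100);
	assert(toy_cubic_root(1ULL << 30) == 1024);	/* cbrt(2^30) = 2^10 */
	return 0;
}

Starting from 2^(bits/3) the estimate is within roughly a factor of two of the true root, so each Newton step about doubles the number of correct bits.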
144/*
145 * Compute congestion window to use.
146 */
147static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
148{
149 u64 offs;
150 u32 delta, t, bic_target, min_cnt, max_cnt;
151
152 ca->ack_cnt++; /* count the number of ACKs */
153
154 if (ca->last_cwnd == cwnd &&
155 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
156 return;
157
158 ca->last_cwnd = cwnd;
159 ca->last_time = tcp_time_stamp;
160
161 if (ca->epoch_start == 0) {
162 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
163 ca->ack_cnt = 1; /* start counting */
 164 ca->tcp_cwnd = cwnd; /* sync with cubic */
165
166 if (ca->last_max_cwnd <= cwnd) {
167 ca->bic_K = 0;
168 ca->bic_origin_point = cwnd;
169 } else {
170 /* Compute new K based on
171 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
172 */
173 ca->bic_K = cubic_root(cube_factor
174 * (ca->last_max_cwnd - cwnd));
175 ca->bic_origin_point = ca->last_max_cwnd;
176 }
177 }
178
 179 /* cubic function - calc */
180 /* calculate c * time^3 / rtt,
181 * while considering overflow in calculation of time^3
182 * (so time^3 is done by using 64 bit)
183 * and without the support of division of 64bit numbers
184 * (so all divisions are done by using 32 bit)
 185 * also NOTE the unit of those variables
186 * time = (t - K) / 2^bictcp_HZ
187 * c = bic_scale >> 10
188 * rtt = (srtt >> 3) / HZ
189 * !!! The following code does not have overflow problems,
190 * if the cwnd < 1 million packets !!!
191 */
192
193 /* change the unit from HZ to bictcp_HZ */
194 t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
195 << BICTCP_HZ) / HZ;
196
197 if (t < ca->bic_K) /* t - K */
198 offs = ca->bic_K - t;
199 else
200 offs = t - ca->bic_K;
201
202 /* c/rtt * (t-K)^3 */
203 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
204 if (t < ca->bic_K) /* below origin*/
205 bic_target = ca->bic_origin_point - delta;
206 else /* above origin*/
207 bic_target = ca->bic_origin_point + delta;
208
209 /* cubic function - calc bictcp_cnt*/
210 if (bic_target > cwnd) {
211 ca->cnt = cwnd / (bic_target - cwnd);
212 } else {
213 ca->cnt = 100 * cwnd; /* very small increment*/
214 }
215
216 if (ca->delay_min > 0) {
217 /* max increment = Smax * rtt / 0.1 */
218 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
219 if (ca->cnt < min_cnt)
220 ca->cnt = min_cnt;
221 }
222
223 /* slow start and low utilization */
224 if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
225 ca->cnt = 50;
226
227 /* TCP Friendly */
228 if (tcp_friendliness) {
229 u32 scale = beta_scale;
230 delta = (cwnd * scale) >> 3;
231 while (ca->ack_cnt > delta) { /* update tcp cwnd */
232 ca->ack_cnt -= delta;
233 ca->tcp_cwnd++;
234 }
235
236 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
237 delta = ca->tcp_cwnd - cwnd;
238 max_cnt = cwnd / delta;
239 if (ca->cnt > max_cnt)
240 ca->cnt = max_cnt;
241 }
242 }
243
244 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
245 if (ca->cnt == 0) /* cannot be zero */
246 ca->cnt = 1;
247}
248
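To make the epoch computation above concrete: with the default parameters (bic_scale = 41, beta = 819) and an assumed 100 ms RTT, the effective cubic coefficient is C = (41/1024)/0.1 ~= 0.4 pkts/s^3, and K is the time for the curve to climb back to the old maximum. A userspace calculation (illustration only; build with cc file.c -lm):

#include <assert.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
	double C = (41.0 / 1024.0) / 0.1;	/* ~0.4 pkts/s^3 */
	double wmax = 100.0;			/* cwnd at the last loss */
	double cwnd = wmax * 819.0 / 1024.0;	/* ~80 after the beta cut */
	double K = cbrt((wmax - cwnd) / C);	/* time to regain wmax */

	assert(K > 3.6 && K < 3.8);		/* ~3.7 seconds */
	printf("C=%.3f cwnd=%.1f K=%.2fs\n", C, cwnd, K);
	return 0;
}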
249
250/* Keep track of minimum rtt */
251static inline void measure_delay(struct sock *sk)
252{
253 const struct tcp_sock *tp = tcp_sk(sk);
254 struct bictcp *ca = inet_csk_ca(sk);
255 u32 delay;
256
257 /* No time stamp */
258 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
259 /* Discard delay samples right after fast recovery */
260 (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
261 return;
262
263 delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
264 if (delay == 0)
265 delay = 1;
266
267 /* first time call or link delay decreases */
268 if (ca->delay_min == 0 || ca->delay_min > delay)
269 ca->delay_min = delay;
270}
271
272static void bictcp_cong_avoid(struct sock *sk, u32 ack,
273 u32 seq_rtt, u32 in_flight, int data_acked)
274{
275 struct tcp_sock *tp = tcp_sk(sk);
276 struct bictcp *ca = inet_csk_ca(sk);
277
278 if (data_acked)
279 measure_delay(sk);
280
281 if (!tcp_is_cwnd_limited(sk, in_flight))
282 return;
283
284 if (tp->snd_cwnd <= tp->snd_ssthresh)
285 tcp_slow_start(tp);
286 else {
287 bictcp_update(ca, tp->snd_cwnd);
288
289 /* In dangerous area, increase slowly.
290 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
291 */
292 if (tp->snd_cwnd_cnt >= ca->cnt) {
293 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
294 tp->snd_cwnd++;
295 tp->snd_cwnd_cnt = 0;
296 } else
297 tp->snd_cwnd_cnt++;
298 }
299
300}
301
302static u32 bictcp_recalc_ssthresh(struct sock *sk)
303{
304 const struct tcp_sock *tp = tcp_sk(sk);
305 struct bictcp *ca = inet_csk_ca(sk);
306
307 ca->epoch_start = 0; /* end of epoch */
308
309 /* Wmax and fast convergence */
310 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
311 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
312 / (2 * BICTCP_BETA_SCALE);
313 else
314 ca->last_max_cwnd = tp->snd_cwnd;
315
316 ca->loss_cwnd = tp->snd_cwnd;
317
318 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
319}
320
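The two reductions above are plain fixed-point arithmetic; with the default beta = 819 they work out as follows (cwnd value assumed for illustration):

#include <assert.h>

int main(void)
{
	unsigned int cwnd = 100, beta = 819, scale = 1024;

	assert(cwnd * beta / scale == 79);		 /* new ssthresh */
	assert(cwnd * (scale + beta) / (2 * scale) == 89); /* fast convergence */
	return 0;
}

So a loss at cwnd = 100 leaves ssthresh at ~80% of the old window, and fast convergence remembers a maximum of only ~90% of it, releasing bandwidth to newer flows sooner.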
321static u32 bictcp_undo_cwnd(struct sock *sk)
322{
323 struct bictcp *ca = inet_csk_ca(sk);
324
325 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
326}
327
328static u32 bictcp_min_cwnd(struct sock *sk)
329{
330 return tcp_sk(sk)->snd_ssthresh;
331}
332
333static void bictcp_state(struct sock *sk, u8 new_state)
334{
335 if (new_state == TCP_CA_Loss)
336 bictcp_reset(inet_csk_ca(sk));
337}
338
339/* Track delayed acknowledgment ratio using sliding window
340 * ratio = (15*ratio + sample) / 16
341 */
342static void bictcp_acked(struct sock *sk, u32 cnt)
343{
344 const struct inet_connection_sock *icsk = inet_csk(sk);
345
346 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
347 struct bictcp *ca = inet_csk_ca(sk);
348 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
349 ca->delayed_ack += cnt;
350 }
351}
352
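The update above is ratio += sample - ratio/16: an exponentially weighted average kept scaled by 16, whose fixed point is 16 times the steady-state sample. A tiny check (values hypothetical):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t ratio = 16;	/* initial 1 pkt/ACK, scaled by 16 */
	int i;

	/* Feed a constant sample of 2 packets per ACK. */
	for (i = 0; i < 100; i++)
		ratio += 2 - (ratio >> 4);

	assert(ratio == 32);	/* converged to 16 * 2 */
	return 0;
}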
353
354static struct tcp_congestion_ops cubictcp = {
355 .init = bictcp_init,
356 .ssthresh = bictcp_recalc_ssthresh,
357 .cong_avoid = bictcp_cong_avoid,
358 .set_state = bictcp_state,
359 .undo_cwnd = bictcp_undo_cwnd,
360 .min_cwnd = bictcp_min_cwnd,
361 .pkts_acked = bictcp_acked,
362 .owner = THIS_MODULE,
363 .name = "cubic",
364};
365
366static int __init cubictcp_register(void)
367{
368 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
369
370 /* Precompute a bunch of the scaling factors that are used per-packet
371 * based on SRTT of 100ms
372 */
373
374 beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
375
376 cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */
377
378 /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
379 * so K = cubic_root( (wmax-cwnd)*rtt/c )
380 * the unit of K is bictcp_HZ=2^10, not HZ
381 *
382 * c = bic_scale >> 10
383 * rtt = 100ms
384 *
385 * the following code has been designed and tested for
386 * cwnd < 1 million packets
387 * RTT < 100 seconds
 388 * HZ < 100,000,000 (corresponding to a 10 nanosecond tick)
389 */
390
 391 /* 1/c * 2^(3*bictcp_HZ) * srtt */
392 cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
393
394 /* divide by bic_scale and by constant Srtt (100ms) */
395 do_div(cube_factor, bic_scale * 10);
396
397 return tcp_register_congestion_control(&cubictcp);
398}
399
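With the default module parameters the precomputed constants above come out as follows; a standalone userspace re-computation (assumed defaults beta = 819, bic_scale = 41):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int beta = 819, bic_scale = 41;
	uint32_t beta_scale, cube_rtt_scale;
	uint64_t cube_factor;

	beta_scale = 8 * (1024 + beta) / 3 / (1024 - beta);
	cube_rtt_scale = (bic_scale << 3) / 10;
	cube_factor = (1ULL << 40) / (uint64_t)(bic_scale * 10);

	assert(beta_scale == 23);
	assert(cube_rtt_scale == 32);
	assert(cube_factor == 2681735677ULL);
	return 0;
}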
400static void __exit cubictcp_unregister(void)
401{
402 tcp_unregister_congestion_control(&cubictcp);
403}
404
405module_init(cubictcp_register);
406module_exit(cubictcp_unregister);
407
408MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
409MODULE_LICENSE("GPL");
410MODULE_DESCRIPTION("CUBIC TCP");
411MODULE_VERSION("2.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bce..0a461232329f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
115/* Adapt the MSS value used to make delayed ack decision to the 115/* Adapt the MSS value used to make delayed ack decision to the
116 * real world. 116 * real world.
117 */ 117 */
118static inline void tcp_measure_rcv_mss(struct sock *sk, 118static void tcp_measure_rcv_mss(struct sock *sk,
119 const struct sk_buff *skb) 119 const struct sk_buff *skb)
120{ 120{
121 struct inet_connection_sock *icsk = inet_csk(sk); 121 struct inet_connection_sock *icsk = inet_csk(sk);
122 const unsigned int lss = icsk->icsk_ack.last_seg_size; 122 const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
246 return 0; 246 return 0;
247} 247}
248 248
249static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, 249static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
250 struct sk_buff *skb) 250 struct sk_buff *skb)
251{ 251{
252 /* Check #1 */ 252 /* Check #1 */
253 if (tp->rcv_ssthresh < tp->window_clamp && 253 if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); 341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
342} 342}
343 343
344
345/* Initialize RCV_MSS value.
346 * RCV_MSS is an our guess about MSS used by the peer.
347 * We haven't any direct information about the MSS.
348 * It's better to underestimate the RCV_MSS rather than overestimate.
349 * Overestimations make us ACKing less frequently than needed.
350 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
351 */
352void tcp_initialize_rcv_mss(struct sock *sk)
353{
354 struct tcp_sock *tp = tcp_sk(sk);
355 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
356
357 hint = min(hint, tp->rcv_wnd/2);
358 hint = min(hint, TCP_MIN_RCVMSS);
359 hint = max(hint, TCP_MIN_MSS);
360
361 inet_csk(sk)->icsk_ack.rcv_mss = hint;
362}
363
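A worked example of the clamping above, assuming the era's constants TCP_MIN_MSS = 88 and TCP_MIN_RCVMSS = 536 (taken as assumptions here, not read from this patch):

#include <assert.h>

#define TCP_MIN_MSS	88u	/* assumed historical value */
#define TCP_MIN_RCVMSS	536u	/* assumed historical value */

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

int main(void)
{
	unsigned int advmss = 1460, mss_cache = 1448, rcv_wnd = 65535;
	unsigned int hint = min_u(advmss, mss_cache);

	hint = min_u(hint, rcv_wnd / 2);
	hint = min_u(hint, TCP_MIN_RCVMSS);
	hint = max_u(hint, TCP_MIN_MSS);

	assert(hint == 536);	/* pinned at the conservative guess */
	return 0;
}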
344/* Receiver "autotuning" code. 364/* Receiver "autotuning" code.
345 * 365 *
346 * The algorithm for RTT estimation w/o timestamps is based on 366 * The algorithm for RTT estimation w/o timestamps is based on
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
735 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 755 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
736} 756}
737 757
758/* Enter CWR state: reduce ssthresh and cwnd without falling back to slow start */
759void tcp_enter_cwr(struct sock *sk)
760{
761 struct tcp_sock *tp = tcp_sk(sk);
762
763 tp->prior_ssthresh = 0;
764 tp->bytes_acked = 0;
765 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
766 tp->undo_marker = 0;
767 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
768 tp->snd_cwnd = min(tp->snd_cwnd,
769 tcp_packets_in_flight(tp) + 1U);
770 tp->snd_cwnd_cnt = 0;
771 tp->high_seq = tp->snd_nxt;
772 tp->snd_cwnd_stamp = tcp_time_stamp;
773 TCP_ECN_queue_cwr(tp);
774
775 tcp_set_ca_state(sk, TCP_CA_CWR);
776 }
777}
778
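The min() above deflates a window that outgrew what is actually outstanding; a small numeric illustration with assumed values:

#include <assert.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

int main(void)
{
	unsigned int snd_cwnd = 40, packets_in_flight = 7;

	/* On entering CWR, cwnd drops to in-flight + 1 immediately. */
	snd_cwnd = min_u(snd_cwnd, packets_in_flight + 1u);
	assert(snd_cwnd == 8);
	return 0;
}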
738/* Initialize metrics on socket. */ 779/* Initialize metrics on socket. */
739 780
740static void tcp_init_metrics(struct sock *sk) 781static void tcp_init_metrics(struct sock *sk)
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
2070 tcp_ack_no_tstamp(sk, seq_rtt, flag); 2111 tcp_ack_no_tstamp(sk, seq_rtt, flag);
2071} 2112}
2072 2113
2073static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, 2114static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2074 u32 in_flight, int good) 2115 u32 in_flight, int good)
2075{ 2116{
2076 const struct inet_connection_sock *icsk = inet_csk(sk); 2117 const struct inet_connection_sock *icsk = inet_csk(sk);
2077 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); 2118 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
2082 * RFC2988 recommends to restart timer to now+rto. 2123 * RFC2988 recommends to restart timer to now+rto.
2083 */ 2124 */
2084 2125
2085static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) 2126static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2086{ 2127{
2087 if (!tp->packets_out) { 2128 if (!tp->packets_out) {
2088 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2129 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2147 return acked; 2188 return acked;
2148} 2189}
2149 2190
2150static inline u32 tcp_usrtt(const struct sk_buff *skb) 2191static u32 tcp_usrtt(const struct sk_buff *skb)
2151{ 2192{
2152 struct timeval tv, now; 2193 struct timeval tv, now;
2153 2194
@@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
2342 2383
2343 if (nwin > tp->max_window) { 2384 if (nwin > tp->max_window) {
2344 tp->max_window = nwin; 2385 tp->max_window = nwin;
2345 tcp_sync_mss(sk, tp->pmtu_cookie); 2386 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
2346 } 2387 }
2347 } 2388 }
2348 } 2389 }
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
2583/* Fast parse options. This hopes to only see timestamps. 2624/* Fast parse options. This hopes to only see timestamps.
2584 * If it is wrong it falls back on tcp_parse_options(). 2625 * If it is wrong it falls back on tcp_parse_options().
2585 */ 2626 */
2586static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, 2627static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
2587 struct tcp_sock *tp) 2628 struct tcp_sock *tp)
2588{ 2629{
2589 if (th->doff == sizeof(struct tcphdr)>>2) { 2630 if (th->doff == sizeof(struct tcphdr)>>2) {
2590 tp->rx_opt.saw_tstamp = 0; 2631 tp->rx_opt.saw_tstamp = 0;
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2804 } 2845 }
2805} 2846}
2806 2847
2807static __inline__ int 2848static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2808tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2809{ 2849{
2810 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 2850 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
2811 if (before(seq, sp->start_seq)) 2851 if (before(seq, sp->start_seq))
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
2817 return 0; 2857 return 0;
2818} 2858}
2819 2859
2820static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) 2860static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2821{ 2861{
2822 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { 2862 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
2823 if (before(seq, tp->rcv_nxt)) 2863 if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
2832 } 2872 }
2833} 2873}
2834 2874
2835static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) 2875static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
2836{ 2876{
2837 if (!tp->rx_opt.dsack) 2877 if (!tp->rx_opt.dsack)
2838 tcp_dsack_set(tp, seq, end_seq); 2878 tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
2890 } 2930 }
2891} 2931}
2892 2932
2893static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) 2933static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
2894{ 2934{
2895 __u32 tmp; 2935 __u32 tmp;
2896 2936
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
3455 tp->snd_cwnd_stamp = tcp_time_stamp; 3495 tp->snd_cwnd_stamp = tcp_time_stamp;
3456} 3496}
3457 3497
3458static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) 3498static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3459{ 3499{
3460 /* If the user specified a specific send buffer setting, do 3500 /* If the user specified a specific send buffer setting, do
3461 * not modify it. 3501 * not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
3502 sk->sk_write_space(sk); 3542 sk->sk_write_space(sk);
3503} 3543}
3504 3544
3505static inline void tcp_check_space(struct sock *sk) 3545static void tcp_check_space(struct sock *sk)
3506{ 3546{
3507 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 3547 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3508 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 3548 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
3512 } 3552 }
3513} 3553}
3514 3554
3515static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) 3555static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3516{ 3556{
3517 tcp_push_pending_frames(sk, tp); 3557 tcp_push_pending_frames(sk, tp);
3518 tcp_check_space(sk); 3558 tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3544 } 3584 }
3545} 3585}
3546 3586
3547static __inline__ void tcp_ack_snd_check(struct sock *sk) 3587static inline void tcp_ack_snd_check(struct sock *sk)
3548{ 3588{
3549 if (!inet_csk_ack_scheduled(sk)) { 3589 if (!inet_csk_ack_scheduled(sk)) {
3550 /* We sent a data segment already. */ 3590 /* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3692 return result; 3732 return result;
3693} 3733}
3694 3734
3695static __inline__ int 3735static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3696tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
3697{ 3736{
3698 return skb->ip_summed != CHECKSUM_UNNECESSARY && 3737 return skb->ip_summed != CHECKSUM_UNNECESSARY &&
3699 __tcp_checksum_complete_user(sk, skb); 3738 __tcp_checksum_complete_user(sk, skb);
@@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3967 struct tcphdr *th, unsigned len) 4006 struct tcphdr *th, unsigned len)
3968{ 4007{
3969 struct tcp_sock *tp = tcp_sk(sk); 4008 struct tcp_sock *tp = tcp_sk(sk);
4009 struct inet_connection_sock *icsk = inet_csk(sk);
3970 int saved_clamp = tp->rx_opt.mss_clamp; 4010 int saved_clamp = tp->rx_opt.mss_clamp;
3971 4011
3972 tcp_parse_options(skb, &tp->rx_opt, 0); 4012 tcp_parse_options(skb, &tp->rx_opt, 0);
3973 4013
3974 if (th->ack) { 4014 if (th->ack) {
3975 struct inet_connection_sock *icsk;
3976 /* rfc793: 4015 /* rfc793:
3977 * "If the state is SYN-SENT then 4016 * "If the state is SYN-SENT then
3978 * first check the ACK bit 4017 * first check the ACK bit
@@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4061 if (tp->rx_opt.sack_ok && sysctl_tcp_fack) 4100 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4062 tp->rx_opt.sack_ok |= 2; 4101 tp->rx_opt.sack_ok |= 2;
4063 4102
4064 tcp_sync_mss(sk, tp->pmtu_cookie); 4103 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4065 tcp_initialize_rcv_mss(sk); 4104 tcp_initialize_rcv_mss(sk);
4066 4105
4067 /* Remember, tcp_poll() does not lock socket! 4106 /* Remember, tcp_poll() does not lock socket!
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4072 tcp_set_state(sk, TCP_ESTABLISHED); 4111 tcp_set_state(sk, TCP_ESTABLISHED);
4073 4112
4074 /* Make sure socket is routed, for correct metrics. */ 4113 /* Make sure socket is routed, for correct metrics. */
4075 tp->af_specific->rebuild_header(sk); 4114 icsk->icsk_af_ops->rebuild_header(sk);
4076 4115
4077 tcp_init_metrics(sk); 4116 tcp_init_metrics(sk);
4078 4117
@@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4098 sk_wake_async(sk, 0, POLL_OUT); 4137 sk_wake_async(sk, 0, POLL_OUT);
4099 } 4138 }
4100 4139
4101 icsk = inet_csk(sk);
4102
4103 if (sk->sk_write_pending || 4140 if (sk->sk_write_pending ||
4104 icsk->icsk_accept_queue.rskq_defer_accept || 4141 icsk->icsk_accept_queue.rskq_defer_accept ||
4105 icsk->icsk_ack.pingpong) { 4142 icsk->icsk_ack.pingpong) {
@@ -4173,7 +4210,7 @@ discard:
4173 if (tp->ecn_flags&TCP_ECN_OK) 4210 if (tp->ecn_flags&TCP_ECN_OK)
4174 sock_set_flag(sk, SOCK_NO_LARGESEND); 4211 sock_set_flag(sk, SOCK_NO_LARGESEND);
4175 4212
4176 tcp_sync_mss(sk, tp->pmtu_cookie); 4213 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4177 tcp_initialize_rcv_mss(sk); 4214 tcp_initialize_rcv_mss(sk);
4178 4215
4179 4216
@@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4220 struct tcphdr *th, unsigned len) 4257 struct tcphdr *th, unsigned len)
4221{ 4258{
4222 struct tcp_sock *tp = tcp_sk(sk); 4259 struct tcp_sock *tp = tcp_sk(sk);
4260 struct inet_connection_sock *icsk = inet_csk(sk);
4223 int queued = 0; 4261 int queued = 0;
4224 4262
4225 tp->rx_opt.saw_tstamp = 0; 4263 tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4236 goto discard; 4274 goto discard;
4237 4275
4238 if(th->syn) { 4276 if(th->syn) {
4239 if(tp->af_specific->conn_request(sk, skb) < 0) 4277 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
4240 return 1; 4278 return 1;
4241 4279
4242 /* Now we have several options: In theory there is 4280 /* Now we have several options: In theory there is
@@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4349 /* Make sure socket is routed, for 4387 /* Make sure socket is routed, for
4350 * correct metrics. 4388 * correct metrics.
4351 */ 4389 */
4352 tp->af_specific->rebuild_header(sk); 4390 icsk->icsk_af_ops->rebuild_header(sk);
4353 4391
4354 tcp_init_metrics(sk); 4392 tcp_init_metrics(sk);
4355 4393
@@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
4475EXPORT_SYMBOL(tcp_parse_options); 4513EXPORT_SYMBOL(tcp_parse_options);
4476EXPORT_SYMBOL(tcp_rcv_established); 4514EXPORT_SYMBOL(tcp_rcv_established);
4477EXPORT_SYMBOL(tcp_rcv_state_process); 4515EXPORT_SYMBOL(tcp_rcv_state_process);
4516EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929b..e9f83e5b28ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
69#include <net/transp_v6.h> 69#include <net/transp_v6.h>
70#include <net/ipv6.h> 70#include <net/ipv6.h>
71#include <net/inet_common.h> 71#include <net/inet_common.h>
72#include <net/timewait_sock.h>
72#include <net/xfrm.h> 73#include <net/xfrm.h>
73 74
74#include <linux/inet.h> 75#include <linux/inet.h>
@@ -86,8 +87,7 @@ int sysctl_tcp_low_latency;
86/* Socket used for sending RSTs */ 87/* Socket used for sending RSTs */
87static struct socket *tcp_socket; 88static struct socket *tcp_socket;
88 89
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 90void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
90 struct sk_buff *skb);
91 91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED, 93 .lhash_lock = RW_LOCK_UNLOCKED,
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
97 97
98static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 98static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
99{ 99{
100 return inet_csk_get_port(&tcp_hashinfo, sk, snum); 100 return inet_csk_get_port(&tcp_hashinfo, sk, snum,
101 inet_csk_bind_conflict);
101} 102}
102 103
103static void tcp_v4_hash(struct sock *sk) 104static void tcp_v4_hash(struct sock *sk)
@@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
118 skb->h.th->source); 119 skb->h.th->source);
119} 120}
120 121
121/* called with local bh disabled */ 122int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
122static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
123 struct inet_timewait_sock **twp)
124{ 123{
125 struct inet_sock *inet = inet_sk(sk); 124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
126 u32 daddr = inet->rcv_saddr; 125 struct tcp_sock *tp = tcp_sk(sk);
127 u32 saddr = inet->daddr;
128 int dif = sk->sk_bound_dev_if;
129 INET_ADDR_COOKIE(acookie, saddr, daddr)
130 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
131 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
132 struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
133 struct sock *sk2;
134 const struct hlist_node *node;
135 struct inet_timewait_sock *tw;
136
137 prefetch(head->chain.first);
138 write_lock(&head->lock);
139
140 /* Check TIME-WAIT sockets first. */
141 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
142 tw = inet_twsk(sk2);
143
144 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
145 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
146 struct tcp_sock *tp = tcp_sk(sk);
147
148 /* With PAWS, it is safe from the viewpoint
149 of data integrity. Even without PAWS it
150 is safe provided sequence spaces do not
151 overlap i.e. at data rates <= 80Mbit/sec.
152
153 Actually, the idea is close to VJ's one,
154 only timestamp cache is held not per host,
155 but per port pair and TW bucket is used
156 as state holder.
157 126
158 If TW bucket has been already destroyed we 127 /* With PAWS, it is safe from the viewpoint
159 fall back to VJ's scheme and use initial 128 of data integrity. Even without PAWS it is safe provided sequence
160 timestamp retrieved from peer table. 129 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
161 */
162 if (tcptw->tw_ts_recent_stamp &&
163 (!twp || (sysctl_tcp_tw_reuse &&
164 xtime.tv_sec -
165 tcptw->tw_ts_recent_stamp > 1))) {
166 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
167 if (tp->write_seq == 0)
168 tp->write_seq = 1;
169 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
170 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
171 sock_hold(sk2);
172 goto unique;
173 } else
174 goto not_unique;
175 }
176 }
177 tw = NULL;
178 130
179 /* And established part... */ 131 Actually, the idea is close to VJ's one, only timestamp cache is
180 sk_for_each(sk2, node, &head->chain) { 132 held not per host, but per port pair and TW bucket is used as state
181 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) 133 holder.
182 goto not_unique;
183 }
184 134
185unique: 135 If TW bucket has been already destroyed we fall back to VJ's scheme
186 /* Must record num and sport now. Otherwise we will see 136 and use initial timestamp retrieved from peer table.
187 * in hash table socket with a funny identity. */ 137 */
188 inet->num = lport; 138 if (tcptw->tw_ts_recent_stamp &&
189 inet->sport = htons(lport); 139 (twp == NULL || (sysctl_tcp_tw_reuse &&
190 sk->sk_hash = hash; 140 xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
191 BUG_TRAP(sk_unhashed(sk)); 141 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
192 __sk_add_node(sk, &head->chain); 142 if (tp->write_seq == 0)
193 sock_prot_inc_use(sk->sk_prot); 143 tp->write_seq = 1;
194 write_unlock(&head->lock); 144 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
195 145 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
196 if (twp) { 146 sock_hold(sktw);
197 *twp = tw; 147 return 1;
198 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
199 } else if (tw) {
200 /* Silly. Should hash-dance instead... */
201 inet_twsk_deschedule(tw, &tcp_death_row);
202 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
203
204 inet_twsk_put(tw);
205 } 148 }
206 149
207 return 0; 150 return 0;
208
209not_unique:
210 write_unlock(&head->lock);
211 return -EADDRNOTAVAIL;
212} 151}
213 152
214static inline u32 connect_port_offset(const struct sock *sk) 153EXPORT_SYMBOL_GPL(tcp_twsk_unique);
215{
216 const struct inet_sock *inet = inet_sk(sk);
217
218 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
219 inet->dport);
220}
221
222/*
223 * Bind a port for a connect operation and hash it.
224 */
225static inline int tcp_v4_hash_connect(struct sock *sk)
226{
227 const unsigned short snum = inet_sk(sk)->num;
228 struct inet_bind_hashbucket *head;
229 struct inet_bind_bucket *tb;
230 int ret;
231
232 if (!snum) {
233 int low = sysctl_local_port_range[0];
234 int high = sysctl_local_port_range[1];
235 int range = high - low;
236 int i;
237 int port;
238 static u32 hint;
239 u32 offset = hint + connect_port_offset(sk);
240 struct hlist_node *node;
241 struct inet_timewait_sock *tw = NULL;
242
243 local_bh_disable();
244 for (i = 1; i <= range; i++) {
245 port = low + (i + offset) % range;
246 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
247 spin_lock(&head->lock);
248
249 /* Does not bother with rcv_saddr checks,
250 * because the established check is already
251 * unique enough.
252 */
253 inet_bind_bucket_for_each(tb, node, &head->chain) {
254 if (tb->port == port) {
255 BUG_TRAP(!hlist_empty(&tb->owners));
256 if (tb->fastreuse >= 0)
257 goto next_port;
258 if (!__tcp_v4_check_established(sk,
259 port,
260 &tw))
261 goto ok;
262 goto next_port;
263 }
264 }
265
266 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
267 if (!tb) {
268 spin_unlock(&head->lock);
269 break;
270 }
271 tb->fastreuse = -1;
272 goto ok;
273
274 next_port:
275 spin_unlock(&head->lock);
276 }
277 local_bh_enable();
278
279 return -EADDRNOTAVAIL;
280
281ok:
282 hint += i;
283
284 /* Head lock still held and bh's disabled */
285 inet_bind_hash(sk, tb, port);
286 if (sk_unhashed(sk)) {
287 inet_sk(sk)->sport = htons(port);
288 __inet_hash(&tcp_hashinfo, sk, 0);
289 }
290 spin_unlock(&head->lock);
291
292 if (tw) {
293 inet_twsk_deschedule(tw, &tcp_death_row);;
294 inet_twsk_put(tw);
295 }
296
297 ret = 0;
298 goto out;
299 }
300
301 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
302 tb = inet_csk(sk)->icsk_bind_hash;
303 spin_lock_bh(&head->lock);
304 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
305 __inet_hash(&tcp_hashinfo, sk, 0);
306 spin_unlock_bh(&head->lock);
307 return 0;
308 } else {
309 spin_unlock(&head->lock);
310 /* No definite answer... Walk to established hash table */
311 ret = __tcp_v4_check_established(sk, snum, NULL);
312out:
313 local_bh_enable();
314 return ret;
315 }
316}
317 154
318/* This will initiate an outgoing connection. */ 155/* This will initiate an outgoing connection. */
319int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 156int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
383 inet->dport = usin->sin_port; 220 inet->dport = usin->sin_port;
384 inet->daddr = daddr; 221 inet->daddr = daddr;
385 222
386 tp->ext_header_len = 0; 223 inet_csk(sk)->icsk_ext_hdr_len = 0;
387 if (inet->opt) 224 if (inet->opt)
388 tp->ext_header_len = inet->opt->optlen; 225 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
389 226
390 tp->rx_opt.mss_clamp = 536; 227 tp->rx_opt.mss_clamp = 536;
391 228
@@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
395 * complete initialization after this. 232 * complete initialization after this.
396 */ 233 */
397 tcp_set_state(sk, TCP_SYN_SENT); 234 tcp_set_state(sk, TCP_SYN_SENT);
398 err = tcp_v4_hash_connect(sk); 235 err = inet_hash_connect(&tcp_death_row, sk);
399 if (err) 236 if (err)
400 goto failure; 237 goto failure;
401 238
@@ -433,12 +270,10 @@ failure:
433/* 270/*
434 * This routine does path mtu discovery as defined in RFC1191. 271 * This routine does path mtu discovery as defined in RFC1191.
435 */ 272 */
436static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, 273static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
437 u32 mtu)
438{ 274{
439 struct dst_entry *dst; 275 struct dst_entry *dst;
440 struct inet_sock *inet = inet_sk(sk); 276 struct inet_sock *inet = inet_sk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 277
443 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 278 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
444 * send out by Linux are always <576bytes so they should go through 279 * send out by Linux are always <576bytes so they should go through
@@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
467 mtu = dst_mtu(dst); 302 mtu = dst_mtu(dst);
468 303
469 if (inet->pmtudisc != IP_PMTUDISC_DONT && 304 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
470 tp->pmtu_cookie > mtu) { 305 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
471 tcp_sync_mss(sk, mtu); 306 tcp_sync_mss(sk, mtu);
472 307
473 /* Resend the TCP packet because it's 308 /* Resend the TCP packet because it's
@@ -644,10 +479,10 @@ out:
644} 479}
645 480
646/* This routine computes an IPv4 TCP checksum. */ 481/* This routine computes an IPv4 TCP checksum. */
647void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 482void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
648 struct sk_buff *skb)
649{ 483{
650 struct inet_sock *inet = inet_sk(sk); 484 struct inet_sock *inet = inet_sk(sk);
485 struct tcphdr *th = skb->h.th;
651 486
652 if (skb->ip_summed == CHECKSUM_HW) { 487 if (skb->ip_summed == CHECKSUM_HW) {
653 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); 488 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
826 kfree(inet_rsk(req)->opt); 661 kfree(inet_rsk(req)->opt);
827} 662}
828 663
829static inline void syn_flood_warning(struct sk_buff *skb) 664#ifdef CONFIG_SYN_COOKIES
665static void syn_flood_warning(struct sk_buff *skb)
830{ 666{
831 static unsigned long warntime; 667 static unsigned long warntime;
832 668
@@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb)
837 ntohs(skb->h.th->dest)); 673 ntohs(skb->h.th->dest));
838 } 674 }
839} 675}
676#endif
840 677
841/* 678/*
842 * Save and compile IPv4 options into the request_sock if needed. 679 * Save and compile IPv4 options into the request_sock if needed.
843 */ 680 */
844static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 681static struct ip_options *tcp_v4_save_options(struct sock *sk,
845 struct sk_buff *skb) 682 struct sk_buff *skb)
846{ 683{
847 struct ip_options *opt = &(IPCB(skb)->opt); 684 struct ip_options *opt = &(IPCB(skb)->opt);
848 struct ip_options *dopt = NULL; 685 struct ip_options *dopt = NULL;
@@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = {
869 .send_reset = tcp_v4_send_reset, 706 .send_reset = tcp_v4_send_reset,
870}; 707};
871 708
709static struct timewait_sock_ops tcp_timewait_sock_ops = {
710 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
711 .twsk_unique = tcp_twsk_unique,
712};
713
872int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 714int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
873{ 715{
874 struct inet_request_sock *ireq; 716 struct inet_request_sock *ireq;
@@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1053 ireq->opt = NULL; 895 ireq->opt = NULL;
1054 newinet->mc_index = inet_iif(skb); 896 newinet->mc_index = inet_iif(skb);
1055 newinet->mc_ttl = skb->nh.iph->ttl; 897 newinet->mc_ttl = skb->nh.iph->ttl;
1056 newtp->ext_header_len = 0; 898 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1057 if (newinet->opt) 899 if (newinet->opt)
1058 newtp->ext_header_len = newinet->opt->optlen; 900 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1059 newinet->id = newtp->write_seq ^ jiffies; 901 newinet->id = newtp->write_seq ^ jiffies;
1060 902
1061 tcp_sync_mss(newsk, dst_mtu(dst)); 903 tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1314,16 +1156,6 @@ do_time_wait:
1314 goto discard_it; 1156 goto discard_it;
1315} 1157}
1316 1158
1317static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1318{
1319 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1320 struct inet_sock *inet = inet_sk(sk);
1321
1322 sin->sin_family = AF_INET;
1323 sin->sin_addr.s_addr = inet->daddr;
1324 sin->sin_port = inet->dport;
1325}
1326
1327/* VJ's idea. Save last timestamp seen from this destination 1159/* VJ's idea. Save last timestamp seen from this destination
1328 * and hold it at least for normal timewait interval to use for duplicate 1160 * and hold it at least for normal timewait interval to use for duplicate
1329 * segment detection in subsequent connections, before they enter synchronized 1161 * segment detection in subsequent connections, before they enter synchronized
@@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1382 return 0; 1214 return 0;
1383} 1215}
1384 1216
1385struct tcp_func ipv4_specific = { 1217struct inet_connection_sock_af_ops ipv4_specific = {
1386 .queue_xmit = ip_queue_xmit, 1218 .queue_xmit = ip_queue_xmit,
1387 .send_check = tcp_v4_send_check, 1219 .send_check = tcp_v4_send_check,
1388 .rebuild_header = inet_sk_rebuild_header, 1220 .rebuild_header = inet_sk_rebuild_header,
@@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = {
1392 .net_header_len = sizeof(struct iphdr), 1224 .net_header_len = sizeof(struct iphdr),
1393 .setsockopt = ip_setsockopt, 1225 .setsockopt = ip_setsockopt,
1394 .getsockopt = ip_getsockopt, 1226 .getsockopt = ip_getsockopt,
1395 .addr2sockaddr = v4_addr2sockaddr, 1227 .addr2sockaddr = inet_csk_addr2sockaddr,
1396 .sockaddr_len = sizeof(struct sockaddr_in), 1228 .sockaddr_len = sizeof(struct sockaddr_in),
1397}; 1229};
1398 1230
@@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk)
1433 sk->sk_write_space = sk_stream_write_space; 1265 sk->sk_write_space = sk_stream_write_space;
1434 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 1266 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1435 1267
1436 tp->af_specific = &ipv4_specific; 1268 icsk->icsk_af_ops = &ipv4_specific;
1269 icsk->icsk_sync_mss = tcp_sync_mss;
1437 1270
1438 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1271 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1439 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1272 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1989,7 +1822,7 @@ struct proto tcp_prot = {
1989 .sysctl_rmem = sysctl_tcp_rmem, 1822 .sysctl_rmem = sysctl_tcp_rmem,
1990 .max_header = MAX_TCP_HEADER, 1823 .max_header = MAX_TCP_HEADER,
1991 .obj_size = sizeof(struct tcp_sock), 1824 .obj_size = sizeof(struct tcp_sock),
1992 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1825 .twsk_prot = &tcp_timewait_sock_ops,
1993 .rsk_prot = &tcp_request_sock_ops, 1826 .rsk_prot = &tcp_request_sock_ops,
1994}; 1827};
1995 1828
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac4321..2b9b7f6c7f7c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
274void tcp_time_wait(struct sock *sk, int state, int timeo) 274void tcp_time_wait(struct sock *sk, int state, int timeo)
275{ 275{
276 struct inet_timewait_sock *tw = NULL; 276 struct inet_timewait_sock *tw = NULL;
277 const struct inet_connection_sock *icsk = inet_csk(sk);
277 const struct tcp_sock *tp = tcp_sk(sk); 278 const struct tcp_sock *tp = tcp_sk(sk);
278 int recycle_ok = 0; 279 int recycle_ok = 0;
279 280
280 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 281 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
281 recycle_ok = tp->af_specific->remember_stamp(sk); 282 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
282 283
283 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 284 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
284 tw = inet_twsk_alloc(sk, state); 285 tw = inet_twsk_alloc(sk, state);
285 286
286 if (tw != NULL) { 287 if (tw != NULL) {
287 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 288 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
288 const struct inet_connection_sock *icsk = inet_csk(sk);
289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
290 290
291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
299 if (tw->tw_family == PF_INET6) { 299 if (tw->tw_family == PF_INET6) {
300 struct ipv6_pinfo *np = inet6_sk(sk); 300 struct ipv6_pinfo *np = inet6_sk(sk);
301 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); 301 struct inet6_timewait_sock *tw6;
302 302
303 ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); 303 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
304 ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); 304 tw6 = inet6_twsk((struct sock *)tw);
305 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
306 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
305 tw->tw_ipv6only = np->ipv6only; 307 tw->tw_ipv6only = np->ipv6only;
306 } 308 }
307#endif 309#endif
@@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
456 struct request_sock **prev) 458 struct request_sock **prev)
457{ 459{
458 struct tcphdr *th = skb->h.th; 460 struct tcphdr *th = skb->h.th;
459 struct tcp_sock *tp = tcp_sk(sk);
460 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 461 u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
461 int paws_reject = 0; 462 int paws_reject = 0;
462 struct tcp_options_received tmp_opt; 463 struct tcp_options_received tmp_opt;
@@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
613 * ESTABLISHED STATE. If it will be dropped after 614 * ESTABLISHED STATE. If it will be dropped after
614 * socket is created, wait for troubles. 615 * socket is created, wait for troubles.
615 */ 616 */
616 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); 617 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
618 req, NULL);
617 if (child == NULL) 619 if (child == NULL)
618 goto listen_overflow; 620 goto listen_overflow;
619 621
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406a..a7623ead39a8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 3; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
56{ 56{
57 sk->sk_send_head = skb->next; 57 sk->sk_send_head = skb->next;
58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) 58 if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
124 tp->snd_cwnd_used = 0; 124 tp->snd_cwnd_used = 0;
125} 125}
126 126
127static inline void tcp_event_data_sent(struct tcp_sock *tp, 127static void tcp_event_data_sent(struct tcp_sock *tp,
128 struct sk_buff *skb, struct sock *sk) 128 struct sk_buff *skb, struct sock *sk)
129{ 129{
130 struct inet_connection_sock *icsk = inet_csk(sk); 130 struct inet_connection_sock *icsk = inet_csk(sk);
131 const u32 now = tcp_time_stamp; 131 const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
142 icsk->icsk_ack.pingpong = 1; 142 icsk->icsk_ack.pingpong = 1;
143} 143}
144 144
145static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 145static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
146{ 146{
147 tcp_dec_quickack_mode(sk, pkts); 147 tcp_dec_quickack_mode(sk, pkts);
148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
212 * value can be stuffed directly into th->window for an outgoing 212 * value can be stuffed directly into th->window for an outgoing
213 * frame. 213 * frame.
214 */ 214 */
215static __inline__ u16 tcp_select_window(struct sock *sk) 215static u16 tcp_select_window(struct sock *sk)
216{ 216{
217 struct tcp_sock *tp = tcp_sk(sk); 217 struct tcp_sock *tp = tcp_sk(sk);
218 u32 cur_win = tcp_receive_window(tp); 218 u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
250 return new_win; 250 return new_win;
251} 251}
252 252
253static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
254 __u32 tstamp)
255{
256 if (tp->rx_opt.tstamp_ok) {
257 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
258 (TCPOPT_NOP << 16) |
259 (TCPOPT_TIMESTAMP << 8) |
260 TCPOLEN_TIMESTAMP);
261 *ptr++ = htonl(tstamp);
262 *ptr++ = htonl(tp->rx_opt.ts_recent);
263 }
264 if (tp->rx_opt.eff_sacks) {
265 struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
266 int this_sack;
267
268 *ptr++ = htonl((TCPOPT_NOP << 24) |
269 (TCPOPT_NOP << 16) |
270 (TCPOPT_SACK << 8) |
271 (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
272 TCPOLEN_SACK_PERBLOCK)));
273 for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
274 *ptr++ = htonl(sp[this_sack].start_seq);
275 *ptr++ = htonl(sp[this_sack].end_seq);
276 }
277 if (tp->rx_opt.dsack) {
278 tp->rx_opt.dsack = 0;
279 tp->rx_opt.eff_sacks--;
280 }
281 }
282}
283
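Each *ptr++ above writes one network-order 32-bit word. With the standard option kinds (NOP = 1, TIMESTAMP = 8, length 10), the leading timestamp word is the familiar 0x0101080a pattern seen at the start of TCP timestamp options; a quick check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint8_t NOP = 1, TIMESTAMP = 8, LEN_TIMESTAMP = 10;
	uint32_t word = ((uint32_t)NOP << 24) | ((uint32_t)NOP << 16) |
			((uint32_t)TIMESTAMP << 8) | LEN_TIMESTAMP;

	assert(word == 0x0101080a);	/* NOP, NOP, kind 8, len 10 */
	return 0;
}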
284/* Construct a tcp options header for a SYN or SYN_ACK packet.
 285 * If this is ever changed, make sure to change the definition of
286 * MAX_SYN_SIZE to match the new maximum number of options that you
287 * can generate.
288 */
289static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
290 int offer_wscale, int wscale, __u32 tstamp,
291 __u32 ts_recent)
292{
293 /* We always get an MSS option.
 294 * If timestamps are used, the option bytes that will be seen
 295 * in normal data packets must be included in the advertised
 296 * MSS. But we subtract them from tp->mss_cache so
 297 * that calculations in tcp_sendmsg are simpler etc.
298 * So account for this fact here if necessary. If we
299 * don't do this correctly, as a receiver we won't
300 * recognize data packets as being full sized when we
301 * should, and thus we won't abide by the delayed ACK
302 * rules correctly.
303 * SACKs don't matter, we never delay an ACK when we
304 * have any of those going out.
305 */
306 *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
307 if (ts) {
308 if(sack)
309 *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
310 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
311 else
312 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
313 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
314 *ptr++ = htonl(tstamp); /* TSVAL */
315 *ptr++ = htonl(ts_recent); /* TSECR */
316 } else if(sack)
317 *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
318 (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
319 if (offer_wscale)
320 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
321}
253 322
254/* This routine actually transmits TCP packets queued in by 323/* This routine actually transmits TCP packets queued in by
255 * tcp_do_sendmsg(). This is used by both the initial 324 * tcp_do_sendmsg(). This is used by both the initial
@@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
371 TCP_ECN_send(sk, tp, skb, tcp_header_size); 440 TCP_ECN_send(sk, tp, skb, tcp_header_size);
372 } 441 }
373 442
374 tp->af_specific->send_check(sk, th, skb->len, skb); 443 icsk->icsk_af_ops->send_check(sk, skb->len, skb);
375 444
376 if (likely(tcb->flags & TCPCB_FLAG_ACK)) 445 if (likely(tcb->flags & TCPCB_FLAG_ACK))
377 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 446 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
381 450
382 TCP_INC_STATS(TCP_MIB_OUTSEGS); 451 TCP_INC_STATS(TCP_MIB_OUTSEGS);
383 452
384 err = tp->af_specific->queue_xmit(skb, 0); 453 err = icsk->icsk_af_ops->queue_xmit(skb, 0);
385 if (unlikely(err <= 0)) 454 if (unlikely(err <= 0))
386 return err; 455 return err;
387 456
@@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
621 It is minimum of user_mss and mss received with SYN. 690 It is minimum of user_mss and mss received with SYN.
622 It also does not include TCP options. 691 It also does not include TCP options.
623 692
624 tp->pmtu_cookie is last pmtu, seen by this function. 693 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
625 694
626 tp->mss_cache is current effective sending mss, including 695 tp->mss_cache is current effective sending mss, including
627 all tcp options except for SACKs. It is evaluated, 696 all tcp options except for SACKs. It is evaluated,
@@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
631 NOTE1. rfc1122 clearly states that advertised MSS 700 NOTE1. rfc1122 clearly states that advertised MSS
632 DOES NOT include either tcp or ip options. 701 DOES NOT include either tcp or ip options.
633 702
634 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside 703 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
635 this function. --ANK (980731) 704 are READ ONLY outside this function. --ANK (980731)
636 */ 705 */
637 706
638unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) 707unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
639{ 708{
640 struct tcp_sock *tp = tcp_sk(sk); 709 struct tcp_sock *tp = tcp_sk(sk);
641 int mss_now; 710 struct inet_connection_sock *icsk = inet_csk(sk);
642
643 /* Calculate base mss without TCP options: 711 /* Calculate base mss without TCP options:
644 It is MMS_S - sizeof(tcphdr) of rfc1122 712 It is MMS_S - sizeof(tcphdr) of rfc1122
645 */ 713 */
646 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); 714 int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715 sizeof(struct tcphdr));
647 716
648 /* Clamp it (mss_clamp does not include tcp options) */ 717 /* Clamp it (mss_clamp does not include tcp options) */
649 if (mss_now > tp->rx_opt.mss_clamp) 718 if (mss_now > tp->rx_opt.mss_clamp)
650 mss_now = tp->rx_opt.mss_clamp; 719 mss_now = tp->rx_opt.mss_clamp;
651 720
652 /* Now subtract optional transport overhead */ 721 /* Now subtract optional transport overhead */
653 mss_now -= tp->ext_header_len; 722 mss_now -= icsk->icsk_ext_hdr_len;
654 723
655 /* Then reserve room for full set of TCP options and 8 bytes of data */ 724 /* Then reserve room for full set of TCP options and 8 bytes of data */
656 if (mss_now < 48) 725 if (mss_now < 48)
@@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
664 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); 733 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
665 734
666 /* And store cached results */ 735 /* And store cached results */
667 tp->pmtu_cookie = pmtu; 736 icsk->icsk_pmtu_cookie = pmtu;
668 tp->mss_cache = mss_now; 737 tp->mss_cache = mss_now;
669 738
670 return mss_now; 739 return mss_now;
@@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
694 763
695 if (dst) { 764 if (dst) {
696 u32 mtu = dst_mtu(dst); 765 u32 mtu = dst_mtu(dst);
697 if (mtu != tp->pmtu_cookie) 766 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
698 mss_now = tcp_sync_mss(sk, mtu); 767 mss_now = tcp_sync_mss(sk, mtu);
699 } 768 }
700 769
@@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
705 xmit_size_goal = mss_now; 774 xmit_size_goal = mss_now;
706 775
707 if (doing_tso) { 776 if (doing_tso) {
708 xmit_size_goal = 65535 - 777 xmit_size_goal = (65535 -
709 tp->af_specific->net_header_len - 778 inet_csk(sk)->icsk_af_ops->net_header_len -
710 tp->ext_header_len - tp->tcp_header_len; 779 inet_csk(sk)->icsk_ext_hdr_len -
780 tp->tcp_header_len);
711 781
712 if (tp->max_window && 782 if (tp->max_window &&
713 (xmit_size_goal > (tp->max_window >> 1))) 783 (xmit_size_goal > (tp->max_window >> 1)))
@@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
723 793
724/* Congestion window validation. (RFC2861) */ 794/* Congestion window validation. (RFC2861) */
725 795
726static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) 796static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
727{ 797{
728 __u32 packets_out = tp->packets_out; 798 __u32 packets_out = tp->packets_out;
729 799
@@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
772/* This must be invoked the first time we consider transmitting 842/* This must be invoked the first time we consider transmitting
773 * SKB onto the wire. 843 * SKB onto the wire.
774 */ 844 */
775static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) 845static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
776{ 846{
777 int tso_segs = tcp_skb_pcount(skb); 847 int tso_segs = tcp_skb_pcount(skb);
778 848
@@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1422 (sysctl_tcp_retrans_collapse != 0)) 1492 (sysctl_tcp_retrans_collapse != 0))
1423 tcp_retrans_try_collapse(sk, skb, cur_mss); 1493 tcp_retrans_try_collapse(sk, skb, cur_mss);
1424 1494
1425 if(tp->af_specific->rebuild_header(sk)) 1495 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1426 return -EHOSTUNREACH; /* Routing failure or similar. */ 1496 return -EHOSTUNREACH; /* Routing failure or similar. */
1427 1497
1428 /* Some Solaris stacks overoptimize and ignore the FIN on a 1498 /* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1793/* 1863/*
1794 * Do all connect socket setups that can be done AF independent. 1864 * Do all connect socket setups that can be done AF independent.
1795 */ 1865 */
1796static inline void tcp_connect_init(struct sock *sk) 1866static void tcp_connect_init(struct sock *sk)
1797{ 1867{
1798 struct dst_entry *dst = __sk_dst_get(sk); 1868 struct dst_entry *dst = __sk_dst_get(sk);
1799 struct tcp_sock *tp = tcp_sk(sk); 1869 struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df16..3b7403495052 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
330 vegas->cntRTT = 0; 330 vegas->cntRTT = 0;
331 vegas->minRTT = 0x7fffffff; 331 vegas->minRTT = 0x7fffffff;
332 } 332 }
333 /* Use normal slow start */
334 else if (tp->snd_cwnd <= tp->snd_ssthresh)
335 tcp_slow_start(tp);
336
333} 337}
334 338
335/* Extract info for Tcp socket info provided via netlink. */ 339/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195d..223abaa72bc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
86#include <linux/module.h> 86#include <linux/module.h>
87#include <linux/socket.h> 87#include <linux/socket.h>
88#include <linux/sockios.h> 88#include <linux/sockios.h>
89#include <linux/igmp.h>
89#include <linux/in.h> 90#include <linux/in.h>
90#include <linux/errno.h> 91#include <linux/errno.h>
91#include <linux/timer.h> 92#include <linux/timer.h>
@@ -846,20 +847,7 @@ out:
846csum_copy_err: 847csum_copy_err:
847 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 848 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
848 849
849 /* Clear queue. */ 850 skb_kill_datagram(sk, skb, flags);
850 if (flags&MSG_PEEK) {
851 int clear = 0;
852 spin_lock_bh(&sk->sk_receive_queue.lock);
853 if (skb == skb_peek(&sk->sk_receive_queue)) {
854 __skb_unlink(skb, &sk->sk_receive_queue);
855 clear = 1;
856 }
857 spin_unlock_bh(&sk->sk_receive_queue.lock);
858 if (clear)
859 kfree_skb(skb);
860 }
861
862 skb_free_datagram(sk, skb);
863 851
864 if (noblock) 852 if (noblock)
865 return -EAGAIN; 853 return -EAGAIN;
@@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
 1094 * Otherwise, csum completion requires checksumming packet body, 1082
1095 * including udp header and folding it to skb->csum. 1083 * including udp header and folding it to skb->csum.
1096 */ 1084 */
1097static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, 1085static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1098 unsigned short ulen, u32 saddr, u32 daddr) 1086 unsigned short ulen, u32 saddr, u32 daddr)
1099{ 1087{
1100 if (uh->check == 0) { 1088 if (uh->check == 0) {
@@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1108 /* Probably, we should checksum udp header (it should be in cache 1096 /* Probably, we should checksum udp header (it should be in cache
1109 * in any case) and data in tiny packets (< rx copybreak). 1097 * in any case) and data in tiny packets (< rx copybreak).
1110 */ 1098 */
1111 return 0;
1112} 1099}
1113 1100
1114/* 1101/*
@@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb)
1141 if (pskb_trim_rcsum(skb, ulen)) 1128 if (pskb_trim_rcsum(skb, ulen))
1142 goto short_packet; 1129 goto short_packet;
1143 1130
1144 if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) 1131 udp_checksum_init(skb, uh, ulen, saddr, daddr);
1145 goto csum_error;
1146 1132
1147 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1133 if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1148 return udp_v4_mcast_deliver(skb, uh, saddr, daddr); 1134 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);