aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig17
-rw-r--r--net/ipv4/Makefile8
-rw-r--r--net/ipv4/af_inet.c179
-rw-r--r--net/ipv4/arp.c8
-rw-r--r--net/ipv4/datagram.c3
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/esp4.c12
-rw-r--r--net/ipv4/fib_frontend.c6
-rw-r--r--net/ipv4/fib_hash.c4
-rw-r--r--net/ipv4/fib_lookup.h1
-rw-r--r--net/ipv4/fib_semantics.c7
-rw-r--r--net/ipv4/fib_trie.c1628
-rw-r--r--net/ipv4/icmp.c14
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c641
-rw-r--r--net/ipv4/inet_diag.c868
-rw-r--r--net/ipv4/inet_hashtables.c165
-rw-r--r--net/ipv4/inet_timewait_sock.c384
-rw-r--r--net/ipv4/inetpeer.c5
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c10
-rw-r--r--net/ipv4/ip_input.c141
-rw-r--r--net/ipv4/ip_options.c52
-rw-r--r--net/ipv4/ip_output.c24
-rw-r--r--net/ipv4/ip_sockglue.c8
-rw-r--r--net/ipv4/ipcomp.c4
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c9
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c2
-rw-r--r--net/ipv4/multipath_drr.c2
-rw-r--r--net/ipv4/netfilter.c139
-rw-r--r--net/ipv4/netfilter/Kconfig70
-rw-r--r--net/ipv4/netfilter/Makefile9
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c18
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c379
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c21
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c1579
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c73
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c9
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c48
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c14
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c49
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c104
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c8
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c24
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c4
-rw-r--r--net/ipv4/netfilter/ip_queue.c51
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLASSIFY.c4
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c2
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c15
-rw-r--r--net/ipv4/netfilter/ipt_DSCP.c3
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c6
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c86
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c22
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c5
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c8
-rw-r--r--net/ipv4/netfilter/ipt_NFQUEUE.c70
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c3
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c3
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c119
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c53
-rw-r--r--net/ipv4/netfilter/ipt_connbytes.c162
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c7
-rw-r--r--net/ipv4/netfilter/ipt_dccp.c176
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c2
-rw-r--r--net/ipv4/netfilter/ipt_mark.c7
-rw-r--r--net/ipv4/netfilter/ipt_owner.c132
-rw-r--r--net/ipv4/netfilter/ipt_string.c91
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/route.c8
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c47
-rw-r--r--net/ipv4/tcp.c402
-rw-r--r--net/ipv4/tcp_bic.c46
-rw-r--r--net/ipv4/tcp_cong.c44
-rw-r--r--net/ipv4/tcp_diag.c784
-rw-r--r--net/ipv4/tcp_highspeed.c17
-rw-r--r--net/ipv4/tcp_htcp.c53
-rw-r--r--net/ipv4/tcp_hybla.c31
-rw-r--r--net/ipv4/tcp_input.c513
-rw-r--r--net/ipv4/tcp_ipv4.c944
-rw-r--r--net/ipv4/tcp_minisocks.c605
-rw-r--r--net/ipv4/tcp_output.c143
-rw-r--r--net/ipv4/tcp_scalable.c6
-rw-r--r--net/ipv4/tcp_timer.c253
-rw-r--r--net/ipv4/tcp_vegas.c50
-rw-r--r--net/ipv4/tcp_westwood.c64
-rw-r--r--net/ipv4/udp.c37
-rw-r--r--net/ipv4/xfrm4_state.c2
105 files changed, 7421 insertions, 4544 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d8069..e55136ae09f4 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
413 413
414 If unsure, say Y. 414 If unsure, say Y.
415 415
416config IP_TCPDIAG 416config INET_DIAG
417 tristate "IP: TCP socket monitoring interface" 417 tristate "INET: socket monitoring interface"
418 default y 418 default y
419 ---help--- 419 ---help---
420 Support for TCP socket monitoring interface used by native Linux 420 Support for INET (TCP, DCCP, etc) socket monitoring interface used by
421 tools such as ss. ss is included in iproute2, currently downloadable 421 native Linux tools such as ss. ss is included in iproute2, currently
422 at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support 422 downloadable at <http://developer.osdl.org/dev/iproute2>.
423 and have selected IPv6 as a module, you need to build this as a
424 module too.
425 423
426 If unsure, say Y. 424 If unsure, say Y.
427 425
428config IP_TCPDIAG_IPV6 426config INET_TCP_DIAG
429 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) 427 depends on INET_DIAG
428 def_tristate INET_DIAG
430 429
431config TCP_CONG_ADVANCED 430config TCP_CONG_ADVANCED
432 bool "TCP: advanced congestion control" 431 bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7b..f0435d00db6b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
4 4
5obj-y := route.o inetpeer.o protocol.o \ 5obj-y := route.o inetpeer.o protocol.o \
6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
9 tcp_minisocks.o tcp_cong.o \ 10 tcp_minisocks.o tcp_cong.o \
10 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 11 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
11 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o 12 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
12 13
13obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o 14obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
14obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o 15obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
29obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o 30obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
30obj-$(CONFIG_NETFILTER) += netfilter/ 31obj-$(CONFIG_NETFILTER) += netfilter/
31obj-$(CONFIG_IP_VS) += ipvs/ 32obj-$(CONFIG_IP_VS) += ipvs/
32obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 33obj-$(CONFIG_INET_DIAG) += inet_diag.o
33obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o 34obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
35obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
34obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 36obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
35obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o 37obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
36obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o 38obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5f..bf147f8db399 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
99#include <net/arp.h> 99#include <net/arp.h>
100#include <net/route.h> 100#include <net/route.h>
101#include <net/ip_fib.h> 101#include <net/ip_fib.h>
102#include <net/inet_connection_sock.h>
102#include <net/tcp.h> 103#include <net/tcp.h>
103#include <net/udp.h> 104#include <net/udp.h>
104#include <linux/skbuff.h> 105#include <linux/skbuff.h>
@@ -112,11 +113,7 @@
112#include <linux/mroute.h> 113#include <linux/mroute.h>
113#endif 114#endif
114 115
115DEFINE_SNMP_STAT(struct linux_mib, net_statistics); 116DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
116
117#ifdef INET_REFCNT_DEBUG
118atomic_t inet_sock_nr;
119#endif
120 117
121extern void ip_mc_drop_socket(struct sock *sk); 118extern void ip_mc_drop_socket(struct sock *sk);
122 119
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
153 if (inet->opt) 150 if (inet->opt)
154 kfree(inet->opt); 151 kfree(inet->opt);
155 dst_release(sk->sk_dst_cache); 152 dst_release(sk->sk_dst_cache);
156#ifdef INET_REFCNT_DEBUG 153 sk_refcnt_debug_dec(sk);
157 atomic_dec(&inet_sock_nr);
158 printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
159 sk, atomic_read(&inet_sock_nr));
160#endif
161} 154}
162 155
163/* 156/*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
210 * we can only allow the backlog to be adjusted. 203 * we can only allow the backlog to be adjusted.
211 */ 204 */
212 if (old_state != TCP_LISTEN) { 205 if (old_state != TCP_LISTEN) {
213 err = tcp_listen_start(sk); 206 err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
214 if (err) 207 if (err)
215 goto out; 208 goto out;
216 } 209 }
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
235 struct proto *answer_prot; 228 struct proto *answer_prot;
236 unsigned char answer_flags; 229 unsigned char answer_flags;
237 char answer_no_check; 230 char answer_no_check;
238 int err; 231 int try_loading_module = 0;
232 int err = -ESOCKTNOSUPPORT;
239 233
240 sock->state = SS_UNCONNECTED; 234 sock->state = SS_UNCONNECTED;
241 235
242 /* Look for the requested type/protocol pair. */ 236 /* Look for the requested type/protocol pair. */
243 answer = NULL; 237 answer = NULL;
238lookup_protocol:
244 rcu_read_lock(); 239 rcu_read_lock();
245 list_for_each_rcu(p, &inetsw[sock->type]) { 240 list_for_each_rcu(p, &inetsw[sock->type]) {
246 answer = list_entry(p, struct inet_protosw, list); 241 answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
261 answer = NULL; 256 answer = NULL;
262 } 257 }
263 258
264 err = -ESOCKTNOSUPPORT; 259 if (unlikely(answer == NULL)) {
265 if (!answer) 260 if (try_loading_module < 2) {
266 goto out_rcu_unlock; 261 rcu_read_unlock();
262 /*
263 * Be more specific, e.g. net-pf-2-proto-132-type-1
264 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
265 */
266 if (++try_loading_module == 1)
267 request_module("net-pf-%d-proto-%d-type-%d",
268 PF_INET, protocol, sock->type);
269 /*
270 * Fall back to generic, e.g. net-pf-2-proto-132
271 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
272 */
273 else
274 request_module("net-pf-%d-proto-%d",
275 PF_INET, protocol);
276 goto lookup_protocol;
277 } else
278 goto out_rcu_unlock;
279 }
280
267 err = -EPERM; 281 err = -EPERM;
268 if (answer->capability > 0 && !capable(answer->capability)) 282 if (answer->capability > 0 && !capable(answer->capability))
269 goto out_rcu_unlock; 283 goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
317 inet->mc_index = 0; 331 inet->mc_index = 0;
318 inet->mc_list = NULL; 332 inet->mc_list = NULL;
319 333
320#ifdef INET_REFCNT_DEBUG 334 sk_refcnt_debug_inc(sk);
321 atomic_inc(&inet_sock_nr);
322#endif
323 335
324 if (inet->num) { 336 if (inet->num) {
325 /* It assumes that any protocol which allows 337 /* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
847 .owner = THIS_MODULE, 859 .owner = THIS_MODULE,
848}; 860};
849 861
850
851extern void tcp_init(void);
852extern void tcp_v4_init(struct net_proto_family *);
853
854/* Upon startup we insert all the elements in inetsw_array[] into 862/* Upon startup we insert all the elements in inetsw_array[] into
855 * the linked list inetsw. 863 * the linked list inetsw.
856 */ 864 */
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
961 } 969 }
962} 970}
963 971
972/*
973 * Shall we try to damage output packets if routing dev changes?
974 */
975
976int sysctl_ip_dynaddr;
977
978static int inet_sk_reselect_saddr(struct sock *sk)
979{
980 struct inet_sock *inet = inet_sk(sk);
981 int err;
982 struct rtable *rt;
983 __u32 old_saddr = inet->saddr;
984 __u32 new_saddr;
985 __u32 daddr = inet->daddr;
986
987 if (inet->opt && inet->opt->srr)
988 daddr = inet->opt->faddr;
989
990 /* Query new route. */
991 err = ip_route_connect(&rt, daddr, 0,
992 RT_CONN_FLAGS(sk),
993 sk->sk_bound_dev_if,
994 sk->sk_protocol,
995 inet->sport, inet->dport, sk);
996 if (err)
997 return err;
998
999 sk_setup_caps(sk, &rt->u.dst);
1000
1001 new_saddr = rt->rt_src;
1002
1003 if (new_saddr == old_saddr)
1004 return 0;
1005
1006 if (sysctl_ip_dynaddr > 1) {
1007 printk(KERN_INFO "%s(): shifting inet->"
1008 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1009 __FUNCTION__,
1010 NIPQUAD(old_saddr),
1011 NIPQUAD(new_saddr));
1012 }
1013
1014 inet->saddr = inet->rcv_saddr = new_saddr;
1015
1016 /*
1017 * XXX The only one ugly spot where we need to
1018 * XXX really change the sockets identity after
1019 * XXX it has entered the hashes. -DaveM
1020 *
1021 * Besides that, it does not check for connection
1022 * uniqueness. Wait for troubles.
1023 */
1024 __sk_prot_rehash(sk);
1025 return 0;
1026}
1027
1028int inet_sk_rebuild_header(struct sock *sk)
1029{
1030 struct inet_sock *inet = inet_sk(sk);
1031 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1032 u32 daddr;
1033 int err;
1034
1035 /* Route is OK, nothing to do. */
1036 if (rt)
1037 return 0;
1038
1039 /* Reroute. */
1040 daddr = inet->daddr;
1041 if (inet->opt && inet->opt->srr)
1042 daddr = inet->opt->faddr;
1043{
1044 struct flowi fl = {
1045 .oif = sk->sk_bound_dev_if,
1046 .nl_u = {
1047 .ip4_u = {
1048 .daddr = daddr,
1049 .saddr = inet->saddr,
1050 .tos = RT_CONN_FLAGS(sk),
1051 },
1052 },
1053 .proto = sk->sk_protocol,
1054 .uli_u = {
1055 .ports = {
1056 .sport = inet->sport,
1057 .dport = inet->dport,
1058 },
1059 },
1060 };
1061
1062 err = ip_route_output_flow(&rt, &fl, sk, 0);
1063}
1064 if (!err)
1065 sk_setup_caps(sk, &rt->u.dst);
1066 else {
1067 /* Routing failed... */
1068 sk->sk_route_caps = 0;
1069 /*
1070 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
1071 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
1072 */
1073 if (!sysctl_ip_dynaddr ||
1074 sk->sk_state != TCP_SYN_SENT ||
1075 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1076 (err = inet_sk_reselect_saddr(sk)) != 0)
1077 sk->sk_err_soft = -err;
1078 }
1079
1080 return err;
1081}
1082
1083EXPORT_SYMBOL(inet_sk_rebuild_header);
1084
964#ifdef CONFIG_IP_MULTICAST 1085#ifdef CONFIG_IP_MULTICAST
965static struct net_protocol igmp_protocol = { 1086static struct net_protocol igmp_protocol = {
966 .handler = igmp_rcv, 1087 .handler = igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
1007} 1128}
1008 1129
1009static int ipv4_proc_init(void); 1130static int ipv4_proc_init(void);
1010extern void ipfrag_init(void);
1011 1131
1012/* 1132/*
1013 * IP protocol layer initialiser 1133 * IP protocol layer initialiser
@@ -1128,19 +1248,10 @@ module_init(inet_init);
1128/* ------------------------------------------------------------------------ */ 1248/* ------------------------------------------------------------------------ */
1129 1249
1130#ifdef CONFIG_PROC_FS 1250#ifdef CONFIG_PROC_FS
1131extern int fib_proc_init(void);
1132extern void fib_proc_exit(void);
1133#ifdef CONFIG_IP_FIB_TRIE 1251#ifdef CONFIG_IP_FIB_TRIE
1134extern int fib_stat_proc_init(void); 1252extern int fib_stat_proc_init(void);
1135extern void fib_stat_proc_exit(void); 1253extern void fib_stat_proc_exit(void);
1136#endif 1254#endif
1137extern int ip_misc_proc_init(void);
1138extern int raw_proc_init(void);
1139extern void raw_proc_exit(void);
1140extern int tcp4_proc_init(void);
1141extern void tcp4_proc_exit(void);
1142extern int udp4_proc_init(void);
1143extern void udp4_proc_exit(void);
1144 1255
1145static int __init ipv4_proc_init(void) 1256static int __init ipv4_proc_init(void)
1146{ 1257{
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops);
1205EXPORT_SYMBOL(inet_unregister_protosw); 1316EXPORT_SYMBOL(inet_unregister_protosw);
1206EXPORT_SYMBOL(net_statistics); 1317EXPORT_SYMBOL(net_statistics);
1207EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); 1318EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
1208
1209#ifdef INET_REFCNT_DEBUG
1210EXPORT_SYMBOL(inet_sock_nr);
1211#endif
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd612853..8bf312bdea13 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
700static void parp_redo(struct sk_buff *skb) 700static void parp_redo(struct sk_buff *skb)
701{ 701{
702 nf_reset(skb); 702 nf_reset(skb);
703 arp_rcv(skb, skb->dev, NULL); 703 arp_rcv(skb, skb->dev, NULL, skb->dev);
704} 704}
705 705
706/* 706/*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
865 if (n) 865 if (n)
866 neigh_release(n); 866 neigh_release(n);
867 867
868 if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || 868 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
869 skb->pkt_type == PACKET_HOST || 869 skb->pkt_type == PACKET_HOST ||
870 in_dev->arp_parms->proxy_delay == 0) { 870 in_dev->arp_parms->proxy_delay == 0) {
871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 871 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
927 * Receive an arp request from the device layer. 927 * Receive an arp request from the device layer.
928 */ 928 */
929 929
930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 930int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
931{ 931{
932 struct arphdr *arp; 932 struct arphdr *arp;
933 933
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 948 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
949 goto out_of_mem; 949 goto out_of_mem;
950 950
951 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
952
951 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); 953 return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
952 954
953freeskb: 955freeskb:
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f2542..c1b42b5257f8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ip.h> 17#include <linux/ip.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <net/ip.h>
19#include <net/sock.h> 20#include <net/sock.h>
20#include <net/tcp.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/tcp_states.h>
22 23
23int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 24int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{ 25{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77d..ba2895ae8151 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1111 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); 1111 struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
1112 1112
1113 if (!skb) 1113 if (!skb)
1114 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); 1114 netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { 1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
1116 kfree_skb(skb); 1116 kfree_skb(skb);
1117 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); 1117 netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
1118 } else { 1118 } else {
1119 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; 1119 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
1120 netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
1121 } 1120 }
1122} 1121}
1123 1122
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1f..b31ffc5053d2 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 331 x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
332 if (!x) 332 if (!x)
333 return; 333 return;
334 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 334 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
335 ntohl(esph->spi), ntohl(iph->daddr))); 335 ntohl(esph->spi), ntohl(iph->daddr));
336 xfrm_state_put(x); 336 xfrm_state_put(x);
337} 337}
338 338
@@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x)
395 395
396 if (aalg_desc->uinfo.auth.icv_fullbits/8 != 396 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
397 crypto_tfm_alg_digestsize(esp->auth.tfm)) { 397 crypto_tfm_alg_digestsize(esp->auth.tfm)) {
398 NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", 398 NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
399 x->aalg->alg_name, 399 x->aalg->alg_name,
400 crypto_tfm_alg_digestsize(esp->auth.tfm), 400 crypto_tfm_alg_digestsize(esp->auth.tfm),
401 aalg_desc->uinfo.auth.icv_fullbits/8)); 401 aalg_desc->uinfo.auth.icv_fullbits/8);
402 goto error; 402 goto error;
403 } 403 }
404 404
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab9580..4e1379f71269 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
558 nl_fib_lookup(frn, tb); 558 nl_fib_lookup(frn, tb);
559 559
560 pid = nlh->nlmsg_pid; /*pid of sending process */ 560 pid = nlh->nlmsg_pid; /*pid of sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */ 561 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid; 562 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */ 563 NETLINK_CB(skb).dst_group = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT); 564 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566} 565}
567 566
568static void nl_fib_lookup_init(void) 567static void nl_fib_lookup_init(void)
569{ 568{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); 569 netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
571} 570}
572 571
573static void fib_disable_ip(struct net_device *dev, int force) 572static void fib_disable_ip(struct net_device *dev, int force)
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
662} 661}
663 662
664EXPORT_SYMBOL(inet_addr_type); 663EXPORT_SYMBOL(inet_addr_type);
665EXPORT_SYMBOL(ip_dev_find);
666EXPORT_SYMBOL(ip_rt_ioctl); 664EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3d..2a8c9afc3695 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
45 45
46#include "fib_lookup.h" 46#include "fib_lookup.h"
47 47
48static kmem_cache_t *fn_hash_kmem; 48static kmem_cache_t *fn_hash_kmem __read_mostly;
49static kmem_cache_t *fn_alias_kmem; 49static kmem_cache_t *fn_alias_kmem __read_mostly;
50 50
51struct fib_node { 51struct fib_node {
52 struct hlist_node fn_hash; 52 struct hlist_node fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa93..ef6609ea0eb7 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
7 7
8struct fib_alias { 8struct fib_alias {
9 struct list_head fa_list; 9 struct list_head fa_list;
10 struct rcu_head rcu;
10 struct fib_info *fa_info; 11 struct fib_info *fa_info;
11 u8 fa_tos; 12 u8 fa_tos;
12 u8 fa_type; 13 u8 fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e278cb9d0075..d41219e8037c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
290 kfree_skb(skb); 290 kfree_skb(skb);
291 return; 291 return;
292 } 292 }
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; 293 NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO) 294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users); 295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); 296 netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO) 297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); 298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
299} 299}
@@ -854,6 +854,7 @@ failure:
854 return NULL; 854 return NULL;
855} 855}
856 856
857/* Note! fib_semantic_match intentionally uses RCU list functions. */
857int fib_semantic_match(struct list_head *head, const struct flowi *flp, 858int fib_semantic_match(struct list_head *head, const struct flowi *flp,
858 struct fib_result *res, __u32 zone, __u32 mask, 859 struct fib_result *res, __u32 zone, __u32 mask,
859 int prefixlen) 860 int prefixlen)
@@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
861 struct fib_alias *fa; 862 struct fib_alias *fa;
862 int nh_sel = 0; 863 int nh_sel = 0;
863 864
864 list_for_each_entry(fa, head, fa_list) { 865 list_for_each_entry_rcu(fa, head, fa_list) {
865 int err; 866 int err;
866 867
867 if (fa->fa_tos && 868 if (fa->fa_tos &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a701405fab0b..b2dea4e5da77 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.325" 46#define VERSION "0.402"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
62#include <linux/netdevice.h> 62#include <linux/netdevice.h>
63#include <linux/if_arp.h> 63#include <linux/if_arp.h>
64#include <linux/proc_fs.h> 64#include <linux/proc_fs.h>
65#include <linux/rcupdate.h>
65#include <linux/skbuff.h> 66#include <linux/skbuff.h>
66#include <linux/netlink.h> 67#include <linux/netlink.h>
67#include <linux/init.h> 68#include <linux/init.h>
@@ -77,56 +78,55 @@
77#undef CONFIG_IP_FIB_TRIE_STATS 78#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384 79#define MAX_CHILDS 16384
79 80
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key)) 81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) 82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) 83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84 84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key; 85typedef unsigned int t_key;
88 86
89#define T_TNODE 0 87#define T_TNODE 0
90#define T_LEAF 1 88#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL 89#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \ 90#define NODE_PARENT(node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) 91 ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
94#define NODE_SET_PARENT(_node, _ptr) \ 92
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \ 93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
96 ((_node)->_parent & NODE_TYPE_MASK))) 94
97#define NODE_INIT_PARENT(_node, _type) \ 95#define NODE_SET_PARENT(node, ptr) \
98 ((_node)->_parent = (_type)) 96 rcu_assign_pointer((node)->parent, \
99#define NODE_TYPE(_node) \ 97 ((unsigned long)(ptr)) | NODE_TYPE(node))
100 ((_node)->_parent & NODE_TYPE_MASK) 98
101 99#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_TNODE(n) (!(n->_parent & T_LEAF)) 100#define IS_LEAF(n) (n->parent & T_LEAF)
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104 101
105struct node { 102struct node {
106 t_key key; 103 t_key key;
107 unsigned long _parent; 104 unsigned long parent;
108}; 105};
109 106
110struct leaf { 107struct leaf {
111 t_key key; 108 t_key key;
112 unsigned long _parent; 109 unsigned long parent;
113 struct hlist_head list; 110 struct hlist_head list;
111 struct rcu_head rcu;
114}; 112};
115 113
116struct leaf_info { 114struct leaf_info {
117 struct hlist_node hlist; 115 struct hlist_node hlist;
116 struct rcu_head rcu;
118 int plen; 117 int plen;
119 struct list_head falh; 118 struct list_head falh;
120}; 119};
121 120
122struct tnode { 121struct tnode {
123 t_key key; 122 t_key key;
124 unsigned long _parent; 123 unsigned long parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ 124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ 125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */ 126 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */ 127 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0]; 128 struct rcu_head rcu;
129 struct node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,45 @@ struct trie_stat {
150}; 150};
151 151
152struct trie { 152struct trie {
153 struct node *trie; 153 struct node *trie;
154#ifdef CONFIG_IP_FIB_TRIE_STATS 154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats; 155 struct trie_use_stats stats;
156#endif 156#endif
157 int size; 157 int size;
158 unsigned int revision; 158 unsigned int revision;
159}; 159};
160 160
161static int trie_debug = 0;
162
163static int tnode_full(struct tnode *tn, struct node *n);
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 161static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166static int tnode_child_length(struct tnode *tn);
167static struct node *resize(struct trie *t, struct tnode *tn); 163static struct node *resize(struct trie *t, struct tnode *tn);
168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
170static void tnode_free(struct tnode *tn); 166static void tnode_free(struct tnode *tn);
171static void trie_dump_seq(struct seq_file *seq, struct trie *t); 167static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173extern int fib_detect_death(struct fib_info *fi, int order,
174 struct fib_info **last_resort, int *last_idx, int *dflt);
175
176extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177 struct nlmsghdr *n, struct netlink_skb_parms *req);
178 168
179static kmem_cache_t *fn_alias_kmem; 169static kmem_cache_t *fn_alias_kmem __read_mostly;
180static struct trie *trie_local = NULL, *trie_main = NULL; 170static struct trie *trie_local = NULL, *trie_main = NULL;
181 171
182static void trie_bug(char *err) 172
183{ 173/* rcu_read_lock needs to be hold by caller from readside */
184 printk("Trie Bug: %s\n", err);
185 BUG();
186}
187 174
188static inline struct node *tnode_get_child(struct tnode *tn, int i) 175static inline struct node *tnode_get_child(struct tnode *tn, int i)
189{ 176{
190 if (i >= 1<<tn->bits) 177 BUG_ON(i >= 1 << tn->bits);
191 trie_bug("tnode_get_child");
192 178
193 return tn->child[i]; 179 return rcu_dereference(tn->child[i]);
194} 180}
195 181
196static inline int tnode_child_length(struct tnode *tn) 182static inline int tnode_child_length(const struct tnode *tn)
197{ 183{
198 return 1<<tn->bits; 184 return 1 << tn->bits;
199} 185}
200 186
201/*
202 _________________________________________________________________
203 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204 ----------------------------------------------------------------
205 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
206
207 _________________________________________________________________
208 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209 -----------------------------------------------------------------
210 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
211
212 tp->pos = 7
213 tp->bits = 3
214 n->pos = 15
215 n->bits=4
216 KEYLENGTH=32
217*/
218
219static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 187static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220{ 188{
221 if (offset < KEYLENGTH) 189 if (offset < KEYLENGTH)
222 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 190 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223 else 191 else
224 return 0; 192 return 0;
225} 193}
226 194
@@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233{ 201{
234 if (bits == 0 || offset >= KEYLENGTH) 202 if (bits == 0 || offset >= KEYLENGTH)
235 return 1; 203 return 1;
236 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 204 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 205 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238} 206}
239 207
240static inline int tkey_mismatch(t_key a, int offset, t_key b) 208static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
249 return i; 217 return i;
250} 218}
251 219
252/* Candiate for fib_semantics */
253
254static void fn_free_alias(struct fib_alias *fa)
255{
256 fib_release_info(fa->fa_info);
257 kmem_cache_free(fn_alias_kmem, fa);
258}
259
260/* 220/*
261 To understand this stuff, an understanding of keys and all their bits is 221 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not 222 necessary. Every node in the trie has a key associated with it, but not
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa)
295 tp->pos = 7 255 tp->pos = 7
296 tp->bits = 3 256 tp->bits = 3
297 n->pos = 15 257 n->pos = 15
298 n->bits=4 258 n->bits = 4
299 259
300 First, let's just ignore the bits that come before the parent tp, that is 260 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do 261 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
@@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa)
320 280
321*/ 281*/
322 282
323static void check_tnode(struct tnode *tn) 283static inline void check_tnode(const struct tnode *tn)
324{ 284{
325 if (tn && tn->pos+tn->bits > 32) { 285 WARN_ON(tn && tn->pos+tn->bits > 32);
326 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
327 }
328} 286}
329 287
330static int halve_threshold = 25; 288static int halve_threshold = 25;
331static int inflate_threshold = 50; 289static int inflate_threshold = 50;
332 290
333static struct leaf *leaf_new(void) 291
292static void __alias_free_mem(struct rcu_head *head)
334{ 293{
335 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); 294 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
336 if (l) { 295 kmem_cache_free(fn_alias_kmem, fa);
337 NODE_INIT_PARENT(l, T_LEAF);
338 INIT_HLIST_HEAD(&l->list);
339 }
340 return l;
341} 296}
342 297
343static struct leaf_info *leaf_info_new(int plen) 298static inline void alias_free_mem_rcu(struct fib_alias *fa)
344{ 299{
345 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); 300 call_rcu(&fa->rcu, __alias_free_mem);
346 if (li) { 301}
347 li->plen = plen; 302
348 INIT_LIST_HEAD(&li->falh); 303static void __leaf_free_rcu(struct rcu_head *head)
349 } 304{
350 return li; 305 kfree(container_of(head, struct leaf, rcu));
306}
307
308static inline void free_leaf(struct leaf *leaf)
309{
310 call_rcu(&leaf->rcu, __leaf_free_rcu);
351} 311}
352 312
353static inline void free_leaf(struct leaf *l) 313static void __leaf_info_free_rcu(struct rcu_head *head)
354{ 314{
355 kfree(l); 315 kfree(container_of(head, struct leaf_info, rcu));
356} 316}
357 317
358static inline void free_leaf_info(struct leaf_info *li) 318static inline void free_leaf_info(struct leaf_info *leaf)
359{ 319{
360 kfree(li); 320 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
361} 321}
362 322
363static struct tnode *tnode_alloc(unsigned int size) 323static struct tnode *tnode_alloc(unsigned int size)
364{ 324{
365 if (size <= PAGE_SIZE) { 325 struct page *pages;
366 return kmalloc(size, GFP_KERNEL); 326
367 } else { 327 if (size <= PAGE_SIZE)
368 return (struct tnode *) 328 return kcalloc(size, 1, GFP_KERNEL);
369 __get_free_pages(GFP_KERNEL, get_order(size)); 329
370 } 330 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
331 if (!pages)
332 return NULL;
333
334 return page_address(pages);
371} 335}
372 336
373static void __tnode_free(struct tnode *tn) 337static void __tnode_free_rcu(struct rcu_head *head)
374{ 338{
339 struct tnode *tn = container_of(head, struct tnode, rcu);
375 unsigned int size = sizeof(struct tnode) + 340 unsigned int size = sizeof(struct tnode) +
376 (1<<tn->bits) * sizeof(struct node *); 341 (1 << tn->bits) * sizeof(struct node *);
377 342
378 if (size <= PAGE_SIZE) 343 if (size <= PAGE_SIZE)
379 kfree(tn); 344 kfree(tn);
@@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn)
381 free_pages((unsigned long)tn, get_order(size)); 346 free_pages((unsigned long)tn, get_order(size));
382} 347}
383 348
349static inline void tnode_free(struct tnode *tn)
350{
351 call_rcu(&tn->rcu, __tnode_free_rcu);
352}
353
354static struct leaf *leaf_new(void)
355{
356 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
357 if (l) {
358 l->parent = T_LEAF;
359 INIT_HLIST_HEAD(&l->list);
360 }
361 return l;
362}
363
364static struct leaf_info *leaf_info_new(int plen)
365{
366 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
367 if (li) {
368 li->plen = plen;
369 INIT_LIST_HEAD(&li->falh);
370 }
371 return li;
372}
373
384static struct tnode* tnode_new(t_key key, int pos, int bits) 374static struct tnode* tnode_new(t_key key, int pos, int bits)
385{ 375{
386 int nchildren = 1<<bits; 376 int nchildren = 1<<bits;
387 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 377 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
388 struct tnode *tn = tnode_alloc(sz); 378 struct tnode *tn = tnode_alloc(sz);
389 379
390 if (tn) { 380 if (tn) {
391 memset(tn, 0, sz); 381 memset(tn, 0, sz);
392 NODE_INIT_PARENT(tn, T_TNODE); 382 tn->parent = T_TNODE;
393 tn->pos = pos; 383 tn->pos = pos;
394 tn->bits = bits; 384 tn->bits = bits;
395 tn->key = key; 385 tn->key = key;
@@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
397 tn->empty_children = 1<<bits; 387 tn->empty_children = 1<<bits;
398 } 388 }
399 389
400 if (trie_debug > 0) 390 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
401 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), 391 (unsigned int) (sizeof(struct node) * 1<<bits));
402 (unsigned int) (sizeof(struct node) * 1<<bits));
403 return tn; 392 return tn;
404} 393}
405 394
406static void tnode_free(struct tnode *tn)
407{
408 if (!tn) {
409 trie_bug("tnode_free\n");
410 }
411 if (IS_LEAF(tn)) {
412 free_leaf((struct leaf *)tn);
413 if (trie_debug > 0 )
414 printk("FL %p \n", tn);
415 }
416 else if (IS_TNODE(tn)) {
417 __tnode_free(tn);
418 if (trie_debug > 0 )
419 printk("FT %p \n", tn);
420 }
421 else {
422 trie_bug("tnode_free\n");
423 }
424}
425
426/* 395/*
427 * Check whether a tnode 'n' is "full", i.e. it is an internal node 396 * Check whether a tnode 'n' is "full", i.e. it is an internal node
428 * and no bits are skipped. See discussion in dyntree paper p. 6 397 * and no bits are skipped. See discussion in dyntree paper p. 6
429 */ 398 */
430 399
431static inline int tnode_full(struct tnode *tn, struct node *n) 400static inline int tnode_full(const struct tnode *tn, const struct node *n)
432{ 401{
433 if (n == NULL || IS_LEAF(n)) 402 if (n == NULL || IS_LEAF(n))
434 return 0; 403 return 0;
@@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
448 417
449static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 418static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
450{ 419{
451 struct node *chi; 420 struct node *chi = tn->child[i];
452 int isfull; 421 int isfull;
453 422
454 if (i >= 1<<tn->bits) { 423 BUG_ON(i >= 1<<tn->bits);
455 printk("bits=%d, i=%d\n", tn->bits, i); 424
456 trie_bug("tnode_put_child_reorg bits");
457 }
458 write_lock_bh(&fib_lock);
459 chi = tn->child[i];
460 425
461 /* update emptyChildren */ 426 /* update emptyChildren */
462 if (n == NULL && chi != NULL) 427 if (n == NULL && chi != NULL)
@@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
465 tn->empty_children--; 430 tn->empty_children--;
466 431
467 /* update fullChildren */ 432 /* update fullChildren */
468 if (wasfull == -1) 433 if (wasfull == -1)
469 wasfull = tnode_full(tn, chi); 434 wasfull = tnode_full(tn, chi);
470 435
471 isfull = tnode_full(tn, n); 436 isfull = tnode_full(tn, n);
472 if (wasfull && !isfull) 437 if (wasfull && !isfull)
473 tn->full_children--; 438 tn->full_children--;
474
475 else if (!wasfull && isfull) 439 else if (!wasfull && isfull)
476 tn->full_children++; 440 tn->full_children++;
441
477 if (n) 442 if (n)
478 NODE_SET_PARENT(n, tn); 443 NODE_SET_PARENT(n, tn);
479 444
480 tn->child[i] = n; 445 rcu_assign_pointer(tn->child[i], n);
481 write_unlock_bh(&fib_lock);
482} 446}
483 447
484static struct node *resize(struct trie *t, struct tnode *tn) 448static struct node *resize(struct trie *t, struct tnode *tn)
485{ 449{
486 int i; 450 int i;
487 int err = 0; 451 int err = 0;
452 struct tnode *old_tn;
488 453
489 if (!tn) 454 if (!tn)
490 return NULL; 455 return NULL;
491 456
492 if (trie_debug) 457 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
493 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 458 tn, inflate_threshold, halve_threshold);
494 tn, inflate_threshold, halve_threshold);
495 459
496 /* No children */ 460 /* No children */
497 if (tn->empty_children == tnode_child_length(tn)) { 461 if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
501 /* One child */ 465 /* One child */
502 if (tn->empty_children == tnode_child_length(tn) - 1) 466 if (tn->empty_children == tnode_child_length(tn) - 1)
503 for (i = 0; i < tnode_child_length(tn); i++) { 467 for (i = 0; i < tnode_child_length(tn); i++) {
468 struct node *n;
504 469
505 write_lock_bh(&fib_lock); 470 n = tn->child[i];
506 if (tn->child[i] != NULL) { 471 if (!n)
507 472 continue;
508 /* compress one level */
509 struct node *n = tn->child[i];
510 if (n)
511 NODE_INIT_PARENT(n, NODE_TYPE(n));
512 473
513 write_unlock_bh(&fib_lock); 474 /* compress one level */
514 tnode_free(tn); 475 NODE_SET_PARENT(n, NULL);
515 return n; 476 tnode_free(tn);
516 } 477 return n;
517 write_unlock_bh(&fib_lock);
518 } 478 }
519 /* 479 /*
520 * Double as long as the resulting node has a number of 480 * Double as long as the resulting node has a number of
@@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
566 * 526 *
567 * expand not_to_be_doubled and to_be_doubled, and shorten: 527 * expand not_to_be_doubled and to_be_doubled, and shorten:
568 * 100 * (tnode_child_length(tn) - tn->empty_children + 528 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 * tn->full_children ) >= inflate_threshold * new_child_length 529 * tn->full_children) >= inflate_threshold * new_child_length
570 * 530 *
571 * expand new_child_length: 531 * expand new_child_length:
572 * 100 * (tnode_child_length(tn) - tn->empty_children + 532 * 100 * (tnode_child_length(tn) - tn->empty_children +
573 * tn->full_children ) >= 533 * tn->full_children) >=
574 * inflate_threshold * tnode_child_length(tn) * 2 534 * inflate_threshold * tnode_child_length(tn) * 2
575 * 535 *
576 * shorten again: 536 * shorten again:
577 * 50 * (tn->full_children + tnode_child_length(tn) - 537 * 50 * (tn->full_children + tnode_child_length(tn) -
578 * tn->empty_children ) >= inflate_threshold * 538 * tn->empty_children) >= inflate_threshold *
579 * tnode_child_length(tn) 539 * tnode_child_length(tn)
580 * 540 *
581 */ 541 */
@@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
587 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 547 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
588 inflate_threshold * tnode_child_length(tn))) { 548 inflate_threshold * tnode_child_length(tn))) {
589 549
590 tn = inflate(t, tn, &err); 550 old_tn = tn;
591 551 tn = inflate(t, tn);
592 if (err) { 552 if (IS_ERR(tn)) {
553 tn = old_tn;
593#ifdef CONFIG_IP_FIB_TRIE_STATS 554#ifdef CONFIG_IP_FIB_TRIE_STATS
594 t->stats.resize_node_skipped++; 555 t->stats.resize_node_skipped++;
595#endif 556#endif
@@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
609 100 * (tnode_child_length(tn) - tn->empty_children) < 570 100 * (tnode_child_length(tn) - tn->empty_children) <
610 halve_threshold * tnode_child_length(tn)) { 571 halve_threshold * tnode_child_length(tn)) {
611 572
612 tn = halve(t, tn, &err); 573 old_tn = tn;
613 574 tn = halve(t, tn);
614 if (err) { 575 if (IS_ERR(tn)) {
576 tn = old_tn;
615#ifdef CONFIG_IP_FIB_TRIE_STATS 577#ifdef CONFIG_IP_FIB_TRIE_STATS
616 t->stats.resize_node_skipped++; 578 t->stats.resize_node_skipped++;
617#endif 579#endif
@@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
621 583
622 584
623 /* Only one child remains */ 585 /* Only one child remains */
624
625 if (tn->empty_children == tnode_child_length(tn) - 1) 586 if (tn->empty_children == tnode_child_length(tn) - 1)
626 for (i = 0; i < tnode_child_length(tn); i++) { 587 for (i = 0; i < tnode_child_length(tn); i++) {
627 588 struct node *n;
628 write_lock_bh(&fib_lock); 589
629 if (tn->child[i] != NULL) { 590 n = tn->child[i];
630 /* compress one level */ 591 if (!n)
631 struct node *n = tn->child[i]; 592 continue;
632 593
633 if (n) 594 /* compress one level */
634 NODE_INIT_PARENT(n, NODE_TYPE(n)); 595
635 596 NODE_SET_PARENT(n, NULL);
636 write_unlock_bh(&fib_lock); 597 tnode_free(tn);
637 tnode_free(tn); 598 return n;
638 return n;
639 }
640 write_unlock_bh(&fib_lock);
641 } 599 }
642 600
643 return (struct node *) tn; 601 return (struct node *) tn;
644} 602}
645 603
646static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) 604static struct tnode *inflate(struct trie *t, struct tnode *tn)
647{ 605{
648 struct tnode *inode; 606 struct tnode *inode;
649 struct tnode *oldtnode = tn; 607 struct tnode *oldtnode = tn;
650 int olen = tnode_child_length(tn); 608 int olen = tnode_child_length(tn);
651 int i; 609 int i;
652 610
653 if (trie_debug) 611 pr_debug("In inflate\n");
654 printk("In inflate\n");
655 612
656 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 613 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657 614
658 if (!tn) { 615 if (!tn)
659 *err = -ENOMEM; 616 return ERR_PTR(-ENOMEM);
660 return oldtnode;
661 }
662 617
663 /* 618 /*
664 * Preallocate and store tnodes before the actual work so we 619 * Preallocate and store tnodes before the actual work so we
@@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
666 * fails. In case of failure we return the oldnode and inflate 621 * fails. In case of failure we return the oldnode and inflate
667 * of tnode is ignored. 622 * of tnode is ignored.
668 */ 623 */
669 624
670 for(i = 0; i < olen; i++) { 625 for (i = 0; i < olen; i++) {
671 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); 626 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
672 627
673 if (inode && 628 if (inode &&
@@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
675 inode->pos == oldtnode->pos + oldtnode->bits && 630 inode->pos == oldtnode->pos + oldtnode->bits &&
676 inode->bits > 1) { 631 inode->bits > 1) {
677 struct tnode *left, *right; 632 struct tnode *left, *right;
678
679 t_key m = TKEY_GET_MASK(inode->pos, 1); 633 t_key m = TKEY_GET_MASK(inode->pos, 1);
680 634
681 left = tnode_new(inode->key&(~m), inode->pos + 1, 635 left = tnode_new(inode->key&(~m), inode->pos + 1,
682 inode->bits - 1); 636 inode->bits - 1);
637 if (!left)
638 goto nomem;
683 639
684 if (!left) {
685 *err = -ENOMEM;
686 break;
687 }
688
689 right = tnode_new(inode->key|m, inode->pos + 1, 640 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1); 641 inode->bits - 1);
691 642
692 if (!right) { 643 if (!right) {
693 *err = -ENOMEM; 644 tnode_free(left);
694 break; 645 goto nomem;
695 } 646 }
696 647
697 put_child(t, tn, 2*i, (struct node *) left); 648 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right); 649 put_child(t, tn, 2*i+1, (struct node *) right);
699 } 650 }
700 } 651 }
701 652
702 if (*err) { 653 for (i = 0; i < olen; i++) {
703 int size = tnode_child_length(tn);
704 int j;
705
706 for(j = 0; j < size; j++)
707 if (tn->child[j])
708 tnode_free((struct tnode *)tn->child[j]);
709
710 tnode_free(tn);
711
712 *err = -ENOMEM;
713 return oldtnode;
714 }
715
716 for(i = 0; i < olen; i++) {
717 struct node *node = tnode_get_child(oldtnode, i); 654 struct node *node = tnode_get_child(oldtnode, i);
655 struct tnode *left, *right;
656 int size, j;
718 657
719 /* An empty child */ 658 /* An empty child */
720 if (node == NULL) 659 if (node == NULL)
@@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
740 put_child(t, tn, 2*i+1, inode->child[1]); 679 put_child(t, tn, 2*i+1, inode->child[1]);
741 680
742 tnode_free(inode); 681 tnode_free(inode);
682 continue;
743 } 683 }
744 684
745 /* An internal node with more than two children */ 685 /* An internal node with more than two children */
746 else { 686
747 struct tnode *left, *right; 687 /* We will replace this node 'inode' with two new
748 int size, j; 688 * ones, 'left' and 'right', each with half of the
749 689 * original children. The two new nodes will have
750 /* We will replace this node 'inode' with two new 690 * a position one bit further down the key and this
751 * ones, 'left' and 'right', each with half of the 691 * means that the "significant" part of their keys
752 * original children. The two new nodes will have 692 * (see the discussion near the top of this file)
753 * a position one bit further down the key and this 693 * will differ by one bit, which will be "0" in
754 * means that the "significant" part of their keys 694 * left's key and "1" in right's key. Since we are
755 * (see the discussion near the top of this file) 695 * moving the key position by one step, the bit that
756 * will differ by one bit, which will be "0" in 696 * we are moving away from - the bit at position
757 * left's key and "1" in right's key. Since we are 697 * (inode->pos) - is the one that will differ between
758 * moving the key position by one step, the bit that 698 * left and right. So... we synthesize that bit in the
759 * we are moving away from - the bit at position 699 * two new keys.
760 * (inode->pos) - is the one that will differ between 700 * The mask 'm' below will be a single "one" bit at
761 * left and right. So... we synthesize that bit in the 701 * the position (inode->pos)
762 * two new keys. 702 */
763 * The mask 'm' below will be a single "one" bit at
764 * the position (inode->pos)
765 */
766
767 /* Use the old key, but set the new significant
768 * bit to zero.
769 */
770 703
771 left = (struct tnode *) tnode_get_child(tn, 2*i); 704 /* Use the old key, but set the new significant
772 put_child(t, tn, 2*i, NULL); 705 * bit to zero.
706 */
773 707
774 if (!left) 708 left = (struct tnode *) tnode_get_child(tn, 2*i);
775 BUG(); 709 put_child(t, tn, 2*i, NULL);
776 710
777 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 711 BUG_ON(!left);
778 put_child(t, tn, 2*i+1, NULL);
779 712
780 if (!right) 713 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
781 BUG(); 714 put_child(t, tn, 2*i+1, NULL);
782 715
783 size = tnode_child_length(left); 716 BUG_ON(!right);
784 for(j = 0; j < size; j++) {
785 put_child(t, left, j, inode->child[j]);
786 put_child(t, right, j, inode->child[j + size]);
787 }
788 put_child(t, tn, 2*i, resize(t, left));
789 put_child(t, tn, 2*i+1, resize(t, right));
790 717
791 tnode_free(inode); 718 size = tnode_child_length(left);
719 for (j = 0; j < size; j++) {
720 put_child(t, left, j, inode->child[j]);
721 put_child(t, right, j, inode->child[j + size]);
792 } 722 }
723 put_child(t, tn, 2*i, resize(t, left));
724 put_child(t, tn, 2*i+1, resize(t, right));
725
726 tnode_free(inode);
793 } 727 }
794 tnode_free(oldtnode); 728 tnode_free(oldtnode);
795 return tn; 729 return tn;
730nomem:
731 {
732 int size = tnode_child_length(tn);
733 int j;
734
735 for (j = 0; j < size; j++)
736 if (tn->child[j])
737 tnode_free((struct tnode *)tn->child[j]);
738
739 tnode_free(tn);
740
741 return ERR_PTR(-ENOMEM);
742 }
796} 743}
797 744
798static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) 745static struct tnode *halve(struct trie *t, struct tnode *tn)
799{ 746{
800 struct tnode *oldtnode = tn; 747 struct tnode *oldtnode = tn;
801 struct node *left, *right; 748 struct node *left, *right;
802 int i; 749 int i;
803 int olen = tnode_child_length(tn); 750 int olen = tnode_child_length(tn);
804 751
805 if (trie_debug) printk("In halve\n"); 752 pr_debug("In halve\n");
806 753
807 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 754 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808 755
809 if (!tn) { 756 if (!tn)
810 *err = -ENOMEM; 757 return ERR_PTR(-ENOMEM);
811 return oldtnode;
812 }
813 758
814 /* 759 /*
815 * Preallocate and store tnodes before the actual work so we 760 * Preallocate and store tnodes before the actual work so we
@@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
818 * of tnode is ignored. 763 * of tnode is ignored.
819 */ 764 */
820 765
821 for(i = 0; i < olen; i += 2) { 766 for (i = 0; i < olen; i += 2) {
822 left = tnode_get_child(oldtnode, i); 767 left = tnode_get_child(oldtnode, i);
823 right = tnode_get_child(oldtnode, i+1); 768 right = tnode_get_child(oldtnode, i+1);
824 769
825 /* Two nonempty children */ 770 /* Two nonempty children */
826 if (left && right) { 771 if (left && right) {
827 struct tnode *newBinNode = 772 struct tnode *newn;
828 tnode_new(left->key, tn->pos + tn->bits, 1);
829 773
830 if (!newBinNode) { 774 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
831 *err = -ENOMEM;
832 break;
833 }
834 put_child(t, tn, i/2, (struct node *)newBinNode);
835 }
836 }
837 775
838 if (*err) { 776 if (!newn)
839 int size = tnode_child_length(tn); 777 goto nomem;
840 int j;
841 778
842 for(j = 0; j < size; j++) 779 put_child(t, tn, i/2, (struct node *)newn);
843 if (tn->child[j]) 780 }
844 tnode_free((struct tnode *)tn->child[j]);
845 781
846 tnode_free(tn);
847
848 *err = -ENOMEM;
849 return oldtnode;
850 } 782 }
851 783
852 for(i = 0; i < olen; i += 2) { 784 for (i = 0; i < olen; i += 2) {
785 struct tnode *newBinNode;
786
853 left = tnode_get_child(oldtnode, i); 787 left = tnode_get_child(oldtnode, i);
854 right = tnode_get_child(oldtnode, i+1); 788 right = tnode_get_child(oldtnode, i+1);
855 789
@@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
858 if (right == NULL) /* Both are empty */ 792 if (right == NULL) /* Both are empty */
859 continue; 793 continue;
860 put_child(t, tn, i/2, right); 794 put_child(t, tn, i/2, right);
861 } else if (right == NULL) 795 continue;
796 }
797
798 if (right == NULL) {
862 put_child(t, tn, i/2, left); 799 put_child(t, tn, i/2, left);
800 continue;
801 }
863 802
864 /* Two nonempty children */ 803 /* Two nonempty children */
865 else { 804 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
866 struct tnode *newBinNode = 805 put_child(t, tn, i/2, NULL);
867 (struct tnode *) tnode_get_child(tn, i/2); 806 put_child(t, newBinNode, 0, left);
868 put_child(t, tn, i/2, NULL); 807 put_child(t, newBinNode, 1, right);
869 808 put_child(t, tn, i/2, resize(t, newBinNode));
870 if (!newBinNode)
871 BUG();
872
873 put_child(t, newBinNode, 0, left);
874 put_child(t, newBinNode, 1, right);
875 put_child(t, tn, i/2, resize(t, newBinNode));
876 }
877 } 809 }
878 tnode_free(oldtnode); 810 tnode_free(oldtnode);
879 return tn; 811 return tn;
812nomem:
813 {
814 int size = tnode_child_length(tn);
815 int j;
816
817 for (j = 0; j < size; j++)
818 if (tn->child[j])
819 tnode_free((struct tnode *)tn->child[j]);
820
821 tnode_free(tn);
822
823 return ERR_PTR(-ENOMEM);
824 }
880} 825}
881 826
882static void *trie_init(struct trie *t) 827static void trie_init(struct trie *t)
883{ 828{
884 if (t) { 829 if (!t)
885 t->size = 0; 830 return;
886 t->trie = NULL; 831
887 t->revision = 0; 832 t->size = 0;
833 rcu_assign_pointer(t->trie, NULL);
834 t->revision = 0;
888#ifdef CONFIG_IP_FIB_TRIE_STATS 835#ifdef CONFIG_IP_FIB_TRIE_STATS
889 memset(&t->stats, 0, sizeof(struct trie_use_stats)); 836 memset(&t->stats, 0, sizeof(struct trie_use_stats));
890#endif 837#endif
891 }
892 return t;
893} 838}
894 839
840/* readside most use rcu_read_lock currently dump routines
841 via get_fa_head and dump */
842
895static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) 843static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
896{ 844{
897 struct hlist_node *node; 845 struct hlist_node *node;
898 struct leaf_info *li; 846 struct leaf_info *li;
899 847
900 hlist_for_each_entry(li, node, head, hlist) { 848 hlist_for_each_entry_rcu(li, node, head, hlist)
901 if (li->plen == plen) 849 if (li->plen == plen)
902 return li; 850 return li;
903 } 851
904 return NULL; 852 return NULL;
905} 853}
906 854
907static inline struct list_head * get_fa_head(struct leaf *l, int plen) 855static inline struct list_head * get_fa_head(struct leaf *l, int plen)
908{ 856{
909 struct list_head *fa_head = NULL;
910 struct leaf_info *li = find_leaf_info(&l->list, plen); 857 struct leaf_info *li = find_leaf_info(&l->list, plen);
911 858
912 if (li) 859 if (!li)
913 fa_head = &li->falh; 860 return NULL;
914 861
915 return fa_head; 862 return &li->falh;
916} 863}
917 864
918static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 865static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
919{ 866{
920 struct leaf_info *li = NULL, *last = NULL; 867 struct leaf_info *li = NULL, *last = NULL;
921 struct hlist_node *node, *tmp; 868 struct hlist_node *node;
922 869
923 write_lock_bh(&fib_lock); 870 if (hlist_empty(head)) {
924 871 hlist_add_head_rcu(&new->hlist, head);
925 if (hlist_empty(head)) 872 } else {
926 hlist_add_head(&new->hlist, head); 873 hlist_for_each_entry(li, node, head, hlist) {
927 else { 874 if (new->plen > li->plen)
928 hlist_for_each_entry_safe(li, node, tmp, head, hlist) { 875 break;
929 876
930 if (new->plen > li->plen) 877 last = li;
931 break; 878 }
932 879 if (last)
933 last = li; 880 hlist_add_after_rcu(&last->hlist, &new->hlist);
934 } 881 else
935 if (last) 882 hlist_add_before_rcu(&new->hlist, &li->hlist);
936 hlist_add_after(&last->hlist, &new->hlist); 883 }
937 else
938 hlist_add_before(&new->hlist, &li->hlist);
939 }
940 write_unlock_bh(&fib_lock);
941} 884}
942 885
886/* rcu_read_lock needs to be hold by caller from readside */
887
943static struct leaf * 888static struct leaf *
944fib_find_node(struct trie *t, u32 key) 889fib_find_node(struct trie *t, u32 key)
945{ 890{
@@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key)
948 struct node *n; 893 struct node *n;
949 894
950 pos = 0; 895 pos = 0;
951 n = t->trie; 896 n = rcu_dereference(t->trie);
952 897
953 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 898 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
954 tn = (struct tnode *) n; 899 tn = (struct tnode *) n;
955 900
956 check_tnode(tn); 901 check_tnode(tn);
957 902
958 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 903 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
959 pos=tn->pos + tn->bits; 904 pos = tn->pos + tn->bits;
960 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 905 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
961 } 906 } else
962 else
963 break; 907 break;
964 } 908 }
965 /* Case we have found a leaf. Compare prefixes */ 909 /* Case we have found a leaf. Compare prefixes */
966 910
967 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 911 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
968 struct leaf *l = (struct leaf *) n; 912 return (struct leaf *)n;
969 return l; 913
970 }
971 return NULL; 914 return NULL;
972} 915}
973 916
974static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 917static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
975{ 918{
976 int i = 0;
977 int wasfull; 919 int wasfull;
978 t_key cindex, key; 920 t_key cindex, key;
979 struct tnode *tp = NULL; 921 struct tnode *tp = NULL;
980 922
981 if (!tn)
982 BUG();
983
984 key = tn->key; 923 key = tn->key;
985 i = 0;
986 924
987 while (tn != NULL && NODE_PARENT(tn) != NULL) { 925 while (tn != NULL && NODE_PARENT(tn) != NULL) {
988 926
989 if (i > 10) {
990 printk("Rebalance tn=%p \n", tn);
991 if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
992
993 printk("Rebalance tp=%p \n", tp);
994 if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
995 }
996
997 if (i > 12) BUG();
998 i++;
999
1000 tp = NODE_PARENT(tn); 927 tp = NODE_PARENT(tn);
1001 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 928 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1002 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 929 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1003 tn = (struct tnode *) resize (t, (struct tnode *)tn); 930 tn = (struct tnode *) resize (t, (struct tnode *)tn);
1004 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 931 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1005 932
1006 if (!NODE_PARENT(tn)) 933 if (!NODE_PARENT(tn))
1007 break; 934 break;
1008 935
@@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
1015 return (struct node*) tn; 942 return (struct node*) tn;
1016} 943}
1017 944
945/* only used from updater-side */
946
1018static struct list_head * 947static struct list_head *
1019fib_insert_node(struct trie *t, int *err, u32 key, int plen) 948fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1020{ 949{
@@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1050 979
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 980 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1052 tn = (struct tnode *) n; 981 tn = (struct tnode *) n;
1053 982
1054 check_tnode(tn); 983 check_tnode(tn);
1055 984
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 985 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057 tp = tn; 986 tp = tn;
1058 pos=tn->pos + tn->bits; 987 pos = tn->pos + tn->bits;
1059 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 988 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1060 989
1061 if (n && NODE_PARENT(n) != tn) { 990 BUG_ON(n && NODE_PARENT(n) != tn);
1062 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 991 } else
1063 BUG();
1064 }
1065 }
1066 else
1067 break; 992 break;
1068 } 993 }
1069 994
@@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1073 * tp is n's (parent) ----> NULL or TNODE 998 * tp is n's (parent) ----> NULL or TNODE
1074 */ 999 */
1075 1000
1076 if (tp && IS_LEAF(tp)) 1001 BUG_ON(tp && IS_LEAF(tp));
1077 BUG();
1078
1079 1002
1080 /* Case 1: n is a leaf. Compare prefixes */ 1003 /* Case 1: n is a leaf. Compare prefixes */
1081 1004
1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1005 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083 struct leaf *l = ( struct leaf *) n; 1006 struct leaf *l = (struct leaf *) n;
1084 1007
1085 li = leaf_info_new(plen); 1008 li = leaf_info_new(plen);
1086 1009
1087 if (!li) { 1010 if (!li) {
1088 *err = -ENOMEM; 1011 *err = -ENOMEM;
1089 goto err; 1012 goto err;
@@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1113 fa_head = &li->falh; 1036 fa_head = &li->falh;
1114 insert_leaf_info(&l->list, li); 1037 insert_leaf_info(&l->list, li);
1115 1038
1116 /* Case 2: n is NULL, and will just insert a new leaf */
1117 if (t->trie && n == NULL) { 1039 if (t->trie && n == NULL) {
1040 /* Case 2: n is NULL, and will just insert a new leaf */
1118 1041
1119 NODE_SET_PARENT(l, tp); 1042 NODE_SET_PARENT(l, tp);
1120
1121 if (!tp)
1122 BUG();
1123 1043
1124 else { 1044 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1045 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1046 } else {
1127 } 1047 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1128 }
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 else {
1131 /* 1048 /*
1132 * Add a new tnode here 1049 * Add a new tnode here
1133 * first tnode need some special handling 1050 * first tnode need some special handling
1134 */ 1051 */
1135 1052
1136 if (tp) 1053 if (tp)
1137 pos=tp->pos+tp->bits; 1054 pos = tp->pos+tp->bits;
1138 else 1055 else
1139 pos=0; 1056 pos = 0;
1057
1140 if (n) { 1058 if (n) {
1141 newpos = tkey_mismatch(key, pos, n->key); 1059 newpos = tkey_mismatch(key, pos, n->key);
1142 tn = tnode_new(n->key, newpos, 1); 1060 tn = tnode_new(n->key, newpos, 1);
1143 } 1061 } else {
1144 else {
1145 newpos = 0; 1062 newpos = 0;
1146 tn = tnode_new(key, newpos, 1); /* First tnode */ 1063 tn = tnode_new(key, newpos, 1); /* First tnode */
1147 } 1064 }
@@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1151 tnode_free((struct tnode *) l); 1068 tnode_free((struct tnode *) l);
1152 *err = -ENOMEM; 1069 *err = -ENOMEM;
1153 goto err; 1070 goto err;
1154 } 1071 }
1155 1072
1156 NODE_SET_PARENT(tn, tp); 1073 NODE_SET_PARENT(tn, tp);
1157 1074
1158 missbit=tkey_extract_bits(key, newpos, 1); 1075 missbit = tkey_extract_bits(key, newpos, 1);
1159 put_child(t, tn, missbit, (struct node *)l); 1076 put_child(t, tn, missbit, (struct node *)l);
1160 put_child(t, tn, 1-missbit, n); 1077 put_child(t, tn, 1-missbit, n);
1161 1078
1162 if (tp) { 1079 if (tp) {
1163 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1080 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1164 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1081 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1165 } 1082 } else {
1166 else { 1083 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1167 t->trie = (struct node*) tn; /* First tnode */
1168 tp = tn; 1084 tp = tn;
1169 } 1085 }
1170 } 1086 }
1171 if (tp && tp->pos+tp->bits > 32) { 1087
1088 if (tp && tp->pos + tp->bits > 32)
1172 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1089 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1173 tp, tp->pos, tp->bits, key, plen); 1090 tp, tp->pos, tp->bits, key, plen);
1174 } 1091
1175 /* Rebalance the trie */ 1092 /* Rebalance the trie */
1176 t->trie = trie_rebalance(t, tp); 1093
1094 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1177done: 1095done:
1178 t->revision++; 1096 t->revision++;
1179err:; 1097err:
1180 return fa_head; 1098 return fa_head;
1181} 1099}
1182 1100
@@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1204 1122
1205 key = ntohl(key); 1123 key = ntohl(key);
1206 1124
1207 if (trie_debug) 1125 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1208 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1209 1126
1210 mask = ntohl( inet_make_mask(plen) ); 1127 mask = ntohl(inet_make_mask(plen));
1211 1128
1212 if (key & ~mask) 1129 if (key & ~mask)
1213 return -EINVAL; 1130 return -EINVAL;
1214 1131
1215 key = key & mask; 1132 key = key & mask;
1216 1133
1217 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) 1134 fi = fib_create_info(r, rta, nlhdr, &err);
1135
1136 if (!fi)
1218 goto err; 1137 goto err;
1219 1138
1220 l = fib_find_node(t, key); 1139 l = fib_find_node(t, key);
@@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1236 * and we need to allocate a new one of those as well. 1155 * and we need to allocate a new one of those as well.
1237 */ 1156 */
1238 1157
1239 if (fa && 1158 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1240 fa->fa_info->fib_priority == fi->fib_priority) {
1241 struct fib_alias *fa_orig; 1159 struct fib_alias *fa_orig;
1242 1160
1243 err = -EEXIST; 1161 err = -EEXIST;
@@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1248 struct fib_info *fi_drop; 1166 struct fib_info *fi_drop;
1249 u8 state; 1167 u8 state;
1250 1168
1251 write_lock_bh(&fib_lock); 1169 err = -ENOBUFS;
1170 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1171 if (new_fa == NULL)
1172 goto out;
1252 1173
1253 fi_drop = fa->fa_info; 1174 fi_drop = fa->fa_info;
1254 fa->fa_info = fi; 1175 new_fa->fa_tos = fa->fa_tos;
1255 fa->fa_type = type; 1176 new_fa->fa_info = fi;
1256 fa->fa_scope = r->rtm_scope; 1177 new_fa->fa_type = type;
1178 new_fa->fa_scope = r->rtm_scope;
1257 state = fa->fa_state; 1179 state = fa->fa_state;
1258 fa->fa_state &= ~FA_S_ACCESSED; 1180 new_fa->fa_state &= ~FA_S_ACCESSED;
1259 1181
1260 write_unlock_bh(&fib_lock); 1182 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1183 alias_free_mem_rcu(fa);
1261 1184
1262 fib_release_info(fi_drop); 1185 fib_release_info(fi_drop);
1263 if (state & FA_S_ACCESSED) 1186 if (state & FA_S_ACCESSED)
1264 rt_cache_flush(-1); 1187 rt_cache_flush(-1);
1265 1188
1266 goto succeeded; 1189 goto succeeded;
1267 } 1190 }
1268 /* Error if we find a perfect match which 1191 /* Error if we find a perfect match which
1269 * uses the same scope, type, and nexthop 1192 * uses the same scope, type, and nexthop
@@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1285 fa = fa_orig; 1208 fa = fa_orig;
1286 } 1209 }
1287 err = -ENOENT; 1210 err = -ENOENT;
1288 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) 1211 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1289 goto out; 1212 goto out;
1290 1213
1291 err = -ENOBUFS; 1214 err = -ENOBUFS;
@@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1298 new_fa->fa_type = type; 1221 new_fa->fa_type = type;
1299 new_fa->fa_scope = r->rtm_scope; 1222 new_fa->fa_scope = r->rtm_scope;
1300 new_fa->fa_state = 0; 1223 new_fa->fa_state = 0;
1301#if 0
1302 new_fa->dst = NULL;
1303#endif
1304 /* 1224 /*
1305 * Insert new entry to the list. 1225 * Insert new entry to the list.
1306 */ 1226 */
@@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1312 goto out_free_new_fa; 1232 goto out_free_new_fa;
1313 } 1233 }
1314 1234
1315 write_lock_bh(&fib_lock); 1235 list_add_tail_rcu(&new_fa->fa_list,
1316 1236 (fa ? &fa->fa_list : fa_head));
1317 list_add_tail(&new_fa->fa_list,
1318 (fa ? &fa->fa_list : fa_head));
1319
1320 write_unlock_bh(&fib_lock);
1321 1237
1322 rt_cache_flush(-1); 1238 rt_cache_flush(-1);
1323 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1239 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1328,38 +1244,40 @@ out_free_new_fa:
1328 kmem_cache_free(fn_alias_kmem, new_fa); 1244 kmem_cache_free(fn_alias_kmem, new_fa);
1329out: 1245out:
1330 fib_release_info(fi); 1246 fib_release_info(fi);
1331err:; 1247err:
1332 return err; 1248 return err;
1333} 1249}
1334 1250
1335static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, 1251
1336 struct fib_result *res, int *err) 1252/* should be called with rcu_read_lock */
1253static inline int check_leaf(struct trie *t, struct leaf *l,
1254 t_key key, int *plen, const struct flowi *flp,
1255 struct fib_result *res)
1337{ 1256{
1338 int i; 1257 int err, i;
1339 t_key mask; 1258 t_key mask;
1340 struct leaf_info *li; 1259 struct leaf_info *li;
1341 struct hlist_head *hhead = &l->list; 1260 struct hlist_head *hhead = &l->list;
1342 struct hlist_node *node; 1261 struct hlist_node *node;
1343 1262
1344 hlist_for_each_entry(li, node, hhead, hlist) { 1263 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1345
1346 i = li->plen; 1264 i = li->plen;
1347 mask = ntohl(inet_make_mask(i)); 1265 mask = ntohl(inet_make_mask(i));
1348 if (l->key != (key & mask)) 1266 if (l->key != (key & mask))
1349 continue; 1267 continue;
1350 1268
1351 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) { 1269 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1352 *plen = i; 1270 *plen = i;
1353#ifdef CONFIG_IP_FIB_TRIE_STATS 1271#ifdef CONFIG_IP_FIB_TRIE_STATS
1354 t->stats.semantic_match_passed++; 1272 t->stats.semantic_match_passed++;
1355#endif 1273#endif
1356 return 1; 1274 return err;
1357 } 1275 }
1358#ifdef CONFIG_IP_FIB_TRIE_STATS 1276#ifdef CONFIG_IP_FIB_TRIE_STATS
1359 t->stats.semantic_match_miss++; 1277 t->stats.semantic_match_miss++;
1360#endif 1278#endif
1361 } 1279 }
1362 return 0; 1280 return 1;
1363} 1281}
1364 1282
1365static int 1283static int
@@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1370 struct node *n; 1288 struct node *n;
1371 struct tnode *pn; 1289 struct tnode *pn;
1372 int pos, bits; 1290 int pos, bits;
1373 t_key key=ntohl(flp->fl4_dst); 1291 t_key key = ntohl(flp->fl4_dst);
1374 int chopped_off; 1292 int chopped_off;
1375 t_key cindex = 0; 1293 t_key cindex = 0;
1376 int current_prefix_length = KEYLENGTH; 1294 int current_prefix_length = KEYLENGTH;
1377 n = t->trie; 1295 struct tnode *cn;
1296 t_key node_prefix, key_prefix, pref_mismatch;
1297 int mp;
1298
1299 rcu_read_lock();
1378 1300
1379 read_lock(&fib_lock); 1301 n = rcu_dereference(t->trie);
1380 if (!n) 1302 if (!n)
1381 goto failed; 1303 goto failed;
1382 1304
@@ -1386,15 +1308,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1386 1308
1387 /* Just a leaf? */ 1309 /* Just a leaf? */
1388 if (IS_LEAF(n)) { 1310 if (IS_LEAF(n)) {
1389 if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) 1311 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1390 goto found; 1312 goto found;
1391 goto failed; 1313 goto failed;
1392 } 1314 }
1393 pn = (struct tnode *) n; 1315 pn = (struct tnode *) n;
1394 chopped_off = 0; 1316 chopped_off = 0;
1395 1317
1396 while (pn) { 1318 while (pn) {
1397
1398 pos = pn->pos; 1319 pos = pn->pos;
1399 bits = pn->bits; 1320 bits = pn->bits;
1400 1321
@@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
1410 goto backtrace; 1331 goto backtrace;
1411 } 1332 }
1412 1333
1413 if (IS_TNODE(n)) { 1334 if (IS_LEAF(n)) {
1335 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1336 goto found;
1337 else
1338 goto backtrace;
1339 }
1340
1414#define HL_OPTIMIZE 1341#define HL_OPTIMIZE
1415#ifdef HL_OPTIMIZE 1342#ifdef HL_OPTIMIZE
1416 struct tnode *cn = (struct tnode *)n; 1343 cn = (struct tnode *)n;
1417 t_key node_prefix, key_prefix, pref_mismatch;
1418 int mp;
1419 1344
1420 /* 1345 /*
1421 * It's a tnode, and we can do some extra checks here if we 1346 * It's a tnode, and we can do some extra checks here if we
1422 * like, to avoid descending into a dead-end branch. 1347 * like, to avoid descending into a dead-end branch.
1423 * This tnode is in the parent's child array at index 1348 * This tnode is in the parent's child array at index
1424 * key[p_pos..p_pos+p_bits] but potentially with some bits 1349 * key[p_pos..p_pos+p_bits] but potentially with some bits
1425 * chopped off, so in reality the index may be just a 1350 * chopped off, so in reality the index may be just a
1426 * subprefix, padded with zero at the end. 1351 * subprefix, padded with zero at the end.
1427 * We can also take a look at any skipped bits in this 1352 * We can also take a look at any skipped bits in this
1428 * tnode - everything up to p_pos is supposed to be ok, 1353 * tnode - everything up to p_pos is supposed to be ok,
1429 * and the non-chopped bits of the index (se previous 1354 * and the non-chopped bits of the index (se previous
1430 * paragraph) are also guaranteed ok, but the rest is 1355 * paragraph) are also guaranteed ok, but the rest is
1431 * considered unknown. 1356 * considered unknown.
1432 * 1357 *
1433 * The skipped bits are key[pos+bits..cn->pos]. 1358 * The skipped bits are key[pos+bits..cn->pos].
1434 */ 1359 */
1435
1436 /* If current_prefix_length < pos+bits, we are already doing
1437 * actual prefix matching, which means everything from
1438 * pos+(bits-chopped_off) onward must be zero along some
1439 * branch of this subtree - otherwise there is *no* valid
1440 * prefix present. Here we can only check the skipped
1441 * bits. Remember, since we have already indexed into the
1442 * parent's child array, we know that the bits we chopped of
1443 * *are* zero.
1444 */
1445 1360
1446 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1361 /* If current_prefix_length < pos+bits, we are already doing
1447 1362 * actual prefix matching, which means everything from
1448 if (current_prefix_length < pos+bits) { 1363 * pos+(bits-chopped_off) onward must be zero along some
1449 if (tkey_extract_bits(cn->key, current_prefix_length, 1364 * branch of this subtree - otherwise there is *no* valid
1450 cn->pos - current_prefix_length) != 0 || 1365 * prefix present. Here we can only check the skipped
1451 !(cn->child[0])) 1366 * bits. Remember, since we have already indexed into the
1452 goto backtrace; 1367 * parent's child array, we know that the bits we chopped of
1453 } 1368 * *are* zero.
1369 */
1454 1370
1455 /* 1371 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1456 * If chopped_off=0, the index is fully validated and we
1457 * only need to look at the skipped bits for this, the new,
1458 * tnode. What we actually want to do is to find out if
1459 * these skipped bits match our key perfectly, or if we will
1460 * have to count on finding a matching prefix further down,
1461 * because if we do, we would like to have some way of
1462 * verifying the existence of such a prefix at this point.
1463 */
1464 1372
1465 /* The only thing we can do at this point is to verify that 1373 if (current_prefix_length < pos+bits) {
1466 * any such matching prefix can indeed be a prefix to our 1374 if (tkey_extract_bits(cn->key, current_prefix_length,
1467 * key, and if the bits in the node we are inspecting that 1375 cn->pos - current_prefix_length) != 0 ||
1468 * do not match our key are not ZERO, this cannot be true. 1376 !(cn->child[0]))
1469 * Thus, find out where there is a mismatch (before cn->pos) 1377 goto backtrace;
1470 * and verify that all the mismatching bits are zero in the 1378 }
1471 * new tnode's key.
1472 */
1473 1379
1474 /* Note: We aren't very concerned about the piece of the key 1380 /*
1475 * that precede pn->pos+pn->bits, since these have already been 1381 * If chopped_off=0, the index is fully validated and we
1476 * checked. The bits after cn->pos aren't checked since these are 1382 * only need to look at the skipped bits for this, the new,
1477 * by definition "unknown" at this point. Thus, what we want to 1383 * tnode. What we actually want to do is to find out if
1478 * see is if we are about to enter the "prefix matching" state, 1384 * these skipped bits match our key perfectly, or if we will
1479 * and in that case verify that the skipped bits that will prevail 1385 * have to count on finding a matching prefix further down,
1480 * throughout this subtree are zero, as they have to be if we are 1386 * because if we do, we would like to have some way of
1481 * to find a matching prefix. 1387 * verifying the existence of such a prefix at this point.
1482 */ 1388 */
1483 1389
1484 node_prefix = MASK_PFX(cn->key, cn->pos); 1390 /* The only thing we can do at this point is to verify that
1485 key_prefix = MASK_PFX(key, cn->pos); 1391 * any such matching prefix can indeed be a prefix to our
1486 pref_mismatch = key_prefix^node_prefix; 1392 * key, and if the bits in the node we are inspecting that
1487 mp = 0; 1393 * do not match our key are not ZERO, this cannot be true.
1394 * Thus, find out where there is a mismatch (before cn->pos)
1395 * and verify that all the mismatching bits are zero in the
1396 * new tnode's key.
1397 */
1488 1398
1489 /* In short: If skipped bits in this node do not match the search 1399 /* Note: We aren't very concerned about the piece of the key
1490 * key, enter the "prefix matching" state.directly. 1400 * that precede pn->pos+pn->bits, since these have already been
1491 */ 1401 * checked. The bits after cn->pos aren't checked since these are
1492 if (pref_mismatch) { 1402 * by definition "unknown" at this point. Thus, what we want to
1493 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1403 * see is if we are about to enter the "prefix matching" state,
1494 mp++; 1404 * and in that case verify that the skipped bits that will prevail
1495 pref_mismatch = pref_mismatch <<1; 1405 * throughout this subtree are zero, as they have to be if we are
1496 } 1406 * to find a matching prefix.
1497 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1407 */
1498 1408
1499 if (key_prefix != 0) 1409 node_prefix = MASK_PFX(cn->key, cn->pos);
1500 goto backtrace; 1410 key_prefix = MASK_PFX(key, cn->pos);
1501 1411 pref_mismatch = key_prefix^node_prefix;
1502 if (current_prefix_length >= cn->pos) 1412 mp = 0;
1503 current_prefix_length=mp; 1413
1504 } 1414 /* In short: If skipped bits in this node do not match the search
1505#endif 1415 * key, enter the "prefix matching" state.directly.
1506 pn = (struct tnode *)n; /* Descend */ 1416 */
1507 chopped_off = 0; 1417 if (pref_mismatch) {
1508 continue; 1418 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1419 mp++;
1420 pref_mismatch = pref_mismatch <<1;
1421 }
1422 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1423
1424 if (key_prefix != 0)
1425 goto backtrace;
1426
1427 if (current_prefix_length >= cn->pos)
1428 current_prefix_length = mp;
1509 } 1429 }
1510 if (IS_LEAF(n)) { 1430#endif
1511 if (check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret)) 1431 pn = (struct tnode *)n; /* Descend */
1512 goto found; 1432 chopped_off = 0;
1513 } 1433 continue;
1434
1514backtrace: 1435backtrace:
1515 chopped_off++; 1436 chopped_off++;
1516 1437
1517 /* As zero don't change the child key (cindex) */ 1438 /* As zero don't change the child key (cindex) */
1518 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { 1439 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1519 chopped_off++; 1440 chopped_off++;
1520 }
1521 1441
1522 /* Decrease current_... with bits chopped off */ 1442 /* Decrease current_... with bits chopped off */
1523 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1443 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1524 current_prefix_length = pn->pos + pn->bits - chopped_off; 1444 current_prefix_length = pn->pos + pn->bits - chopped_off;
1525 1445
1526 /* 1446 /*
1527 * Either we do the actual chop off according or if we have 1447 * Either we do the actual chop off according or if we have
1528 * chopped off all bits in this tnode walk up to our parent. 1448 * chopped off all bits in this tnode walk up to our parent.
1529 */ 1449 */
1530 1450
1531 if (chopped_off <= pn->bits) 1451 if (chopped_off <= pn->bits) {
1532 cindex &= ~(1 << (chopped_off-1)); 1452 cindex &= ~(1 << (chopped_off-1));
1533 else { 1453 } else {
1534 if (NODE_PARENT(pn) == NULL) 1454 if (NODE_PARENT(pn) == NULL)
1535 goto failed; 1455 goto failed;
1536 1456
1537 /* Get Child's index */ 1457 /* Get Child's index */
1538 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1458 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1539 pn = NODE_PARENT(pn); 1459 pn = NODE_PARENT(pn);
@@ -1548,10 +1468,11 @@ backtrace:
1548failed: 1468failed:
1549 ret = 1; 1469 ret = 1;
1550found: 1470found:
1551 read_unlock(&fib_lock); 1471 rcu_read_unlock();
1552 return ret; 1472 return ret;
1553} 1473}
1554 1474
1475/* only called from updater side */
1555static int trie_leaf_remove(struct trie *t, t_key key) 1476static int trie_leaf_remove(struct trie *t, t_key key)
1556{ 1477{
1557 t_key cindex; 1478 t_key cindex;
@@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1559 struct node *n = t->trie; 1480 struct node *n = t->trie;
1560 struct leaf *l; 1481 struct leaf *l;
1561 1482
1562 if (trie_debug) 1483 pr_debug("entering trie_leaf_remove(%p)\n", n);
1563 printk("entering trie_leaf_remove(%p)\n", n);
1564 1484
1565 /* Note that in the case skipped bits, those bits are *not* checked! 1485 /* Note that in the case skipped bits, those bits are *not* checked!
1566 * When we finish this, we will have NULL or a T_LEAF, and the 1486 * When we finish this, we will have NULL or a T_LEAF, and the
1567 * T_LEAF may or may not match our key. 1487 * T_LEAF may or may not match our key.
1568 */ 1488 */
1569 1489
1570 while (n != NULL && IS_TNODE(n)) { 1490 while (n != NULL && IS_TNODE(n)) {
1571 struct tnode *tn = (struct tnode *) n; 1491 struct tnode *tn = (struct tnode *) n;
1572 check_tnode(tn); 1492 check_tnode(tn);
1573 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1493 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1574 1494
1575 if (n && NODE_PARENT(n) != tn) { 1495 BUG_ON(n && NODE_PARENT(n) != tn);
1576 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); 1496 }
1577 BUG();
1578 }
1579 }
1580 l = (struct leaf *) n; 1497 l = (struct leaf *) n;
1581 1498
1582 if (!n || !tkey_equals(l->key, key)) 1499 if (!n || !tkey_equals(l->key, key))
@@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key)
1590 t->revision++; 1507 t->revision++;
1591 t->size--; 1508 t->size--;
1592 1509
1510 preempt_disable();
1593 tp = NODE_PARENT(n); 1511 tp = NODE_PARENT(n);
1594 tnode_free((struct tnode *) n); 1512 tnode_free((struct tnode *) n);
1595 1513
1596 if (tp) { 1514 if (tp) {
1597 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1515 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1598 put_child(t, (struct tnode *)tp, cindex, NULL); 1516 put_child(t, (struct tnode *)tp, cindex, NULL);
1599 t->trie = trie_rebalance(t, tp); 1517 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1600 } 1518 } else
1601 else 1519 rcu_assign_pointer(t->trie, NULL);
1602 t->trie = NULL; 1520 preempt_enable();
1603 1521
1604 return 1; 1522 return 1;
1605} 1523}
1606 1524
1607static int 1525static int
1608fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1526fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1609 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1527 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1610{ 1528{
1611 struct trie *t = (struct trie *) tb->tb_data; 1529 struct trie *t = (struct trie *) tb->tb_data;
1612 u32 key, mask; 1530 u32 key, mask;
@@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1615 struct fib_alias *fa, *fa_to_delete; 1533 struct fib_alias *fa, *fa_to_delete;
1616 struct list_head *fa_head; 1534 struct list_head *fa_head;
1617 struct leaf *l; 1535 struct leaf *l;
1536 struct leaf_info *li;
1537
1618 1538
1619 if (plen > 32) 1539 if (plen > 32)
1620 return -EINVAL; 1540 return -EINVAL;
@@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1624 memcpy(&key, rta->rta_dst, 4); 1544 memcpy(&key, rta->rta_dst, 4);
1625 1545
1626 key = ntohl(key); 1546 key = ntohl(key);
1627 mask = ntohl( inet_make_mask(plen) ); 1547 mask = ntohl(inet_make_mask(plen));
1628 1548
1629 if (key & ~mask) 1549 if (key & ~mask)
1630 return -EINVAL; 1550 return -EINVAL;
@@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1641 if (!fa) 1561 if (!fa)
1642 return -ESRCH; 1562 return -ESRCH;
1643 1563
1644 if (trie_debug) 1564 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1645 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1646 1565
1647 fa_to_delete = NULL; 1566 fa_to_delete = NULL;
1648 fa_head = fa->fa_list.prev; 1567 fa_head = fa->fa_list.prev;
1568
1649 list_for_each_entry(fa, fa_head, fa_list) { 1569 list_for_each_entry(fa, fa_head, fa_list) {
1650 struct fib_info *fi = fa->fa_info; 1570 struct fib_info *fi = fa->fa_info;
1651 1571
@@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1664 } 1584 }
1665 } 1585 }
1666 1586
1667 if (fa_to_delete) { 1587 if (!fa_to_delete)
1668 int kill_li = 0; 1588 return -ESRCH;
1669 struct leaf_info *li;
1670
1671 fa = fa_to_delete;
1672 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1673 1589
1674 l = fib_find_node(t, key); 1590 fa = fa_to_delete;
1675 li = find_leaf_info(&l->list, plen); 1591 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1676 1592
1677 write_lock_bh(&fib_lock); 1593 l = fib_find_node(t, key);
1594 li = find_leaf_info(&l->list, plen);
1678 1595
1679 list_del(&fa->fa_list); 1596 list_del_rcu(&fa->fa_list);
1680 1597
1681 if (list_empty(fa_head)) { 1598 if (list_empty(fa_head)) {
1682 hlist_del(&li->hlist); 1599 hlist_del_rcu(&li->hlist);
1683 kill_li = 1; 1600 free_leaf_info(li);
1684 } 1601 }
1685 write_unlock_bh(&fib_lock);
1686
1687 if (kill_li)
1688 free_leaf_info(li);
1689 1602
1690 if (hlist_empty(&l->list)) 1603 if (hlist_empty(&l->list))
1691 trie_leaf_remove(t, key); 1604 trie_leaf_remove(t, key);
1692 1605
1693 if (fa->fa_state & FA_S_ACCESSED) 1606 if (fa->fa_state & FA_S_ACCESSED)
1694 rt_cache_flush(-1); 1607 rt_cache_flush(-1);
1695 1608
1696 fn_free_alias(fa); 1609 fib_release_info(fa->fa_info);
1697 return 0; 1610 alias_free_mem_rcu(fa);
1698 } 1611 return 0;
1699 return -ESRCH;
1700} 1612}
1701 1613
1702static int trie_flush_list(struct trie *t, struct list_head *head) 1614static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
1706 1618
1707 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1619 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1708 struct fib_info *fi = fa->fa_info; 1620 struct fib_info *fi = fa->fa_info;
1709
1710 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1711
1712 write_lock_bh(&fib_lock);
1713 list_del(&fa->fa_list);
1714 write_unlock_bh(&fib_lock);
1715 1621
1716 fn_free_alias(fa); 1622 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1623 list_del_rcu(&fa->fa_list);
1624 fib_release_info(fa->fa_info);
1625 alias_free_mem_rcu(fa);
1717 found++; 1626 found++;
1718 } 1627 }
1719 } 1628 }
@@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
1728 struct leaf_info *li = NULL; 1637 struct leaf_info *li = NULL;
1729 1638
1730 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1639 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1731
1732 found += trie_flush_list(t, &li->falh); 1640 found += trie_flush_list(t, &li->falh);
1733 1641
1734 if (list_empty(&li->falh)) { 1642 if (list_empty(&li->falh)) {
1735 1643 hlist_del_rcu(&li->hlist);
1736 write_lock_bh(&fib_lock);
1737 hlist_del(&li->hlist);
1738 write_unlock_bh(&fib_lock);
1739
1740 free_leaf_info(li); 1644 free_leaf_info(li);
1741 } 1645 }
1742 } 1646 }
1743 return found; 1647 return found;
1744} 1648}
1745 1649
1650/* rcu_read_lock needs to be held by caller from readside */
1651
1746static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1652static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1747{ 1653{
1748 struct node *c = (struct node *) thisleaf; 1654 struct node *c = (struct node *) thisleaf;
1749 struct tnode *p; 1655 struct tnode *p;
1750 int idx; 1656 int idx;
1657 struct node *trie = rcu_dereference(t->trie);
1751 1658
1752 if (c == NULL) { 1659 if (c == NULL) {
1753 if (t->trie == NULL) 1660 if (trie == NULL)
1754 return NULL; 1661 return NULL;
1755 1662
1756 if (IS_LEAF(t->trie)) /* trie w. just a leaf */ 1663 if (IS_LEAF(trie)) /* trie w. just a leaf */
1757 return (struct leaf *) t->trie; 1664 return (struct leaf *) trie;
1758 1665
1759 p = (struct tnode*) t->trie; /* Start */ 1666 p = (struct tnode*) trie; /* Start */
1760 } 1667 } else
1761 else
1762 p = (struct tnode *) NODE_PARENT(c); 1668 p = (struct tnode *) NODE_PARENT(c);
1763 1669
1764 while (p) { 1670 while (p) {
@@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1771 pos = 0; 1677 pos = 0;
1772 1678
1773 last = 1 << p->bits; 1679 last = 1 << p->bits;
1774 for(idx = pos; idx < last ; idx++) { 1680 for (idx = pos; idx < last ; idx++) {
1775 if (p->child[idx]) { 1681 c = rcu_dereference(p->child[idx]);
1776 1682
1777 /* Decend if tnode */ 1683 if (!c)
1778 1684 continue;
1779 while (IS_TNODE(p->child[idx])) { 1685
1780 p = (struct tnode*) p->child[idx]; 1686 /* Decend if tnode */
1781 idx = 0; 1687 while (IS_TNODE(c)) {
1782 1688 p = (struct tnode *) c;
1783 /* Rightmost non-NULL branch */ 1689 idx = 0;
1784 if (p && IS_TNODE(p)) 1690
1785 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; 1691 /* Rightmost non-NULL branch */
1786 1692 if (p && IS_TNODE(p))
1787 /* Done with this tnode? */ 1693 while (!(c = rcu_dereference(p->child[idx]))
1788 if (idx >= (1 << p->bits) || p->child[idx] == NULL ) 1694 && idx < (1<<p->bits)) idx++;
1789 goto up; 1695
1790 } 1696 /* Done with this tnode? */
1791 return (struct leaf*) p->child[idx]; 1697 if (idx >= (1 << p->bits) || !c)
1698 goto up;
1792 } 1699 }
1700 return (struct leaf *) c;
1793 } 1701 }
1794up: 1702up:
1795 /* No more children go up one step */ 1703 /* No more children go up one step */
1796 c = (struct node*) p; 1704 c = (struct node *) p;
1797 p = (struct tnode *) NODE_PARENT(p); 1705 p = (struct tnode *) NODE_PARENT(p);
1798 } 1706 }
1799 return NULL; /* Ready. Root of trie */ 1707 return NULL; /* Ready. Root of trie */
@@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb)
1807 1715
1808 t->revision++; 1716 t->revision++;
1809 1717
1810 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1718 rcu_read_lock();
1719 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1811 found += trie_flush_leaf(t, l); 1720 found += trie_flush_leaf(t, l);
1812 1721
1813 if (ll && hlist_empty(&ll->list)) 1722 if (ll && hlist_empty(&ll->list))
1814 trie_leaf_remove(t, ll->key); 1723 trie_leaf_remove(t, ll->key);
1815 ll = l; 1724 ll = l;
1816 } 1725 }
1726 rcu_read_unlock();
1817 1727
1818 if (ll && hlist_empty(&ll->list)) 1728 if (ll && hlist_empty(&ll->list))
1819 trie_leaf_remove(t, ll->key); 1729 trie_leaf_remove(t, ll->key);
1820 1730
1821 if (trie_debug) 1731 pr_debug("trie_flush found=%d\n", found);
1822 printk("trie_flush found=%d\n", found);
1823 return found; 1732 return found;
1824} 1733}
1825 1734
1826static int trie_last_dflt=-1; 1735static int trie_last_dflt = -1;
1827 1736
1828static void 1737static void
1829fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1738fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1840 last_resort = NULL; 1749 last_resort = NULL;
1841 order = -1; 1750 order = -1;
1842 1751
1843 read_lock(&fib_lock); 1752 rcu_read_lock();
1844 1753
1845 l = fib_find_node(t, 0); 1754 l = fib_find_node(t, 0);
1846 if (!l) 1755 if (!l)
@@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1853 if (list_empty(fa_head)) 1762 if (list_empty(fa_head))
1854 goto out; 1763 goto out;
1855 1764
1856 list_for_each_entry(fa, fa_head, fa_list) { 1765 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1857 struct fib_info *next_fi = fa->fa_info; 1766 struct fib_info *next_fi = fa->fa_info;
1858 1767
1859 if (fa->fa_scope != res->scope || 1768 if (fa->fa_scope != res->scope ||
1860 fa->fa_type != RTN_UNICAST) 1769 fa->fa_type != RTN_UNICAST)
1861 continue; 1770 continue;
1862 1771
1863 if (next_fi->fib_priority > res->fi->fib_priority) 1772 if (next_fi->fib_priority > res->fi->fib_priority)
1864 break; 1773 break;
1865 if (!next_fi->fib_nh[0].nh_gw || 1774 if (!next_fi->fib_nh[0].nh_gw ||
1866 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1775 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1867 continue; 1776 continue;
1868 fa->fa_state |= FA_S_ACCESSED; 1777 fa->fa_state |= FA_S_ACCESSED;
1869 1778
1870 if (fi == NULL) { 1779 if (fi == NULL) {
1871 if (next_fi != res->fi) 1780 if (next_fi != res->fi)
1872 break; 1781 break;
@@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
1904 } 1813 }
1905 trie_last_dflt = last_idx; 1814 trie_last_dflt = last_idx;
1906 out:; 1815 out:;
1907 read_unlock(&fib_lock); 1816 rcu_read_unlock();
1908} 1817}
1909 1818
1910static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1819static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1913 int i, s_i; 1822 int i, s_i;
1914 struct fib_alias *fa; 1823 struct fib_alias *fa;
1915 1824
1916 u32 xkey=htonl(key); 1825 u32 xkey = htonl(key);
1917 1826
1918 s_i=cb->args[3]; 1827 s_i = cb->args[3];
1919 i = 0; 1828 i = 0;
1920 1829
1921 list_for_each_entry(fa, fah, fa_list) { 1830 /* rcu_read_lock is hold by caller */
1831
1832 list_for_each_entry_rcu(fa, fah, fa_list) {
1922 if (i < s_i) { 1833 if (i < s_i) {
1923 i++; 1834 i++;
1924 continue; 1835 continue;
@@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
1946 fa->fa_info, 0) < 0) { 1857 fa->fa_info, 0) < 0) {
1947 cb->args[3] = i; 1858 cb->args[3] = i;
1948 return -1; 1859 return -1;
1949 } 1860 }
1950 i++; 1861 i++;
1951 } 1862 }
1952 cb->args[3]=i; 1863 cb->args[3] = i;
1953 return skb->len; 1864 return skb->len;
1954} 1865}
1955 1866
@@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1959 int h, s_h; 1870 int h, s_h;
1960 struct list_head *fa_head; 1871 struct list_head *fa_head;
1961 struct leaf *l = NULL; 1872 struct leaf *l = NULL;
1962 s_h=cb->args[2];
1963 1873
1964 for (h=0; (l = nextleaf(t, l)) != NULL; h++) { 1874 s_h = cb->args[2];
1965 1875
1876 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1966 if (h < s_h) 1877 if (h < s_h)
1967 continue; 1878 continue;
1968 if (h > s_h) 1879 if (h > s_h)
@@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1970 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1881 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1971 1882
1972 fa_head = get_fa_head(l, plen); 1883 fa_head = get_fa_head(l, plen);
1973 1884
1974 if (!fa_head) 1885 if (!fa_head)
1975 continue; 1886 continue;
1976 1887
@@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
1978 continue; 1889 continue;
1979 1890
1980 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1891 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1981 cb->args[2]=h; 1892 cb->args[2] = h;
1982 return -1; 1893 return -1;
1983 } 1894 }
1984 } 1895 }
1985 cb->args[2]=h; 1896 cb->args[2] = h;
1986 return skb->len; 1897 return skb->len;
1987} 1898}
1988 1899
@@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
1993 1904
1994 s_m = cb->args[1]; 1905 s_m = cb->args[1];
1995 1906
1996 read_lock(&fib_lock); 1907 rcu_read_lock();
1997 for (m=0; m<=32; m++) { 1908 for (m = 0; m <= 32; m++) {
1998
1999 if (m < s_m) 1909 if (m < s_m)
2000 continue; 1910 continue;
2001 if (m > s_m) 1911 if (m > s_m)
2002 memset(&cb->args[2], 0, 1912 memset(&cb->args[2], 0,
2003 sizeof(cb->args) - 2*sizeof(cb->args[0])); 1913 sizeof(cb->args) - 2*sizeof(cb->args[0]));
2004 1914
2005 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { 1915 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2006 cb->args[1] = m; 1916 cb->args[1] = m;
2007 goto out; 1917 goto out;
2008 } 1918 }
2009 } 1919 }
2010 read_unlock(&fib_lock); 1920 rcu_read_unlock();
2011 cb->args[1] = m; 1921 cb->args[1] = m;
2012 return skb->len; 1922 return skb->len;
2013 out: 1923out:
2014 read_unlock(&fib_lock); 1924 rcu_read_unlock();
2015 return -1; 1925 return -1;
2016} 1926}
2017 1927
@@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id)
2051 trie_init(t); 1961 trie_init(t);
2052 1962
2053 if (id == RT_TABLE_LOCAL) 1963 if (id == RT_TABLE_LOCAL)
2054 trie_local = t; 1964 trie_local = t;
2055 else if (id == RT_TABLE_MAIN) 1965 else if (id == RT_TABLE_MAIN)
2056 trie_main = t; 1966 trie_main = t;
2057 1967
2058 if (id == RT_TABLE_LOCAL) 1968 if (id == RT_TABLE_LOCAL)
2059 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); 1969 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id)
2065 1975
2066static void putspace_seq(struct seq_file *seq, int n) 1976static void putspace_seq(struct seq_file *seq, int n)
2067{ 1977{
2068 while (n--) seq_printf(seq, " "); 1978 while (n--)
1979 seq_printf(seq, " ");
2069} 1980}
2070 1981
2071static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) 1982static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
@@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2086 seq_printf(seq, "%d/", cindex); 1997 seq_printf(seq, "%d/", cindex);
2087 printbin_seq(seq, cindex, bits); 1998 printbin_seq(seq, cindex, bits);
2088 seq_printf(seq, ": "); 1999 seq_printf(seq, ": ");
2089 } 2000 } else
2090 else
2091 seq_printf(seq, "<root>: "); 2001 seq_printf(seq, "<root>: ");
2092 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); 2002 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2093 2003
2094 if (IS_LEAF(n))
2095 seq_printf(seq, "key=%d.%d.%d.%d\n",
2096 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2097 else {
2098 int plen = ((struct tnode *)n)->pos;
2099 t_key prf=MASK_PFX(n->key, plen);
2100 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2101 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2102 }
2103 if (IS_LEAF(n)) { 2004 if (IS_LEAF(n)) {
2104 struct leaf *l=(struct leaf *)n; 2005 struct leaf *l = (struct leaf *)n;
2105 struct fib_alias *fa; 2006 struct fib_alias *fa;
2106 int i; 2007 int i;
2107 for (i=32; i>=0; i--) 2008
2108 if (find_leaf_info(&l->list, i)) { 2009 seq_printf(seq, "key=%d.%d.%d.%d\n",
2109 2010 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2011
2012 for (i = 32; i >= 0; i--)
2013 if (find_leaf_info(&l->list, i)) {
2110 struct list_head *fa_head = get_fa_head(l, i); 2014 struct list_head *fa_head = get_fa_head(l, i);
2111 2015
2112 if (!fa_head) 2016 if (!fa_head)
2113 continue; 2017 continue;
2114 2018
@@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2118 putspace_seq(seq, indent+2); 2022 putspace_seq(seq, indent+2);
2119 seq_printf(seq, "{/%d...dumping}\n", i); 2023 seq_printf(seq, "{/%d...dumping}\n", i);
2120 2024
2121 2025 list_for_each_entry_rcu(fa, fa_head, fa_list) {
2122 list_for_each_entry(fa, fa_head, fa_list) {
2123 putspace_seq(seq, indent+2); 2026 putspace_seq(seq, indent+2);
2124 if (fa->fa_info->fib_nh == NULL) {
2125 seq_printf(seq, "Error _fib_nh=NULL\n");
2126 continue;
2127 }
2128 if (fa->fa_info == NULL) { 2027 if (fa->fa_info == NULL) {
2129 seq_printf(seq, "Error fa_info=NULL\n"); 2028 seq_printf(seq, "Error fa_info=NULL\n");
2130 continue; 2029 continue;
2131 } 2030 }
2031 if (fa->fa_info->fib_nh == NULL) {
2032 seq_printf(seq, "Error _fib_nh=NULL\n");
2033 continue;
2034 }
2132 2035
2133 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", 2036 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2134 fa->fa_type, 2037 fa->fa_type,
@@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2136 fa->fa_tos); 2039 fa->fa_tos);
2137 } 2040 }
2138 } 2041 }
2139 } 2042 } else {
2140 else if (IS_TNODE(n)) {
2141 struct tnode *tn = (struct tnode *)n; 2043 struct tnode *tn = (struct tnode *)n;
2044 int plen = ((struct tnode *)n)->pos;
2045 t_key prf = MASK_PFX(n->key, plen);
2046
2047 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2048 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2049
2142 putspace_seq(seq, indent); seq_printf(seq, "| "); 2050 putspace_seq(seq, indent); seq_printf(seq, "| ");
2143 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); 2051 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2144 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); 2052 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2145 seq_printf(seq, "}\n"); 2053 seq_printf(seq, "}\n");
2146 putspace_seq(seq, indent); seq_printf(seq, "| "); 2054 putspace_seq(seq, indent); seq_printf(seq, "| ");
@@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2154 2062
2155static void trie_dump_seq(struct seq_file *seq, struct trie *t) 2063static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2156{ 2064{
2157 struct node *n = t->trie; 2065 struct node *n;
2158 int cindex=0; 2066 int cindex = 0;
2159 int indent=1; 2067 int indent = 1;
2160 int pend=0; 2068 int pend = 0;
2161 int depth = 0; 2069 int depth = 0;
2070 struct tnode *tn;
2162 2071
2163 read_lock(&fib_lock); 2072 rcu_read_lock();
2164 2073 n = rcu_dereference(t->trie);
2165 seq_printf(seq, "------ trie_dump of t=%p ------\n", t); 2074 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2166 if (n) {
2167 printnode_seq(seq, indent, n, pend, cindex, 0);
2168 if (IS_TNODE(n)) {
2169 struct tnode *tn = (struct tnode *)n;
2170 pend = tn->pos+tn->bits;
2171 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2172 indent += 3;
2173 depth++;
2174
2175 while (tn && cindex < (1 << tn->bits)) {
2176 if (tn->child[cindex]) {
2177
2178 /* Got a child */
2179
2180 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2181 if (IS_LEAF(tn->child[cindex])) {
2182 cindex++;
2183
2184 }
2185 else {
2186 /*
2187 * New tnode. Decend one level
2188 */
2189
2190 depth++;
2191 n = tn->child[cindex];
2192 tn = (struct tnode *)n;
2193 pend = tn->pos+tn->bits;
2194 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2195 indent+=3;
2196 cindex=0;
2197 }
2198 }
2199 else
2200 cindex++;
2201 2075
2076 if (!n) {
2077 seq_printf(seq, "------ trie is empty\n");
2078
2079 rcu_read_unlock();
2080 return;
2081 }
2082
2083 printnode_seq(seq, indent, n, pend, cindex, 0);
2084
2085 if (!IS_TNODE(n)) {
2086 rcu_read_unlock();
2087 return;
2088 }
2089
2090 tn = (struct tnode *)n;
2091 pend = tn->pos+tn->bits;
2092 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2093 indent += 3;
2094 depth++;
2095
2096 while (tn && cindex < (1 << tn->bits)) {
2097 struct node *child = rcu_dereference(tn->child[cindex]);
2098 if (!child)
2099 cindex++;
2100 else {
2101 /* Got a child */
2102 printnode_seq(seq, indent, child, pend,
2103 cindex, tn->bits);
2104
2105 if (IS_LEAF(child))
2106 cindex++;
2107
2108 else {
2202 /* 2109 /*
2203 * Test if we are done 2110 * New tnode. Decend one level
2204 */ 2111 */
2205
2206 while (cindex >= (1 << tn->bits)) {
2207 2112
2208 /* 2113 depth++;
2209 * Move upwards and test for root 2114 n = child;
2210 * pop off all traversed nodes 2115 tn = (struct tnode *)n;
2211 */ 2116 pend = tn->pos+tn->bits;
2212 2117 putspace_seq(seq, indent);
2213 if (NODE_PARENT(tn) == NULL) { 2118 seq_printf(seq, "\\--\n");
2214 tn = NULL; 2119 indent += 3;
2215 n = NULL; 2120 cindex = 0;
2216 break;
2217 }
2218 else {
2219 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2220 tn = NODE_PARENT(tn);
2221 cindex++;
2222 n = (struct node *)tn;
2223 pend = tn->pos+tn->bits;
2224 indent-=3;
2225 depth--;
2226 }
2227 }
2228 } 2121 }
2229 } 2122 }
2230 else n = NULL;
2231 }
2232 else seq_printf(seq, "------ trie is empty\n");
2233 2123
2234 read_unlock(&fib_lock); 2124 /*
2125 * Test if we are done
2126 */
2127
2128 while (cindex >= (1 << tn->bits)) {
2129 /*
2130 * Move upwards and test for root
2131 * pop off all traversed nodes
2132 */
2133
2134 if (NODE_PARENT(tn) == NULL) {
2135 tn = NULL;
2136 break;
2137 }
2138
2139 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2140 cindex++;
2141 tn = NODE_PARENT(tn);
2142 pend = tn->pos + tn->bits;
2143 indent -= 3;
2144 depth--;
2145 }
2146 }
2147 rcu_read_unlock();
2235} 2148}
2236 2149
2237static struct trie_stat *trie_stat_new(void) 2150static struct trie_stat *trie_stat_new(void)
2238{ 2151{
2239 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); 2152 struct trie_stat *s;
2240 int i; 2153 int i;
2241 2154
2242 if (s) { 2155 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2243 s->totdepth = 0; 2156 if (!s)
2244 s->maxdepth = 0; 2157 return NULL;
2245 s->tnodes = 0; 2158
2246 s->leaves = 0; 2159 s->totdepth = 0;
2247 s->nullpointers = 0; 2160 s->maxdepth = 0;
2248 2161 s->tnodes = 0;
2249 for(i=0; i< MAX_CHILDS; i++) 2162 s->leaves = 0;
2250 s->nodesizes[i] = 0; 2163 s->nullpointers = 0;
2251 } 2164
2165 for (i = 0; i < MAX_CHILDS; i++)
2166 s->nodesizes[i] = 0;
2167
2252 return s; 2168 return s;
2253} 2169}
2254 2170
2255static struct trie_stat *trie_collect_stats(struct trie *t) 2171static struct trie_stat *trie_collect_stats(struct trie *t)
2256{ 2172{
2257 struct node *n = t->trie; 2173 struct node *n;
2258 struct trie_stat *s = trie_stat_new(); 2174 struct trie_stat *s = trie_stat_new();
2259 int cindex = 0; 2175 int cindex = 0;
2260 int indent = 1;
2261 int pend = 0; 2176 int pend = 0;
2262 int depth = 0; 2177 int depth = 0;
2263 2178
2264 read_lock(&fib_lock); 2179 if (!s)
2180 return NULL;
2265 2181
2266 if (s) { 2182 rcu_read_lock();
2267 if (n) { 2183 n = rcu_dereference(t->trie);
2268 if (IS_TNODE(n)) {
2269 struct tnode *tn = (struct tnode *)n;
2270 pend = tn->pos+tn->bits;
2271 indent += 3;
2272 s->nodesizes[tn->bits]++;
2273 depth++;
2274 2184
2275 while (tn && cindex < (1 << tn->bits)) { 2185 if (!n)
2276 if (tn->child[cindex]) { 2186 return s;
2277 /* Got a child */ 2187
2278 2188 if (IS_TNODE(n)) {
2279 if (IS_LEAF(tn->child[cindex])) { 2189 struct tnode *tn = (struct tnode *)n;
2280 cindex++; 2190 pend = tn->pos+tn->bits;
2281 2191 s->nodesizes[tn->bits]++;
2282 /* stats */ 2192 depth++;
2283 if (depth > s->maxdepth) 2193
2284 s->maxdepth = depth; 2194 while (tn && cindex < (1 << tn->bits)) {
2285 s->totdepth += depth; 2195 struct node *ch = rcu_dereference(tn->child[cindex]);
2286 s->leaves++; 2196 if (ch) {
2287 }
2288
2289 else {
2290 /*
2291 * New tnode. Decend one level
2292 */
2293
2294 s->tnodes++;
2295 s->nodesizes[tn->bits]++;
2296 depth++;
2297
2298 n = tn->child[cindex];
2299 tn = (struct tnode *)n;
2300 pend = tn->pos+tn->bits;
2301
2302 indent += 3;
2303 cindex = 0;
2304 }
2305 }
2306 else {
2307 cindex++;
2308 s->nullpointers++;
2309 }
2310 2197
2198 /* Got a child */
2199
2200 if (IS_LEAF(tn->child[cindex])) {
2201 cindex++;
2202
2203 /* stats */
2204 if (depth > s->maxdepth)
2205 s->maxdepth = depth;
2206 s->totdepth += depth;
2207 s->leaves++;
2208 } else {
2311 /* 2209 /*
2312 * Test if we are done 2210 * New tnode. Decend one level
2313 */ 2211 */
2314 2212
2315 while (cindex >= (1 << tn->bits)) { 2213 s->tnodes++;
2316 2214 s->nodesizes[tn->bits]++;
2317 /* 2215 depth++;
2318 * Move upwards and test for root 2216
2319 * pop off all traversed nodes 2217 n = ch;
2320 */ 2218 tn = (struct tnode *)n;
2321 2219 pend = tn->pos+tn->bits;
2322 2220
2323 if (NODE_PARENT(tn) == NULL) { 2221 cindex = 0;
2324 tn = NULL;
2325 n = NULL;
2326 break;
2327 }
2328 else {
2329 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2330 tn = NODE_PARENT(tn);
2331 cindex++;
2332 n = (struct node *)tn;
2333 pend = tn->pos+tn->bits;
2334 indent -= 3;
2335 depth--;
2336 }
2337 }
2338 } 2222 }
2223 } else {
2224 cindex++;
2225 s->nullpointers++;
2339 } 2226 }
2340 else n = NULL; 2227
2228 /*
2229 * Test if we are done
2230 */
2231
2232 while (cindex >= (1 << tn->bits)) {
2233 /*
2234 * Move upwards and test for root
2235 * pop off all traversed nodes
2236 */
2237
2238 if (NODE_PARENT(tn) == NULL) {
2239 tn = NULL;
2240 n = NULL;
2241 break;
2242 }
2243
2244 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2245 tn = NODE_PARENT(tn);
2246 cindex++;
2247 n = (struct node *)tn;
2248 pend = tn->pos+tn->bits;
2249 depth--;
2250 }
2341 } 2251 }
2342 } 2252 }
2343 2253
2344 read_unlock(&fib_lock); 2254 rcu_read_unlock();
2345 return s; 2255 return s;
2346} 2256}
2347 2257
@@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2359 2269
2360static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) 2270static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2361{ 2271{
2362 void *v = NULL; 2272 if (!ip_fib_main_table)
2273 return NULL;
2363 2274
2364 if (ip_fib_main_table) 2275 if (*pos)
2365 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; 2276 return fib_triestat_get_next(seq);
2366 return v; 2277 else
2278 return SEQ_START_TOKEN;
2367} 2279}
2368 2280
2369static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2281static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2370{ 2282{
2371 ++*pos; 2283 ++*pos;
2372 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); 2284 if (v == SEQ_START_TOKEN)
2285 return fib_triestat_get_first(seq);
2286 else
2287 return fib_triestat_get_next(seq);
2373} 2288}
2374 2289
2375static void fib_triestat_seq_stop(struct seq_file *seq, void *v) 2290static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
@@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2388{ 2303{
2389 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ 2304 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2390 int i, max, pointers; 2305 int i, max, pointers;
2391 struct trie_stat *stat; 2306 struct trie_stat *stat;
2392 int avdepth; 2307 int avdepth;
2393 2308
2394 stat = trie_collect_stats(t); 2309 stat = trie_collect_stats(t);
2395 2310
2396 bytes=0; 2311 bytes = 0;
2397 seq_printf(seq, "trie=%p\n", t); 2312 seq_printf(seq, "trie=%p\n", t);
2398 2313
2399 if (stat) { 2314 if (stat) {
2400 if (stat->leaves) 2315 if (stat->leaves)
2401 avdepth=stat->totdepth*100 / stat->leaves; 2316 avdepth = stat->totdepth*100 / stat->leaves;
2402 else 2317 else
2403 avdepth=0; 2318 avdepth = 0;
2404 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2319 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2405 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); 2320 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2406 2321
2407 seq_printf(seq, "Leaves: %d\n", stat->leaves); 2322 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2408 bytes += sizeof(struct leaf) * stat->leaves; 2323 bytes += sizeof(struct leaf) * stat->leaves;
2409 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); 2324 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2455 2370
2456 if (trie_main) 2371 if (trie_main)
2457 collect_and_show(trie_main, seq); 2372 collect_and_show(trie_main, seq);
2458 } 2373 } else {
2459 else { 2374 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2460 snprintf(bf, sizeof(bf), 2375
2461 "*\t%08X\t%08X", 200, 400);
2462
2463 seq_printf(seq, "%-127s\n", bf); 2376 seq_printf(seq, "%-127s\n", bf);
2464 } 2377 }
2465 return 0; 2378 return 0;
@@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2520 2433
2521static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) 2434static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2522{ 2435{
2523 void *v = NULL; 2436 if (!ip_fib_main_table)
2437 return NULL;
2524 2438
2525 if (ip_fib_main_table) 2439 if (*pos)
2526 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; 2440 return fib_trie_get_next(seq);
2527 return v; 2441 else
2442 return SEQ_START_TOKEN;
2528} 2443}
2529 2444
2530static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2445static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531{ 2446{
2532 ++*pos; 2447 ++*pos;
2533 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); 2448 if (v == SEQ_START_TOKEN)
2449 return fib_trie_get_first(seq);
2450 else
2451 return fib_trie_get_next(seq);
2452
2534} 2453}
2535 2454
2536static void fib_trie_seq_stop(struct seq_file *seq, void *v) 2455static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2537{ 2456{
2538
2539} 2457}
2540 2458
2541/* 2459/*
@@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2555 2473
2556 if (trie_main) 2474 if (trie_main)
2557 trie_dump_seq(seq, trie_main); 2475 trie_dump_seq(seq, trie_main);
2558 } 2476 } else {
2559
2560 else {
2561 snprintf(bf, sizeof(bf), 2477 snprintf(bf, sizeof(bf),
2562 "*\t%08X\t%08X", 200, 400); 2478 "*\t%08X\t%08X", 200, 400);
2563 seq_printf(seq, "%-127s\n", bf); 2479 seq_printf(seq, "%-127s\n", bf);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index badfc5849973..24eb56ae1b5a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
114/* 114/*
115 * Statistics 115 * Statistics
116 */ 116 */
117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); 117DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
118 118
119/* An array of errno for error messages from dest unreach. */ 119/* An array of errno for error messages from dest unreach. */
120/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ 120/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
627 break; 627 break;
628 case ICMP_FRAG_NEEDED: 628 case ICMP_FRAG_NEEDED:
629 if (ipv4_config.no_pmtu_disc) { 629 if (ipv4_config.no_pmtu_disc) {
630 LIMIT_NETDEBUG( 630 LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
631 printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
632 "fragmentation needed " 631 "fragmentation needed "
633 "and DF set.\n", 632 "and DF set.\n",
634 NIPQUAD(iph->daddr))); 633 NIPQUAD(iph->daddr));
635 } else { 634 } else {
636 info = ip_rt_frag_needed(iph, 635 info = ip_rt_frag_needed(iph,
637 ntohs(icmph->un.frag.mtu)); 636 ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
640 } 639 }
641 break; 640 break;
642 case ICMP_SR_FAILED: 641 case ICMP_SR_FAILED:
643 LIMIT_NETDEBUG( 642 LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
644 printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
645 "Route Failed.\n", 643 "Route Failed.\n",
646 NIPQUAD(iph->daddr))); 644 NIPQUAD(iph->daddr));
647 break; 645 break;
648 default: 646 default:
649 break; 647 break;
@@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_HW: 934 case CHECKSUM_HW:
937 if (!(u16)csum_fold(skb->csum)) 935 if (!(u16)csum_fold(skb->csum))
938 break; 936 break;
939 LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); 937 LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
940 case CHECKSUM_NONE: 938 case CHECKSUM_NONE:
941 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) 939 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
942 goto error; 940 goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835ae..44607f4767b8 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
904 case IGMP_MTRACE_RESP: 904 case IGMP_MTRACE_RESP:
905 break; 905 break;
906 default: 906 default:
907 NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); 907 NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
908 } 908 }
909 in_dev_put(in_dev); 909 in_dev_put(in_dev);
910 kfree_skb(skb); 910 kfree_skb(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000000..fe3c6d3d0c91
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Support for INET connection oriented protocols.
7 *
8 * Authors: See the TCP sources
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or(at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/jhash.h>
19
20#include <net/inet_connection_sock.h>
21#include <net/inet_hashtables.h>
22#include <net/inet_timewait_sock.h>
23#include <net/ip.h>
24#include <net/route.h>
25#include <net/tcp_states.h>
26#include <net/xfrm.h>
27
28#ifdef INET_CSK_DEBUG
29const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
30EXPORT_SYMBOL(inet_csk_timer_bug_msg);
31#endif
32
33/*
34 * This array holds the first and last local port number.
35 * For high-usage systems, use sysctl to change this to
36 * 32768-61000
37 */
38int sysctl_local_port_range[2] = { 1024, 4999 };
39
40static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
41{
42 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
43 struct sock *sk2;
44 struct hlist_node *node;
45 int reuse = sk->sk_reuse;
46
47 sk_for_each_bound(sk2, node, &tb->owners) {
48 if (sk != sk2 &&
49 !inet_v6_ipv6only(sk2) &&
50 (!sk->sk_bound_dev_if ||
51 !sk2->sk_bound_dev_if ||
52 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
53 if (!reuse || !sk2->sk_reuse ||
54 sk2->sk_state == TCP_LISTEN) {
55 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
56 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
57 sk2_rcv_saddr == sk_rcv_saddr)
58 break;
59 }
60 }
61 }
62 return node != NULL;
63}
64
65/* Obtain a reference to a local port for the given sock,
66 * if snum is zero it means select any available local port.
67 */
68int inet_csk_get_port(struct inet_hashinfo *hashinfo,
69 struct sock *sk, unsigned short snum)
70{
71 struct inet_bind_hashbucket *head;
72 struct hlist_node *node;
73 struct inet_bind_bucket *tb;
74 int ret;
75
76 local_bh_disable();
77 if (!snum) {
78 int low = sysctl_local_port_range[0];
79 int high = sysctl_local_port_range[1];
80 int remaining = (high - low) + 1;
81 int rover;
82
83 spin_lock(&hashinfo->portalloc_lock);
84 if (hashinfo->port_rover < low)
85 rover = low;
86 else
87 rover = hashinfo->port_rover;
88 do {
89 rover++;
90 if (rover > high)
91 rover = low;
92 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
93 spin_lock(&head->lock);
94 inet_bind_bucket_for_each(tb, node, &head->chain)
95 if (tb->port == rover)
96 goto next;
97 break;
98 next:
99 spin_unlock(&head->lock);
100 } while (--remaining > 0);
101 hashinfo->port_rover = rover;
102 spin_unlock(&hashinfo->portalloc_lock);
103
104 /* Exhausted local port range during search? It is not
105 * possible for us to be holding one of the bind hash
106 * locks if this test triggers, because if 'remaining'
107 * drops to zero, we broke out of the do/while loop at
108 * the top level, not from the 'break;' statement.
109 */
110 ret = 1;
111 if (remaining <= 0)
112 goto fail;
113
114 /* OK, here is the one we will use. HEAD is
115 * non-NULL and we hold it's mutex.
116 */
117 snum = rover;
118 } else {
119 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
120 spin_lock(&head->lock);
121 inet_bind_bucket_for_each(tb, node, &head->chain)
122 if (tb->port == snum)
123 goto tb_found;
124 }
125 tb = NULL;
126 goto tb_not_found;
127tb_found:
128 if (!hlist_empty(&tb->owners)) {
129 if (sk->sk_reuse > 1)
130 goto success;
131 if (tb->fastreuse > 0 &&
132 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
133 goto success;
134 } else {
135 ret = 1;
136 if (inet_csk_bind_conflict(sk, tb))
137 goto fail_unlock;
138 }
139 }
140tb_not_found:
141 ret = 1;
142 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
143 goto fail_unlock;
144 if (hlist_empty(&tb->owners)) {
145 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
146 tb->fastreuse = 1;
147 else
148 tb->fastreuse = 0;
149 } else if (tb->fastreuse &&
150 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
151 tb->fastreuse = 0;
152success:
153 if (!inet_csk(sk)->icsk_bind_hash)
154 inet_bind_hash(sk, tb, snum);
155 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
156 ret = 0;
157
158fail_unlock:
159 spin_unlock(&head->lock);
160fail:
161 local_bh_enable();
162 return ret;
163}
164
165EXPORT_SYMBOL_GPL(inet_csk_get_port);
166
167/*
168 * Wait for an incoming connection, avoid race conditions. This must be called
169 * with the socket locked.
170 */
171static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
172{
173 struct inet_connection_sock *icsk = inet_csk(sk);
174 DEFINE_WAIT(wait);
175 int err;
176
177 /*
178 * True wake-one mechanism for incoming connections: only
179 * one process gets woken up, not the 'whole herd'.
180 * Since we do not 'race & poll' for established sockets
181 * anymore, the common case will execute the loop only once.
182 *
183 * Subtle issue: "add_wait_queue_exclusive()" will be added
184 * after any current non-exclusive waiters, and we know that
185 * it will always _stay_ after any new non-exclusive waiters
186 * because all non-exclusive waiters are added at the
187 * beginning of the wait-queue. As such, it's ok to "drop"
188 * our exclusiveness temporarily when we get woken up without
189 * having to remove and re-insert us on the wait queue.
190 */
191 for (;;) {
192 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
193 TASK_INTERRUPTIBLE);
194 release_sock(sk);
195 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
196 timeo = schedule_timeout(timeo);
197 lock_sock(sk);
198 err = 0;
199 if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
200 break;
201 err = -EINVAL;
202 if (sk->sk_state != TCP_LISTEN)
203 break;
204 err = sock_intr_errno(timeo);
205 if (signal_pending(current))
206 break;
207 err = -EAGAIN;
208 if (!timeo)
209 break;
210 }
211 finish_wait(sk->sk_sleep, &wait);
212 return err;
213}
214
215/*
216 * This will accept the next outstanding connection.
217 */
218struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
219{
220 struct inet_connection_sock *icsk = inet_csk(sk);
221 struct sock *newsk;
222 int error;
223
224 lock_sock(sk);
225
226 /* We need to make sure that this socket is listening,
227 * and that it has something pending.
228 */
229 error = -EINVAL;
230 if (sk->sk_state != TCP_LISTEN)
231 goto out_err;
232
233 /* Find already established connection */
234 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
235 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
236
237 /* If this is a non blocking socket don't sleep */
238 error = -EAGAIN;
239 if (!timeo)
240 goto out_err;
241
242 error = inet_csk_wait_for_connect(sk, timeo);
243 if (error)
244 goto out_err;
245 }
246
247 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
248 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
249out:
250 release_sock(sk);
251 return newsk;
252out_err:
253 newsk = NULL;
254 *err = error;
255 goto out;
256}
257
258EXPORT_SYMBOL(inet_csk_accept);
259
260/*
261 * Using different timers for retransmit, delayed acks and probes
262 * We may wish use just one timer maintaining a list of expire jiffies
263 * to optimize.
264 */
265void inet_csk_init_xmit_timers(struct sock *sk,
266 void (*retransmit_handler)(unsigned long),
267 void (*delack_handler)(unsigned long),
268 void (*keepalive_handler)(unsigned long))
269{
270 struct inet_connection_sock *icsk = inet_csk(sk);
271
272 init_timer(&icsk->icsk_retransmit_timer);
273 init_timer(&icsk->icsk_delack_timer);
274 init_timer(&sk->sk_timer);
275
276 icsk->icsk_retransmit_timer.function = retransmit_handler;
277 icsk->icsk_delack_timer.function = delack_handler;
278 sk->sk_timer.function = keepalive_handler;
279
280 icsk->icsk_retransmit_timer.data =
281 icsk->icsk_delack_timer.data =
282 sk->sk_timer.data = (unsigned long)sk;
283
284 icsk->icsk_pending = icsk->icsk_ack.pending = 0;
285}
286
287EXPORT_SYMBOL(inet_csk_init_xmit_timers);
288
289void inet_csk_clear_xmit_timers(struct sock *sk)
290{
291 struct inet_connection_sock *icsk = inet_csk(sk);
292
293 icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
294
295 sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
296 sk_stop_timer(sk, &icsk->icsk_delack_timer);
297 sk_stop_timer(sk, &sk->sk_timer);
298}
299
300EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
301
302void inet_csk_delete_keepalive_timer(struct sock *sk)
303{
304 sk_stop_timer(sk, &sk->sk_timer);
305}
306
307EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
308
309void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
310{
311 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
312}
313
314EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
315
316struct dst_entry* inet_csk_route_req(struct sock *sk,
317 const struct request_sock *req)
318{
319 struct rtable *rt;
320 const struct inet_request_sock *ireq = inet_rsk(req);
321 struct ip_options *opt = inet_rsk(req)->opt;
322 struct flowi fl = { .oif = sk->sk_bound_dev_if,
323 .nl_u = { .ip4_u =
324 { .daddr = ((opt && opt->srr) ?
325 opt->faddr :
326 ireq->rmt_addr),
327 .saddr = ireq->loc_addr,
328 .tos = RT_CONN_FLAGS(sk) } },
329 .proto = sk->sk_protocol,
330 .uli_u = { .ports =
331 { .sport = inet_sk(sk)->sport,
332 .dport = ireq->rmt_port } } };
333
334 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
335 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
336 return NULL;
337 }
338 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
339 ip_rt_put(rt);
340 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
341 return NULL;
342 }
343 return &rt->u.dst;
344}
345
346EXPORT_SYMBOL_GPL(inet_csk_route_req);
347
348static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
349 const u32 rnd, const u16 synq_hsize)
350{
351 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
352}
353
354#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
355#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
356#else
357#define AF_INET_FAMILY(fam) 1
358#endif
359
360struct request_sock *inet_csk_search_req(const struct sock *sk,
361 struct request_sock ***prevp,
362 const __u16 rport, const __u32 raddr,
363 const __u32 laddr)
364{
365 const struct inet_connection_sock *icsk = inet_csk(sk);
366 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
367 struct request_sock *req, **prev;
368
369 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
370 lopt->nr_table_entries)];
371 (req = *prev) != NULL;
372 prev = &req->dl_next) {
373 const struct inet_request_sock *ireq = inet_rsk(req);
374
375 if (ireq->rmt_port == rport &&
376 ireq->rmt_addr == raddr &&
377 ireq->loc_addr == laddr &&
378 AF_INET_FAMILY(req->rsk_ops->family)) {
379 BUG_TRAP(!req->sk);
380 *prevp = prev;
381 break;
382 }
383 }
384
385 return req;
386}
387
388EXPORT_SYMBOL_GPL(inet_csk_search_req);
389
390void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
391 const unsigned timeout)
392{
393 struct inet_connection_sock *icsk = inet_csk(sk);
394 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
395 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
396 lopt->hash_rnd, lopt->nr_table_entries);
397
398 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
399 inet_csk_reqsk_queue_added(sk, timeout);
400}
401
402/* Only thing we need from tcp.h */
403extern int sysctl_tcp_synack_retries;
404
405EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
406
407void inet_csk_reqsk_queue_prune(struct sock *parent,
408 const unsigned long interval,
409 const unsigned long timeout,
410 const unsigned long max_rto)
411{
412 struct inet_connection_sock *icsk = inet_csk(parent);
413 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
414 struct listen_sock *lopt = queue->listen_opt;
415 int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
416 int thresh = max_retries;
417 unsigned long now = jiffies;
418 struct request_sock **reqp, *req;
419 int i, budget;
420
421 if (lopt == NULL || lopt->qlen == 0)
422 return;
423
424 /* Normally all the openreqs are young and become mature
425 * (i.e. converted to established socket) for first timeout.
426 * If synack was not acknowledged for 3 seconds, it means
427 * one of the following things: synack was lost, ack was lost,
428 * rtt is high or nobody planned to ack (i.e. synflood).
429 * When server is a bit loaded, queue is populated with old
430 * open requests, reducing effective size of queue.
431 * When server is well loaded, queue size reduces to zero
432 * after several minutes of work. It is not synflood,
433 * it is normal operation. The solution is pruning
434 * too old entries overriding normal timeout, when
435 * situation becomes dangerous.
436 *
437 * Essentially, we reserve half of room for young
438 * embrions; and abort old ones without pity, if old
439 * ones are about to clog our table.
440 */
441 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
442 int young = (lopt->qlen_young<<1);
443
444 while (thresh > 2) {
445 if (lopt->qlen < young)
446 break;
447 thresh--;
448 young <<= 1;
449 }
450 }
451
452 if (queue->rskq_defer_accept)
453 max_retries = queue->rskq_defer_accept;
454
455 budget = 2 * (lopt->nr_table_entries / (timeout / interval));
456 i = lopt->clock_hand;
457
458 do {
459 reqp=&lopt->syn_table[i];
460 while ((req = *reqp) != NULL) {
461 if (time_after_eq(now, req->expires)) {
462 if ((req->retrans < thresh ||
463 (inet_rsk(req)->acked && req->retrans < max_retries))
464 && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
465 unsigned long timeo;
466
467 if (req->retrans++ == 0)
468 lopt->qlen_young--;
469 timeo = min((timeout << req->retrans), max_rto);
470 req->expires = now + timeo;
471 reqp = &req->dl_next;
472 continue;
473 }
474
475 /* Drop this request */
476 inet_csk_reqsk_queue_unlink(parent, req, reqp);
477 reqsk_queue_removed(queue, req);
478 reqsk_free(req);
479 continue;
480 }
481 reqp = &req->dl_next;
482 }
483
484 i = (i + 1) & (lopt->nr_table_entries - 1);
485
486 } while (--budget > 0);
487
488 lopt->clock_hand = i;
489
490 if (lopt->qlen)
491 inet_csk_reset_keepalive_timer(parent, interval);
492}
493
494EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
495
496struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
497 const unsigned int __nocast priority)
498{
499 struct sock *newsk = sk_clone(sk, priority);
500
501 if (newsk != NULL) {
502 struct inet_connection_sock *newicsk = inet_csk(newsk);
503
504 newsk->sk_state = TCP_SYN_RECV;
505 newicsk->icsk_bind_hash = NULL;
506
507 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
508 newsk->sk_write_space = sk_stream_write_space;
509
510 newicsk->icsk_retransmits = 0;
511 newicsk->icsk_backoff = 0;
512 newicsk->icsk_probes_out = 0;
513
514 /* Deinitialize accept_queue to trap illegal accesses. */
515 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
516 }
517 return newsk;
518}
519
520EXPORT_SYMBOL_GPL(inet_csk_clone);
521
522/*
523 * At this point, there should be no process reference to this
524 * socket, and thus no user references at all. Therefore we
525 * can assume the socket waitqueue is inactive and nobody will
526 * try to jump onto it.
527 */
528void inet_csk_destroy_sock(struct sock *sk)
529{
530 BUG_TRAP(sk->sk_state == TCP_CLOSE);
531 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
532
533 /* It cannot be in hash table! */
534 BUG_TRAP(sk_unhashed(sk));
535
536 /* If it has not 0 inet_sk(sk)->num, it must be bound */
537 BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
538
539 sk->sk_prot->destroy(sk);
540
541 sk_stream_kill_queues(sk);
542
543 xfrm_sk_free_policy(sk);
544
545 sk_refcnt_debug_release(sk);
546
547 atomic_dec(sk->sk_prot->orphan_count);
548 sock_put(sk);
549}
550
551EXPORT_SYMBOL(inet_csk_destroy_sock);
552
553int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
554{
555 struct inet_sock *inet = inet_sk(sk);
556 struct inet_connection_sock *icsk = inet_csk(sk);
557 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
558
559 if (rc != 0)
560 return rc;
561
562 sk->sk_max_ack_backlog = 0;
563 sk->sk_ack_backlog = 0;
564 inet_csk_delack_init(sk);
565
566 /* There is race window here: we announce ourselves listening,
567 * but this transition is still not validated by get_port().
568 * It is OK, because this socket enters to hash table only
569 * after validation is complete.
570 */
571 sk->sk_state = TCP_LISTEN;
572 if (!sk->sk_prot->get_port(sk, inet->num)) {
573 inet->sport = htons(inet->num);
574
575 sk_dst_reset(sk);
576 sk->sk_prot->hash(sk);
577
578 return 0;
579 }
580
581 sk->sk_state = TCP_CLOSE;
582 __reqsk_queue_destroy(&icsk->icsk_accept_queue);
583 return -EADDRINUSE;
584}
585
586EXPORT_SYMBOL_GPL(inet_csk_listen_start);
587
588/*
589 * This routine closes sockets which have been at least partially
590 * opened, but not yet accepted.
591 */
592void inet_csk_listen_stop(struct sock *sk)
593{
594 struct inet_connection_sock *icsk = inet_csk(sk);
595 struct request_sock *acc_req;
596 struct request_sock *req;
597
598 inet_csk_delete_keepalive_timer(sk);
599
600 /* make all the listen_opt local to us */
601 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
602
603 /* Following specs, it would be better either to send FIN
604 * (and enter FIN-WAIT-1, it is normal close)
605 * or to send active reset (abort).
606 * Certainly, it is pretty dangerous while synflood, but it is
607 * bad justification for our negligence 8)
608 * To be honest, we are not able to make either
609 * of the variants now. --ANK
610 */
611 reqsk_queue_destroy(&icsk->icsk_accept_queue);
612
613 while ((req = acc_req) != NULL) {
614 struct sock *child = req->sk;
615
616 acc_req = req->dl_next;
617
618 local_bh_disable();
619 bh_lock_sock(child);
620 BUG_TRAP(!sock_owned_by_user(child));
621 sock_hold(child);
622
623 sk->sk_prot->disconnect(child, O_NONBLOCK);
624
625 sock_orphan(child);
626
627 atomic_inc(sk->sk_prot->orphan_count);
628
629 inet_csk_destroy_sock(child);
630
631 bh_unlock_sock(child);
632 local_bh_enable();
633 sock_put(child);
634
635 sk_acceptq_removed(sk);
636 __reqsk_free(req);
637 }
638 BUG_TRAP(!sk->sk_ack_backlog);
639}
640
641EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 000000000000..71f3c7350c6e
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
1/*
2 * inet_diag.c Module for monitoring INET transport protocols sockets.
3 *
4 * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 *
6 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27#include <net/inet_connection_sock.h>
28#include <net/inet_hashtables.h>
29#include <net/inet_timewait_sock.h>
30#include <net/inet6_hashtables.h>
31
32#include <linux/inet.h>
33#include <linux/stddef.h>
34
35#include <linux/inet_diag.h>
36
37static const struct inet_diag_handler **inet_diag_table;
38
39struct inet_diag_entry {
40 u32 *saddr;
41 u32 *daddr;
42 u16 sport;
43 u16 dport;
44 u16 family;
45 u16 userlocks;
46};
47
48static struct sock *idiagnl;
49
50#define INET_DIAG_PUT(skb, attrtype, attrlen) \
51 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
52
53static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
54 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
55 const struct nlmsghdr *unlh)
56{
57 const struct inet_sock *inet = inet_sk(sk);
58 const struct inet_connection_sock *icsk = inet_csk(sk);
59 struct inet_diag_msg *r;
60 struct nlmsghdr *nlh;
61 void *info = NULL;
62 struct inet_diag_meminfo *minfo = NULL;
63 unsigned char *b = skb->tail;
64 const struct inet_diag_handler *handler;
65
66 handler = inet_diag_table[unlh->nlmsg_type];
67 BUG_ON(handler == NULL);
68
69 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
70 nlh->nlmsg_flags = nlmsg_flags;
71
72 r = NLMSG_DATA(nlh);
73 if (sk->sk_state != TCP_TIME_WAIT) {
74 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
75 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
76 sizeof(*minfo));
77 if (ext & (1 << (INET_DIAG_INFO - 1)))
78 info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
79 handler->idiag_info_size);
80
81 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
82 size_t len = strlen(icsk->icsk_ca_ops->name);
83 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
84 icsk->icsk_ca_ops->name);
85 }
86 }
87 r->idiag_family = sk->sk_family;
88 r->idiag_state = sk->sk_state;
89 r->idiag_timer = 0;
90 r->idiag_retrans = 0;
91
92 r->id.idiag_if = sk->sk_bound_dev_if;
93 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
94 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
95
96 if (r->idiag_state == TCP_TIME_WAIT) {
97 const struct inet_timewait_sock *tw = inet_twsk(sk);
98 long tmo = tw->tw_ttd - jiffies;
99 if (tmo < 0)
100 tmo = 0;
101
102 r->id.idiag_sport = tw->tw_sport;
103 r->id.idiag_dport = tw->tw_dport;
104 r->id.idiag_src[0] = tw->tw_rcv_saddr;
105 r->id.idiag_dst[0] = tw->tw_daddr;
106 r->idiag_state = tw->tw_substate;
107 r->idiag_timer = 3;
108 r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
109 r->idiag_rqueue = 0;
110 r->idiag_wqueue = 0;
111 r->idiag_uid = 0;
112 r->idiag_inode = 0;
113#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
114 if (r->idiag_family == AF_INET6) {
115 const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
116
117 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
118 &tcp6tw->tw_v6_rcv_saddr);
119 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
120 &tcp6tw->tw_v6_daddr);
121 }
122#endif
123 nlh->nlmsg_len = skb->tail - b;
124 return skb->len;
125 }
126
127 r->id.idiag_sport = inet->sport;
128 r->id.idiag_dport = inet->dport;
129 r->id.idiag_src[0] = inet->rcv_saddr;
130 r->id.idiag_dst[0] = inet->daddr;
131
132#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
133 if (r->idiag_family == AF_INET6) {
134 struct ipv6_pinfo *np = inet6_sk(sk);
135
136 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
137 &np->rcv_saddr);
138 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
139 &np->daddr);
140 }
141#endif
142
143#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
144
145 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
146 r->idiag_timer = 1;
147 r->idiag_retrans = icsk->icsk_retransmits;
148 r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
149 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
150 r->idiag_timer = 4;
151 r->idiag_retrans = icsk->icsk_probes_out;
152 r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
153 } else if (timer_pending(&sk->sk_timer)) {
154 r->idiag_timer = 2;
155 r->idiag_retrans = icsk->icsk_probes_out;
156 r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
157 } else {
158 r->idiag_timer = 0;
159 r->idiag_expires = 0;
160 }
161#undef EXPIRES_IN_MS
162
163 r->idiag_uid = sock_i_uid(sk);
164 r->idiag_inode = sock_i_ino(sk);
165
166 if (minfo) {
167 minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
168 minfo->idiag_wmem = sk->sk_wmem_queued;
169 minfo->idiag_fmem = sk->sk_forward_alloc;
170 minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
171 }
172
173 handler->idiag_get_info(sk, r, info);
174
175 if (sk->sk_state < TCP_TIME_WAIT &&
176 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
177 icsk->icsk_ca_ops->get_info(sk, ext, skb);
178
179 nlh->nlmsg_len = skb->tail - b;
180 return skb->len;
181
182rtattr_failure:
183nlmsg_failure:
184 skb_trim(skb, b - skb->data);
185 return -1;
186}
187
188static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
189{
190 int err;
191 struct sock *sk;
192 struct inet_diag_req *req = NLMSG_DATA(nlh);
193 struct sk_buff *rep;
194 struct inet_hashinfo *hashinfo;
195 const struct inet_diag_handler *handler;
196
197 handler = inet_diag_table[nlh->nlmsg_type];
198 BUG_ON(handler == NULL);
199 hashinfo = handler->idiag_hashinfo;
200
201 if (req->idiag_family == AF_INET) {
202 sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
203 req->id.idiag_dport, req->id.idiag_src[0],
204 req->id.idiag_sport, req->id.idiag_if);
205 }
206#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
207 else if (req->idiag_family == AF_INET6) {
208 sk = inet6_lookup(hashinfo,
209 (struct in6_addr *)req->id.idiag_dst,
210 req->id.idiag_dport,
211 (struct in6_addr *)req->id.idiag_src,
212 req->id.idiag_sport,
213 req->id.idiag_if);
214 }
215#endif
216 else {
217 return -EINVAL;
218 }
219
220 if (sk == NULL)
221 return -ENOENT;
222
223 err = -ESTALE;
224 if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
225 req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
226 ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
227 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
228 goto out;
229
230 err = -ENOMEM;
231 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
232 sizeof(struct inet_diag_meminfo) +
233 handler->idiag_info_size + 64)),
234 GFP_KERNEL);
235 if (!rep)
236 goto out;
237
238 if (inet_diag_fill(rep, sk, req->idiag_ext,
239 NETLINK_CB(in_skb).pid,
240 nlh->nlmsg_seq, 0, nlh) <= 0)
241 BUG();
242
243 err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
244 MSG_DONTWAIT);
245 if (err > 0)
246 err = 0;
247
248out:
249 if (sk) {
250 if (sk->sk_state == TCP_TIME_WAIT)
251 inet_twsk_put((struct inet_timewait_sock *)sk);
252 else
253 sock_put(sk);
254 }
255 return err;
256}
257
258static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
259{
260 int words = bits >> 5;
261
262 bits &= 0x1f;
263
264 if (words) {
265 if (memcmp(a1, a2, words << 2))
266 return 0;
267 }
268 if (bits) {
269 __u32 w1, w2;
270 __u32 mask;
271
272 w1 = a1[words];
273 w2 = a2[words];
274
275 mask = htonl((0xffffffff) << (32 - bits));
276
277 if ((w1 ^ w2) & mask)
278 return 0;
279 }
280
281 return 1;
282}
283
284
285static int inet_diag_bc_run(const void *bc, int len,
286 const struct inet_diag_entry *entry)
287{
288 while (len > 0) {
289 int yes = 1;
290 const struct inet_diag_bc_op *op = bc;
291
292 switch (op->code) {
293 case INET_DIAG_BC_NOP:
294 break;
295 case INET_DIAG_BC_JMP:
296 yes = 0;
297 break;
298 case INET_DIAG_BC_S_GE:
299 yes = entry->sport >= op[1].no;
300 break;
301 case INET_DIAG_BC_S_LE:
302 yes = entry->dport <= op[1].no;
303 break;
304 case INET_DIAG_BC_D_GE:
305 yes = entry->dport >= op[1].no;
306 break;
307 case INET_DIAG_BC_D_LE:
308 yes = entry->dport <= op[1].no;
309 break;
310 case INET_DIAG_BC_AUTO:
311 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
312 break;
313 case INET_DIAG_BC_S_COND:
314 case INET_DIAG_BC_D_COND: {
315 struct inet_diag_hostcond *cond;
316 u32 *addr;
317
318 cond = (struct inet_diag_hostcond *)(op + 1);
319 if (cond->port != -1 &&
320 cond->port != (op->code == INET_DIAG_BC_S_COND ?
321 entry->sport : entry->dport)) {
322 yes = 0;
323 break;
324 }
325
326 if (cond->prefix_len == 0)
327 break;
328
329 if (op->code == INET_DIAG_BC_S_COND)
330 addr = entry->saddr;
331 else
332 addr = entry->daddr;
333
334 if (bitstring_match(addr, cond->addr, cond->prefix_len))
335 break;
336 if (entry->family == AF_INET6 &&
337 cond->family == AF_INET) {
338 if (addr[0] == 0 && addr[1] == 0 &&
339 addr[2] == htonl(0xffff) &&
340 bitstring_match(addr + 3, cond->addr,
341 cond->prefix_len))
342 break;
343 }
344 yes = 0;
345 break;
346 }
347 }
348
349 if (yes) {
350 len -= op->yes;
351 bc += op->yes;
352 } else {
353 len -= op->no;
354 bc += op->no;
355 }
356 }
357 return (len == 0);
358}
359
360static int valid_cc(const void *bc, int len, int cc)
361{
362 while (len >= 0) {
363 const struct inet_diag_bc_op *op = bc;
364
365 if (cc > len)
366 return 0;
367 if (cc == len)
368 return 1;
369 if (op->yes < 4)
370 return 0;
371 len -= op->yes;
372 bc += op->yes;
373 }
374 return 0;
375}
376
377static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
378{
379 const unsigned char *bc = bytecode;
380 int len = bytecode_len;
381
382 while (len > 0) {
383 struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
384
385//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
386 switch (op->code) {
387 case INET_DIAG_BC_AUTO:
388 case INET_DIAG_BC_S_COND:
389 case INET_DIAG_BC_D_COND:
390 case INET_DIAG_BC_S_GE:
391 case INET_DIAG_BC_S_LE:
392 case INET_DIAG_BC_D_GE:
393 case INET_DIAG_BC_D_LE:
394 if (op->yes < 4 || op->yes > len + 4)
395 return -EINVAL;
396 case INET_DIAG_BC_JMP:
397 if (op->no < 4 || op->no > len + 4)
398 return -EINVAL;
399 if (op->no < len &&
400 !valid_cc(bytecode, bytecode_len, len - op->no))
401 return -EINVAL;
402 break;
403 case INET_DIAG_BC_NOP:
404 if (op->yes < 4 || op->yes > len + 4)
405 return -EINVAL;
406 break;
407 default:
408 return -EINVAL;
409 }
410 bc += op->yes;
411 len -= op->yes;
412 }
413 return len == 0 ? 0 : -EINVAL;
414}
415
416static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
417 struct netlink_callback *cb)
418{
419 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
420
421 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
422 struct inet_diag_entry entry;
423 struct rtattr *bc = (struct rtattr *)(r + 1);
424 struct inet_sock *inet = inet_sk(sk);
425
426 entry.family = sk->sk_family;
427#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
428 if (entry.family == AF_INET6) {
429 struct ipv6_pinfo *np = inet6_sk(sk);
430
431 entry.saddr = np->rcv_saddr.s6_addr32;
432 entry.daddr = np->daddr.s6_addr32;
433 } else
434#endif
435 {
436 entry.saddr = &inet->rcv_saddr;
437 entry.daddr = &inet->daddr;
438 }
439 entry.sport = inet->num;
440 entry.dport = ntohs(inet->dport);
441 entry.userlocks = sk->sk_userlocks;
442
443 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
444 return 0;
445 }
446
447 return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
448 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
449}
450
451static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
452 struct request_sock *req,
453 u32 pid, u32 seq,
454 const struct nlmsghdr *unlh)
455{
456 const struct inet_request_sock *ireq = inet_rsk(req);
457 struct inet_sock *inet = inet_sk(sk);
458 unsigned char *b = skb->tail;
459 struct inet_diag_msg *r;
460 struct nlmsghdr *nlh;
461 long tmo;
462
463 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
464 nlh->nlmsg_flags = NLM_F_MULTI;
465 r = NLMSG_DATA(nlh);
466
467 r->idiag_family = sk->sk_family;
468 r->idiag_state = TCP_SYN_RECV;
469 r->idiag_timer = 1;
470 r->idiag_retrans = req->retrans;
471
472 r->id.idiag_if = sk->sk_bound_dev_if;
473 r->id.idiag_cookie[0] = (u32)(unsigned long)req;
474 r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
475
476 tmo = req->expires - jiffies;
477 if (tmo < 0)
478 tmo = 0;
479
480 r->id.idiag_sport = inet->sport;
481 r->id.idiag_dport = ireq->rmt_port;
482 r->id.idiag_src[0] = ireq->loc_addr;
483 r->id.idiag_dst[0] = ireq->rmt_addr;
484 r->idiag_expires = jiffies_to_msecs(tmo);
485 r->idiag_rqueue = 0;
486 r->idiag_wqueue = 0;
487 r->idiag_uid = sock_i_uid(sk);
488 r->idiag_inode = 0;
489#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
490 if (r->idiag_family == AF_INET6) {
491 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
492 &tcp6_rsk(req)->loc_addr);
493 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
494 &tcp6_rsk(req)->rmt_addr);
495 }
496#endif
497 nlh->nlmsg_len = skb->tail - b;
498
499 return skb->len;
500
501nlmsg_failure:
502 skb_trim(skb, b - skb->data);
503 return -1;
504}
505
506static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
507 struct netlink_callback *cb)
508{
509 struct inet_diag_entry entry;
510 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
511 struct inet_connection_sock *icsk = inet_csk(sk);
512 struct listen_sock *lopt;
513 struct rtattr *bc = NULL;
514 struct inet_sock *inet = inet_sk(sk);
515 int j, s_j;
516 int reqnum, s_reqnum;
517 int err = 0;
518
519 s_j = cb->args[3];
520 s_reqnum = cb->args[4];
521
522 if (s_j > 0)
523 s_j--;
524
525 entry.family = sk->sk_family;
526
527 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
528
529 lopt = icsk->icsk_accept_queue.listen_opt;
530 if (!lopt || !lopt->qlen)
531 goto out;
532
533 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
534 bc = (struct rtattr *)(r + 1);
535 entry.sport = inet->num;
536 entry.userlocks = sk->sk_userlocks;
537 }
538
539 for (j = s_j; j < lopt->nr_table_entries; j++) {
540 struct request_sock *req, *head = lopt->syn_table[j];
541
542 reqnum = 0;
543 for (req = head; req; reqnum++, req = req->dl_next) {
544 struct inet_request_sock *ireq = inet_rsk(req);
545
546 if (reqnum < s_reqnum)
547 continue;
548 if (r->id.idiag_dport != ireq->rmt_port &&
549 r->id.idiag_dport)
550 continue;
551
552 if (bc) {
553 entry.saddr =
554#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
555 (entry.family == AF_INET6) ?
556 tcp6_rsk(req)->loc_addr.s6_addr32 :
557#endif
558 &ireq->loc_addr;
559 entry.daddr =
560#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
561 (entry.family == AF_INET6) ?
562 tcp6_rsk(req)->rmt_addr.s6_addr32 :
563#endif
564 &ireq->rmt_addr;
565 entry.dport = ntohs(ireq->rmt_port);
566
567 if (!inet_diag_bc_run(RTA_DATA(bc),
568 RTA_PAYLOAD(bc), &entry))
569 continue;
570 }
571
572 err = inet_diag_fill_req(skb, sk, req,
573 NETLINK_CB(cb->skb).pid,
574 cb->nlh->nlmsg_seq, cb->nlh);
575 if (err < 0) {
576 cb->args[3] = j + 1;
577 cb->args[4] = reqnum;
578 goto out;
579 }
580 }
581
582 s_reqnum = 0;
583 }
584
585out:
586 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
587
588 return err;
589}
590
591static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
592{
593 int i, num;
594 int s_i, s_num;
595 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
596 const struct inet_diag_handler *handler;
597 struct inet_hashinfo *hashinfo;
598
599 handler = inet_diag_table[cb->nlh->nlmsg_type];
600 BUG_ON(handler == NULL);
601 hashinfo = handler->idiag_hashinfo;
602
603 s_i = cb->args[1];
604 s_num = num = cb->args[2];
605
606 if (cb->args[0] == 0) {
607 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
608 goto skip_listen_ht;
609
610 inet_listen_lock(hashinfo);
611 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
612 struct sock *sk;
613 struct hlist_node *node;
614
615 num = 0;
616 sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
617 struct inet_sock *inet = inet_sk(sk);
618
619 if (num < s_num) {
620 num++;
621 continue;
622 }
623
624 if (r->id.idiag_sport != inet->sport &&
625 r->id.idiag_sport)
626 goto next_listen;
627
628 if (!(r->idiag_states & TCPF_LISTEN) ||
629 r->id.idiag_dport ||
630 cb->args[3] > 0)
631 goto syn_recv;
632
633 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
634 inet_listen_unlock(hashinfo);
635 goto done;
636 }
637
638syn_recv:
639 if (!(r->idiag_states & TCPF_SYN_RECV))
640 goto next_listen;
641
642 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
643 inet_listen_unlock(hashinfo);
644 goto done;
645 }
646
647next_listen:
648 cb->args[3] = 0;
649 cb->args[4] = 0;
650 ++num;
651 }
652
653 s_num = 0;
654 cb->args[3] = 0;
655 cb->args[4] = 0;
656 }
657 inet_listen_unlock(hashinfo);
658skip_listen_ht:
659 cb->args[0] = 1;
660 s_i = num = s_num = 0;
661 }
662
663 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
664 return skb->len;
665
666 for (i = s_i; i < hashinfo->ehash_size; i++) {
667 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
668 struct sock *sk;
669 struct hlist_node *node;
670
671 if (i > s_i)
672 s_num = 0;
673
674 read_lock_bh(&head->lock);
675
676 num = 0;
677 sk_for_each(sk, node, &head->chain) {
678 struct inet_sock *inet = inet_sk(sk);
679
680 if (num < s_num)
681 goto next_normal;
682 if (!(r->idiag_states & (1 << sk->sk_state)))
683 goto next_normal;
684 if (r->id.idiag_sport != inet->sport &&
685 r->id.idiag_sport)
686 goto next_normal;
687 if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
688 goto next_normal;
689 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
690 read_unlock_bh(&head->lock);
691 goto done;
692 }
693next_normal:
694 ++num;
695 }
696
697 if (r->idiag_states & TCPF_TIME_WAIT) {
698 sk_for_each(sk, node,
699 &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
700 struct inet_sock *inet = inet_sk(sk);
701
702 if (num < s_num)
703 goto next_dying;
704 if (r->id.idiag_sport != inet->sport &&
705 r->id.idiag_sport)
706 goto next_dying;
707 if (r->id.idiag_dport != inet->dport &&
708 r->id.idiag_dport)
709 goto next_dying;
710 if (inet_diag_dump_sock(skb, sk, cb) < 0) {
711 read_unlock_bh(&head->lock);
712 goto done;
713 }
714next_dying:
715 ++num;
716 }
717 }
718 read_unlock_bh(&head->lock);
719 }
720
721done:
722 cb->args[1] = i;
723 cb->args[2] = num;
724 return skb->len;
725}
726
/* Dump completion callback: there is nothing to clean up, so report success. */
static int inet_diag_dump_done(struct netlink_callback *cb)
{
	const int status = 0;

	return status;
}
731
732
733static __inline__ int
734inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
735{
736 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
737 return 0;
738
739 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
740 goto err_inval;
741
742 if (inet_diag_table[nlh->nlmsg_type] == NULL)
743 return -ENOENT;
744
745 if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
746 goto err_inval;
747
748 if (nlh->nlmsg_flags&NLM_F_DUMP) {
749 if (nlh->nlmsg_len >
750 (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
751 struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
752 sizeof(struct inet_diag_req));
753 if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
754 rta->rta_len < 8 ||
755 rta->rta_len >
756 (nlh->nlmsg_len -
757 NLMSG_SPACE(sizeof(struct inet_diag_req))))
758 goto err_inval;
759 if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
760 goto err_inval;
761 }
762 return netlink_dump_start(idiagnl, skb, nlh,
763 inet_diag_dump,
764 inet_diag_dump_done);
765 } else {
766 return inet_diag_get_exact(skb, nlh);
767 }
768
769err_inval:
770 return -EINVAL;
771}
772
773
774static inline void inet_diag_rcv_skb(struct sk_buff *skb)
775{
776 int err;
777 struct nlmsghdr * nlh;
778
779 if (skb->len >= NLMSG_SPACE(0)) {
780 nlh = (struct nlmsghdr *)skb->data;
781 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
782 return;
783 err = inet_diag_rcv_msg(skb, nlh);
784 if (err || nlh->nlmsg_flags & NLM_F_ACK)
785 netlink_ack(skb, nlh, err);
786 }
787}
788
789static void inet_diag_rcv(struct sock *sk, int len)
790{
791 struct sk_buff *skb;
792 unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
793
794 while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
795 inet_diag_rcv_skb(skb);
796 kfree_skb(skb);
797 }
798}
799
800static DEFINE_SPINLOCK(inet_diag_register_lock);
801
802int inet_diag_register(const struct inet_diag_handler *h)
803{
804 const __u16 type = h->idiag_type;
805 int err = -EINVAL;
806
807 if (type >= INET_DIAG_GETSOCK_MAX)
808 goto out;
809
810 spin_lock(&inet_diag_register_lock);
811 err = -EEXIST;
812 if (inet_diag_table[type] == NULL) {
813 inet_diag_table[type] = h;
814 err = 0;
815 }
816 spin_unlock(&inet_diag_register_lock);
817out:
818 return err;
819}
820EXPORT_SYMBOL_GPL(inet_diag_register);
821
822void inet_diag_unregister(const struct inet_diag_handler *h)
823{
824 const __u16 type = h->idiag_type;
825
826 if (type >= INET_DIAG_GETSOCK_MAX)
827 return;
828
829 spin_lock(&inet_diag_register_lock);
830 inet_diag_table[type] = NULL;
831 spin_unlock(&inet_diag_register_lock);
832
833 synchronize_rcu();
834}
835EXPORT_SYMBOL_GPL(inet_diag_unregister);
836
837static int __init inet_diag_init(void)
838{
839 const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
840 sizeof(struct inet_diag_handler *));
841 int err = -ENOMEM;
842
843 inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL);
844 if (!inet_diag_table)
845 goto out;
846
847 memset(inet_diag_table, 0, inet_diag_table_size);
848 idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
849 THIS_MODULE);
850 if (idiagnl == NULL)
851 goto out_free_table;
852 err = 0;
853out:
854 return err;
855out_free_table:
856 kfree(inet_diag_table);
857 goto out;
858}
859
860static void __exit inet_diag_exit(void)
861{
862 sock_release(idiagnl->sk_socket);
863 kfree(inet_diag_table);
864}
865
866module_init(inet_diag_init);
867module_exit(inet_diag_exit);
868MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 000000000000..e8d29fe736d2
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic INET transport hashtables
7 *
8 * Authors: Lotsa people, from code originally in tcp
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/sched.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21
22#include <net/inet_connection_sock.h>
23#include <net/inet_hashtables.h>
24
25/*
26 * Allocate and initialize a new local port bind bucket.
27 * The bindhash mutex for snum's hash chain must be held here.
28 */
29struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
30 struct inet_bind_hashbucket *head,
31 const unsigned short snum)
32{
33 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
34
35 if (tb != NULL) {
36 tb->port = snum;
37 tb->fastreuse = 0;
38 INIT_HLIST_HEAD(&tb->owners);
39 hlist_add_head(&tb->node, &head->chain);
40 }
41 return tb;
42}
43
44EXPORT_SYMBOL(inet_bind_bucket_create);
45
46/*
47 * Caller must hold hashbucket lock for this tb with local BH disabled
48 */
49void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
50{
51 if (hlist_empty(&tb->owners)) {
52 __hlist_del(&tb->node);
53 kmem_cache_free(cachep, tb);
54 }
55}
56
57void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
58 const unsigned short snum)
59{
60 inet_sk(sk)->num = snum;
61 sk_add_bind_node(sk, &tb->owners);
62 inet_csk(sk)->icsk_bind_hash = tb;
63}
64
65EXPORT_SYMBOL(inet_bind_hash);
66
67/*
68 * Get rid of any references to a local port held by the given sock.
69 */
70static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
71{
72 const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
73 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
74 struct inet_bind_bucket *tb;
75
76 spin_lock(&head->lock);
77 tb = inet_csk(sk)->icsk_bind_hash;
78 __sk_del_bind_node(sk);
79 inet_csk(sk)->icsk_bind_hash = NULL;
80 inet_sk(sk)->num = 0;
81 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
82 spin_unlock(&head->lock);
83}
84
85void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
86{
87 local_bh_disable();
88 __inet_put_port(hashinfo, sk);
89 local_bh_enable();
90}
91
92EXPORT_SYMBOL(inet_put_port);
93
94/*
95 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
96 * Look, when several writers sleep and reader wakes them up, all but one
97 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
98 * this, _but_ remember, it adds useless work on UP machines (wake up each
99 * exclusive lock release). It should be ifdefed really.
100 */
101void inet_listen_wlock(struct inet_hashinfo *hashinfo)
102{
103 write_lock(&hashinfo->lhash_lock);
104
105 if (atomic_read(&hashinfo->lhash_users)) {
106 DEFINE_WAIT(wait);
107
108 for (;;) {
109 prepare_to_wait_exclusive(&hashinfo->lhash_wait,
110 &wait, TASK_UNINTERRUPTIBLE);
111 if (!atomic_read(&hashinfo->lhash_users))
112 break;
113 write_unlock_bh(&hashinfo->lhash_lock);
114 schedule();
115 write_lock_bh(&hashinfo->lhash_lock);
116 }
117
118 finish_wait(&hashinfo->lhash_wait, &wait);
119 }
120}
121
122EXPORT_SYMBOL(inet_listen_wlock);
123
124/*
125 * Don't inline this cruft. Here are some nice properties to exploit here. The
126 * BSD API does not allow a listening sock to specify the remote port nor the
127 * remote address for the connection. So always assume those are both
128 * wildcarded during the search since they can never be otherwise.
129 */
130struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
131 const unsigned short hnum, const int dif)
132{
133 struct sock *result = NULL, *sk;
134 const struct hlist_node *node;
135 int hiscore = -1;
136
137 sk_for_each(sk, node, head) {
138 const struct inet_sock *inet = inet_sk(sk);
139
140 if (inet->num == hnum && !ipv6_only_sock(sk)) {
141 const __u32 rcv_saddr = inet->rcv_saddr;
142 int score = sk->sk_family == PF_INET ? 1 : 0;
143
144 if (rcv_saddr) {
145 if (rcv_saddr != daddr)
146 continue;
147 score += 2;
148 }
149 if (sk->sk_bound_dev_if) {
150 if (sk->sk_bound_dev_if != dif)
151 continue;
152 score += 2;
153 }
154 if (score == 5)
155 return sk;
156 if (score > hiscore) {
157 hiscore = score;
158 result = sk;
159 }
160 }
161 }
162 return result;
163}
164
165EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 000000000000..4d1502a49852
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,384 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic TIME_WAIT sockets functions
7 *
8 * From code originally in TCP
9 */
10
11#include <linux/config.h>
12
13#include <net/inet_hashtables.h>
14#include <net/inet_timewait_sock.h>
15#include <net/ip.h>
16
17/* Must be called with locally disabled BHs. */
18void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
19{
20 struct inet_bind_hashbucket *bhead;
21 struct inet_bind_bucket *tb;
22 /* Unlink from established hashes. */
23 struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
24
25 write_lock(&ehead->lock);
26 if (hlist_unhashed(&tw->tw_node)) {
27 write_unlock(&ehead->lock);
28 return;
29 }
30 __hlist_del(&tw->tw_node);
31 sk_node_init(&tw->tw_node);
32 write_unlock(&ehead->lock);
33
34 /* Disassociate with bind bucket. */
35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
36 spin_lock(&bhead->lock);
37 tb = tw->tw_tb;
38 __hlist_del(&tw->tw_bind_node);
39 tw->tw_tb = NULL;
40 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
41 spin_unlock(&bhead->lock);
42#ifdef SOCK_REFCNT_DEBUG
43 if (atomic_read(&tw->tw_refcnt) != 1) {
44 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
45 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
46 }
47#endif
48 inet_twsk_put(tw);
49}
50
51EXPORT_SYMBOL_GPL(__inet_twsk_kill);
52
53/*
54 * Enter the time wait state. This is called with locally disabled BH.
55 * Essentially we whip up a timewait bucket, copy the relevant info into it
56 * from the SK, and mess with hash chains and list linkage.
57 */
58void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
59 struct inet_hashinfo *hashinfo)
60{
61 const struct inet_sock *inet = inet_sk(sk);
62 const struct inet_connection_sock *icsk = inet_csk(sk);
63 struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
64 struct inet_bind_hashbucket *bhead;
65 /* Step 1: Put TW into bind hash. Original socket stays there too.
66 Note, that any socket with inet->num != 0 MUST be bound in
67 binding cache, even if it is closed.
68 */
69 bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
70 spin_lock(&bhead->lock);
71 tw->tw_tb = icsk->icsk_bind_hash;
72 BUG_TRAP(icsk->icsk_bind_hash);
73 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
74 spin_unlock(&bhead->lock);
75
76 write_lock(&ehead->lock);
77
78 /* Step 2: Remove SK from established hash. */
79 if (__sk_del_node_init(sk))
80 sock_prot_dec_use(sk->sk_prot);
81
82 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
83 inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
84 atomic_inc(&tw->tw_refcnt);
85
86 write_unlock(&ehead->lock);
87}
88
89EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
90
91struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
92{
93 struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
94 SLAB_ATOMIC);
95 if (tw != NULL) {
96 const struct inet_sock *inet = inet_sk(sk);
97
98 /* Give us an identity. */
99 tw->tw_daddr = inet->daddr;
100 tw->tw_rcv_saddr = inet->rcv_saddr;
101 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
102 tw->tw_num = inet->num;
103 tw->tw_state = TCP_TIME_WAIT;
104 tw->tw_substate = state;
105 tw->tw_sport = inet->sport;
106 tw->tw_dport = inet->dport;
107 tw->tw_family = sk->sk_family;
108 tw->tw_reuse = sk->sk_reuse;
109 tw->tw_hashent = sk->sk_hashent;
110 tw->tw_ipv6only = 0;
111 tw->tw_prot = sk->sk_prot_creator;
112 atomic_set(&tw->tw_refcnt, 1);
113 inet_twsk_dead_node_init(tw);
114 }
115
116 return tw;
117}
118
119EXPORT_SYMBOL_GPL(inet_twsk_alloc);
120
121/* Returns non-zero if quota exceeded. */
122static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
123 const int slot)
124{
125 struct inet_timewait_sock *tw;
126 struct hlist_node *node;
127 unsigned int killed;
128 int ret;
129
130 /* NOTE: compare this to previous version where lock
131 * was released after detaching chain. It was racy,
132 * because tw buckets are scheduled in not serialized context
133 * in 2.3 (with netfilter), and with softnet it is common, because
134 * soft irqs are not sequenced.
135 */
136 killed = 0;
137 ret = 0;
138rescan:
139 inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
140 __inet_twsk_del_dead_node(tw);
141 spin_unlock(&twdr->death_lock);
142 __inet_twsk_kill(tw, twdr->hashinfo);
143 inet_twsk_put(tw);
144 killed++;
145 spin_lock(&twdr->death_lock);
146 if (killed > INET_TWDR_TWKILL_QUOTA) {
147 ret = 1;
148 break;
149 }
150
151 /* While we dropped twdr->death_lock, another cpu may have
152 * killed off the next TW bucket in the list, therefore
153 * do a fresh re-read of the hlist head node with the
154 * lock reacquired. We still use the hlist traversal
155 * macro in order to get the prefetches.
156 */
157 goto rescan;
158 }
159
160 twdr->tw_count -= killed;
161 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
162
163 return ret;
164}
165
166void inet_twdr_hangman(unsigned long data)
167{
168 struct inet_timewait_death_row *twdr;
169 int unsigned need_timer;
170
171 twdr = (struct inet_timewait_death_row *)data;
172 spin_lock(&twdr->death_lock);
173
174 if (twdr->tw_count == 0)
175 goto out;
176
177 need_timer = 0;
178 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
179 twdr->thread_slots |= (1 << twdr->slot);
180 mb();
181 schedule_work(&twdr->twkill_work);
182 need_timer = 1;
183 } else {
184 /* We purged the entire slot, anything left? */
185 if (twdr->tw_count)
186 need_timer = 1;
187 }
188 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
189 if (need_timer)
190 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
191out:
192 spin_unlock(&twdr->death_lock);
193}
194
195EXPORT_SYMBOL_GPL(inet_twdr_hangman);
196
197extern void twkill_slots_invalid(void);
198
199void inet_twdr_twkill_work(void *data)
200{
201 struct inet_timewait_death_row *twdr = data;
202 int i;
203
204 if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
205 twkill_slots_invalid();
206
207 while (twdr->thread_slots) {
208 spin_lock_bh(&twdr->death_lock);
209 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
210 if (!(twdr->thread_slots & (1 << i)))
211 continue;
212
213 while (inet_twdr_do_twkill_work(twdr, i) != 0) {
214 if (need_resched()) {
215 spin_unlock_bh(&twdr->death_lock);
216 schedule();
217 spin_lock_bh(&twdr->death_lock);
218 }
219 }
220
221 twdr->thread_slots &= ~(1 << i);
222 }
223 spin_unlock_bh(&twdr->death_lock);
224 }
225}
226
227EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
228
229/* These are always called from BH context. See callers in
230 * tcp_input.c to verify this.
231 */
232
233/* This is for handling early-kills of TIME_WAIT sockets. */
234void inet_twsk_deschedule(struct inet_timewait_sock *tw,
235 struct inet_timewait_death_row *twdr)
236{
237 spin_lock(&twdr->death_lock);
238 if (inet_twsk_del_dead_node(tw)) {
239 inet_twsk_put(tw);
240 if (--twdr->tw_count == 0)
241 del_timer(&twdr->tw_timer);
242 }
243 spin_unlock(&twdr->death_lock);
244 __inet_twsk_kill(tw, twdr->hashinfo);
245}
246
247EXPORT_SYMBOL(inet_twsk_deschedule);
248
249void inet_twsk_schedule(struct inet_timewait_sock *tw,
250 struct inet_timewait_death_row *twdr,
251 const int timeo, const int timewait_len)
252{
253 struct hlist_head *list;
254 int slot;
255
256 /* timeout := RTO * 3.5
257 *
258 * 3.5 = 1+2+0.5 to wait for two retransmits.
259 *
260 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
261 * our ACK acking that FIN can be lost. If N subsequent retransmitted
262 * FINs (or previous seqments) are lost (probability of such event
263 * is p^(N+1), where p is probability to lose single packet and
264 * time to detect the loss is about RTO*(2^N - 1) with exponential
265 * backoff). Normal timewait length is calculated so, that we
266 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
267 * [ BTW Linux. following BSD, violates this requirement waiting
268 * only for 60sec, we should wait at least for 240 secs.
269 * Well, 240 consumes too much of resources 8)
270 * ]
271 * This interval is not reduced to catch old duplicate and
272 * responces to our wandering segments living for two MSLs.
273 * However, if we use PAWS to detect
274 * old duplicates, we can reduce the interval to bounds required
275 * by RTO, rather than MSL. So, if peer understands PAWS, we
276 * kill tw bucket after 3.5*RTO (it is important that this number
277 * is greater than TS tick!) and detect old duplicates with help
278 * of PAWS.
279 */
280 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
281
282 spin_lock(&twdr->death_lock);
283
284 /* Unlink it, if it was scheduled */
285 if (inet_twsk_del_dead_node(tw))
286 twdr->tw_count--;
287 else
288 atomic_inc(&tw->tw_refcnt);
289
290 if (slot >= INET_TWDR_RECYCLE_SLOTS) {
291 /* Schedule to slow timer */
292 if (timeo >= timewait_len) {
293 slot = INET_TWDR_TWKILL_SLOTS - 1;
294 } else {
295 slot = (timeo + twdr->period - 1) / twdr->period;
296 if (slot >= INET_TWDR_TWKILL_SLOTS)
297 slot = INET_TWDR_TWKILL_SLOTS - 1;
298 }
299 tw->tw_ttd = jiffies + timeo;
300 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
301 list = &twdr->cells[slot];
302 } else {
303 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
304
305 if (twdr->twcal_hand < 0) {
306 twdr->twcal_hand = 0;
307 twdr->twcal_jiffie = jiffies;
308 twdr->twcal_timer.expires = twdr->twcal_jiffie +
309 (slot << INET_TWDR_RECYCLE_TICK);
310 add_timer(&twdr->twcal_timer);
311 } else {
312 if (time_after(twdr->twcal_timer.expires,
313 jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
314 mod_timer(&twdr->twcal_timer,
315 jiffies + (slot << INET_TWDR_RECYCLE_TICK));
316 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
317 }
318 list = &twdr->twcal_row[slot];
319 }
320
321 hlist_add_head(&tw->tw_death_node, list);
322
323 if (twdr->tw_count++ == 0)
324 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
325 spin_unlock(&twdr->death_lock);
326}
327
328EXPORT_SYMBOL_GPL(inet_twsk_schedule);
329
330void inet_twdr_twcal_tick(unsigned long data)
331{
332 struct inet_timewait_death_row *twdr;
333 int n, slot;
334 unsigned long j;
335 unsigned long now = jiffies;
336 int killed = 0;
337 int adv = 0;
338
339 twdr = (struct inet_timewait_death_row *)data;
340
341 spin_lock(&twdr->death_lock);
342 if (twdr->twcal_hand < 0)
343 goto out;
344
345 slot = twdr->twcal_hand;
346 j = twdr->twcal_jiffie;
347
348 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
349 if (time_before_eq(j, now)) {
350 struct hlist_node *node, *safe;
351 struct inet_timewait_sock *tw;
352
353 inet_twsk_for_each_inmate_safe(tw, node, safe,
354 &twdr->twcal_row[slot]) {
355 __inet_twsk_del_dead_node(tw);
356 __inet_twsk_kill(tw, twdr->hashinfo);
357 inet_twsk_put(tw);
358 killed++;
359 }
360 } else {
361 if (!adv) {
362 adv = 1;
363 twdr->twcal_jiffie = j;
364 twdr->twcal_hand = slot;
365 }
366
367 if (!hlist_empty(&twdr->twcal_row[slot])) {
368 mod_timer(&twdr->twcal_timer, j);
369 goto out;
370 }
371 }
372 j += 1 << INET_TWDR_RECYCLE_TICK;
373 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
374 }
375 twdr->twcal_hand = -1;
376
377out:
378 if ((twdr->tw_count -= killed) == 0)
379 del_timer(&twdr->tw_timer);
380 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
381 spin_unlock(&twdr->death_lock);
382}
383
384EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ab18a853d7ce..f84ba9c96551 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/net.h> 22#include <linux/net.h>
23#include <net/ip.h>
23#include <net/inetpeer.h> 24#include <net/inetpeer.h>
24 25
25/* 26/*
@@ -72,7 +73,7 @@
72/* Exported for inet_getid inline function. */ 73/* Exported for inet_getid inline function. */
73DEFINE_SPINLOCK(inet_peer_idlock); 74DEFINE_SPINLOCK(inet_peer_idlock);
74 75
75static kmem_cache_t *peer_cachep; 76static kmem_cache_t *peer_cachep __read_mostly;
76 77
77#define node_height(x) x->avl_height 78#define node_height(x) x->avl_height
78static struct inet_peer peer_fake_node = { 79static struct inet_peer peer_fake_node = {
@@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy)
459 peer_total / inet_peer_threshold * HZ; 460 peer_total / inet_peer_threshold * HZ;
460 add_timer(&peer_periodic_timer); 461 add_timer(&peer_periodic_timer);
461} 462}
462
463EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c28..0923add122b4 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
76 * that reaches zero, we must reply an ICMP control message telling 76 * that reaches zero, we must reply an ICMP control message telling
77 * that the packet's lifetime expired. 77 * that the packet's lifetime expired.
78 */ 78 */
79 79 if (skb->nh.iph->ttl <= 1)
80 iph = skb->nh.iph;
81
82 if (iph->ttl <= 1)
83 goto too_many_hops; 80 goto too_many_hops;
84 81
85 if (!xfrm4_route_forward(skb)) 82 if (!xfrm4_route_forward(skb))
86 goto drop; 83 goto drop;
87 84
88 iph = skb->nh.iph;
89 rt = (struct rtable*)skb->dst; 85 rt = (struct rtable*)skb->dst;
90 86
91 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb377ae15305..9e6e683cc34d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
377 return ip_frag_intern(hash, qp); 377 return ip_frag_intern(hash, qp);
378 378
379out_nomem: 379out_nomem:
380 LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); 380 LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
381 return NULL; 381 return NULL;
382} 382}
383 383
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
533 if (skb->dev) 533 if (skb->dev)
534 qp->iif = skb->dev->ifindex; 534 qp->iif = skb->dev->ifindex;
535 skb->dev = NULL; 535 skb->dev = NULL;
536 qp->stamp = skb->stamp; 536 skb_get_timestamp(skb, &qp->stamp);
537 qp->meat += skb->len; 537 qp->meat += skb->len;
538 atomic_add(skb->truesize, &ip_frag_mem); 538 atomic_add(skb->truesize, &ip_frag_mem);
539 if (offset == 0) 539 if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
615 615
616 head->next = NULL; 616 head->next = NULL;
617 head->dev = dev; 617 head->dev = dev;
618 head->stamp = qp->stamp; 618 skb_set_timestamp(head, &qp->stamp);
619 619
620 iph = head->nh.iph; 620 iph = head->nh.iph;
621 iph->frag_off = 0; 621 iph->frag_off = 0;
@@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
625 return head; 625 return head;
626 626
627out_nomem: 627out_nomem:
628 LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " 628 LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
629 "queue %p\n", qp)); 629 "queue %p\n", qp);
630 goto out_fail; 630 goto out_fail;
631out_oversize: 631out_oversize:
632 if (net_ratelimit()) 632 if (net_ratelimit())
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bcd..473d0f2b2e0d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
150 * SNMP management statistics 150 * SNMP management statistics
151 */ 151 */
152 152
153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); 153DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
154 154
155/* 155/*
156 * Process Router Attention IP option 156 * Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
225 /* If there maybe a raw socket we must check - if not we 225 /* If there maybe a raw socket we must check - if not we
226 * don't care less 226 * don't care less
227 */ 227 */
228 if (raw_sk) 228 if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
229 raw_v4_input(skb, skb->nh.iph, hash); 229 raw_sk = NULL;
230 230
231 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { 231 if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
232 int ret; 232 int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
279 ip_local_deliver_finish); 279 ip_local_deliver_finish);
280} 280}
281 281
282static inline int ip_rcv_finish(struct sk_buff *skb) 282static inline int ip_rcv_options(struct sk_buff *skb)
283{ 283{
284 struct ip_options *opt;
285 struct iphdr *iph;
284 struct net_device *dev = skb->dev; 286 struct net_device *dev = skb->dev;
287
288 /* It looks as overkill, because not all
289 IP options require packet mangling.
290 But it is the easiest for now, especially taking
291 into account that combination of IP options
292 and running sniffer is extremely rare condition.
293 --ANK (980813)
294 */
295 if (skb_cow(skb, skb_headroom(skb))) {
296 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
297 goto drop;
298 }
299
300 iph = skb->nh.iph;
301
302 if (ip_options_compile(NULL, skb)) {
303 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
304 goto drop;
305 }
306
307 opt = &(IPCB(skb)->opt);
308 if (unlikely(opt->srr)) {
309 struct in_device *in_dev = in_dev_get(dev);
310 if (in_dev) {
311 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
312 if (IN_DEV_LOG_MARTIANS(in_dev) &&
313 net_ratelimit())
314 printk(KERN_INFO "source route option "
315 "%u.%u.%u.%u -> %u.%u.%u.%u\n",
316 NIPQUAD(iph->saddr),
317 NIPQUAD(iph->daddr));
318 in_dev_put(in_dev);
319 goto drop;
320 }
321
322 in_dev_put(in_dev);
323 }
324
325 if (ip_options_rcv_srr(skb))
326 goto drop;
327 }
328
329 return 0;
330drop:
331 return -1;
332}
333
334static inline int ip_rcv_finish(struct sk_buff *skb)
335{
285 struct iphdr *iph = skb->nh.iph; 336 struct iphdr *iph = skb->nh.iph;
286 int err;
287 337
288 /* 338 /*
289 * Initialise the virtual path cache for the packet. It describes 339 * Initialise the virtual path cache for the packet. It describes
290 * how the packet travels inside Linux networking. 340 * how the packet travels inside Linux networking.
291 */ 341 */
292 if (skb->dst == NULL) { 342 if (likely(skb->dst == NULL)) {
293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { 343 int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
344 skb->dev);
345 if (unlikely(err)) {
294 if (err == -EHOSTUNREACH) 346 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); 347 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
296 goto drop; 348 goto drop;
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
298 } 350 }
299 351
300#ifdef CONFIG_NET_CLS_ROUTE 352#ifdef CONFIG_NET_CLS_ROUTE
301 if (skb->dst->tclassid) { 353 if (unlikely(skb->dst->tclassid)) {
302 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); 354 struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
303 u32 idx = skb->dst->tclassid; 355 u32 idx = skb->dst->tclassid;
304 st[idx&0xFF].o_packets++; 356 st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
308 } 360 }
309#endif 361#endif
310 362
311 if (iph->ihl > 5) { 363 if (iph->ihl > 5 && ip_rcv_options(skb))
312 struct ip_options *opt; 364 goto drop;
313
314 /* It looks as overkill, because not all
315 IP options require packet mangling.
316 But it is the easiest for now, especially taking
317 into account that combination of IP options
318 and running sniffer is extremely rare condition.
319 --ANK (980813)
320 */
321
322 if (skb_cow(skb, skb_headroom(skb))) {
323 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
324 goto drop;
325 }
326 iph = skb->nh.iph;
327
328 if (ip_options_compile(NULL, skb))
329 goto inhdr_error;
330
331 opt = &(IPCB(skb)->opt);
332 if (opt->srr) {
333 struct in_device *in_dev = in_dev_get(dev);
334 if (in_dev) {
335 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
336 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
337 printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
338 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
339 in_dev_put(in_dev);
340 goto drop;
341 }
342 in_dev_put(in_dev);
343 }
344 if (ip_options_rcv_srr(skb))
345 goto drop;
346 }
347 }
348 365
349 return dst_input(skb); 366 return dst_input(skb);
350 367
351inhdr_error:
352 IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
353drop: 368drop:
354 kfree_skb(skb); 369 kfree_skb(skb);
355 return NET_RX_DROP; 370 return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
358/* 373/*
359 * Main IP Receive routine. 374 * Main IP Receive routine.
360 */ 375 */
361int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 376int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
362{ 377{
363 struct iphdr *iph; 378 struct iphdr *iph;
379 u32 len;
364 380
365 /* When the interface is in promisc. mode, drop all the crap 381 /* When the interface is in promisc. mode, drop all the crap
366 * that it receives, do not try to analyse it. 382 * that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
392 */ 408 */
393 409
394 if (iph->ihl < 5 || iph->version != 4) 410 if (iph->ihl < 5 || iph->version != 4)
395 goto inhdr_error; 411 goto inhdr_error;
396 412
397 if (!pskb_may_pull(skb, iph->ihl*4)) 413 if (!pskb_may_pull(skb, iph->ihl*4))
398 goto inhdr_error; 414 goto inhdr_error;
399 415
400 iph = skb->nh.iph; 416 iph = skb->nh.iph;
401 417
402 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) 418 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
403 goto inhdr_error; 419 goto inhdr_error;
404 420
405 { 421 len = ntohs(iph->tot_len);
406 __u32 len = ntohs(iph->tot_len); 422 if (skb->len < len || len < (iph->ihl*4))
407 if (skb->len < len || len < (iph->ihl<<2)) 423 goto inhdr_error;
408 goto inhdr_error;
409 424
410 /* Our transport medium may have padded the buffer out. Now we know it 425 /* Our transport medium may have padded the buffer out. Now we know it
411 * is IP we can trim to the true length of the frame. 426 * is IP we can trim to the true length of the frame.
412 * Note this now means skb->len holds ntohs(iph->tot_len). 427 * Note this now means skb->len holds ntohs(iph->tot_len).
413 */ 428 */
414 if (pskb_trim_rcsum(skb, len)) { 429 if (pskb_trim_rcsum(skb, len)) {
415 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); 430 IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
416 goto drop; 431 goto drop;
417 }
418 } 432 }
419 433
420 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, 434 return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
428 return NET_RX_DROP; 442 return NET_RX_DROP;
429} 443}
430 444
431EXPORT_SYMBOL(ip_rcv);
432EXPORT_SYMBOL(ip_statistics); 445EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e701..bce4e875193b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
489 } 489 }
490} 490}
491 491
492int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) 492static struct ip_options *ip_options_get_alloc(const int optlen)
493{ 493{
494 struct ip_options *opt; 494 struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
495 GFP_KERNEL);
496 if (opt)
497 memset(opt, 0, sizeof(*opt));
498 return opt;
499}
495 500
496 opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); 501static int ip_options_get_finish(struct ip_options **optp,
497 if (!opt) 502 struct ip_options *opt, int optlen)
498 return -ENOMEM; 503{
499 memset(opt, 0, sizeof(struct ip_options));
500 if (optlen) {
501 if (user) {
502 if (copy_from_user(opt->__data, data, optlen)) {
503 kfree(opt);
504 return -EFAULT;
505 }
506 } else
507 memcpy(opt->__data, data, optlen);
508 }
509 while (optlen & 3) 504 while (optlen & 3)
510 opt->__data[optlen++] = IPOPT_END; 505 opt->__data[optlen++] = IPOPT_END;
511 opt->optlen = optlen; 506 opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
521 return 0; 516 return 0;
522} 517}
523 518
519int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
520{
521 struct ip_options *opt = ip_options_get_alloc(optlen);
522
523 if (!opt)
524 return -ENOMEM;
525 if (optlen && copy_from_user(opt->__data, data, optlen)) {
526 kfree(opt);
527 return -EFAULT;
528 }
529 return ip_options_get_finish(optp, opt, optlen);
530}
531
532int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
533{
534 struct ip_options *opt = ip_options_get_alloc(optlen);
535
536 if (!opt)
537 return -ENOMEM;
538 if (optlen)
539 memcpy(opt->__data, data, optlen);
540 return ip_options_get_finish(optp, opt, optlen);
541}
542
524void ip_forward_options(struct sk_buff *skb) 543void ip_forward_options(struct sk_buff *skb)
525{ 544{
526 struct ip_options * opt = &(IPCB(skb)->opt); 545 struct ip_options * opt = &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
620 } 639 }
621 return 0; 640 return 0;
622} 641}
623
624EXPORT_SYMBOL(ip_options_compile);
625EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 80d13103b2b0..3f1a263e1249 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
69#include <net/ip.h> 69#include <net/ip.h>
70#include <net/protocol.h> 70#include <net/protocol.h>
71#include <net/route.h> 71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/udp.h>
74#include <linux/skbuff.h> 72#include <linux/skbuff.h>
75#include <net/sock.h> 73#include <net/sock.h>
76#include <net/arp.h> 74#include <net/arp.h>
77#include <net/icmp.h> 75#include <net/icmp.h>
78#include <net/raw.h>
79#include <net/checksum.h> 76#include <net/checksum.h>
80#include <net/inetpeer.h> 77#include <net/inetpeer.h>
81#include <net/checksum.h> 78#include <net/checksum.h>
@@ -84,12 +81,8 @@
84#include <linux/netfilter_bridge.h> 81#include <linux/netfilter_bridge.h>
85#include <linux/mroute.h> 82#include <linux/mroute.h>
86#include <linux/netlink.h> 83#include <linux/netlink.h>
84#include <linux/tcp.h>
87 85
88/*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92int sysctl_ip_dynaddr;
93int sysctl_ip_default_ttl = IPDEFTTL; 86int sysctl_ip_default_ttl = IPDEFTTL;
94 87
95/* Generate a checksum for an outgoing IP datagram. */ 88/* Generate a checksum for an outgoing IP datagram. */
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
165 dst_output); 158 dst_output);
166} 159}
167 160
161EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162
168static inline int ip_finish_output2(struct sk_buff *skb) 163static inline int ip_finish_output2(struct sk_buff *skb)
169{ 164{
170 struct dst_entry *dst = skb->dst; 165 struct dst_entry *dst = skb->dst;
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
205 return -EINVAL; 200 return -EINVAL;
206} 201}
207 202
208int ip_finish_output(struct sk_buff *skb) 203static inline int ip_finish_output(struct sk_buff *skb)
209{ 204{
210 struct net_device *dev = skb->dst->dev; 205 struct net_device *dev = skb->dst->dev;
211 206
@@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
329 if (ip_route_output_flow(&rt, &fl, sk, 0)) 324 if (ip_route_output_flow(&rt, &fl, sk, 0))
330 goto no_route; 325 goto no_route;
331 } 326 }
332 __sk_dst_set(sk, &rt->u.dst); 327 sk_setup_caps(sk, &rt->u.dst);
333 tcp_v4_setup_caps(sk, &rt->u.dst);
334 } 328 }
335 skb->dst = dst_clone(&rt->u.dst); 329 skb->dst = dst_clone(&rt->u.dst);
336 330
@@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
392#endif 386#endif
393#ifdef CONFIG_NETFILTER 387#ifdef CONFIG_NETFILTER
394 to->nfmark = from->nfmark; 388 to->nfmark = from->nfmark;
395 to->nfcache = from->nfcache;
396 /* Connection association is same as pre-frag packet */ 389 /* Connection association is same as pre-frag packet */
397 nf_conntrack_put(to->nfct); 390 nf_conntrack_put(to->nfct);
398 to->nfct = from->nfct; 391 to->nfct = from->nfct;
@@ -580,7 +573,7 @@ slow_path:
580 */ 573 */
581 574
582 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { 575 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
583 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); 576 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
584 err = -ENOMEM; 577 err = -ENOMEM;
585 goto fail; 578 goto fail;
586 } 579 }
@@ -1329,12 +1322,7 @@ void __init ip_init(void)
1329#endif 1322#endif
1330} 1323}
1331 1324
1332EXPORT_SYMBOL(ip_finish_output);
1333EXPORT_SYMBOL(ip_fragment); 1325EXPORT_SYMBOL(ip_fragment);
1334EXPORT_SYMBOL(ip_generic_getfrag); 1326EXPORT_SYMBOL(ip_generic_getfrag);
1335EXPORT_SYMBOL(ip_queue_xmit); 1327EXPORT_SYMBOL(ip_queue_xmit);
1336EXPORT_SYMBOL(ip_send_check); 1328EXPORT_SYMBOL(ip_send_check);
1337
1338#ifdef CONFIG_SYSCTL
1339EXPORT_SYMBOL(sysctl_ip_default_ttl);
1340#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ff4bd067b397..2f0b47da5b37 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
153 switch (cmsg->cmsg_type) { 153 switch (cmsg->cmsg_type) {
154 case IP_RETOPTS: 154 case IP_RETOPTS:
155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); 155 err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); 156 err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
157 if (err) 157 if (err)
158 return err; 158 return err;
159 break; 159 break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
425 struct ip_options * opt = NULL; 425 struct ip_options * opt = NULL;
426 if (optlen > 40 || optlen < 0) 426 if (optlen > 40 || optlen < 0)
427 goto e_inval; 427 goto e_inval;
428 err = ip_options_get(&opt, optval, optlen, 1); 428 err = ip_options_get_from_user(&opt, optval, optlen);
429 if (err) 429 if (err)
430 break; 430 break;
431 if (sk->sk_type == SOCK_STREAM) { 431 if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
614 } 614 }
615 case IP_MSFILTER: 615 case IP_MSFILTER:
616 { 616 {
617 extern int sysctl_optmem_max;
618 extern int sysctl_igmp_max_msf; 617 extern int sysctl_igmp_max_msf;
619 struct ip_msfilter *msf; 618 struct ip_msfilter *msf;
620 619
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
769 } 768 }
770 case MCAST_MSFILTER: 769 case MCAST_MSFILTER:
771 { 770 {
772 extern int sysctl_optmem_max;
773 extern int sysctl_igmp_max_msf; 771 extern int sysctl_igmp_max_msf;
774 struct sockaddr_in *psin; 772 struct sockaddr_in *psin;
775 struct ip_msfilter *msf = NULL; 773 struct ip_msfilter *msf = NULL;
@@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1090 1088
1091EXPORT_SYMBOL(ip_cmsg_recv); 1089EXPORT_SYMBOL(ip_cmsg_recv);
1092 1090
1093#ifdef CONFIG_IP_SCTP_MODULE
1094EXPORT_SYMBOL(ip_getsockopt); 1091EXPORT_SYMBOL(ip_getsockopt);
1095EXPORT_SYMBOL(ip_setsockopt); 1092EXPORT_SYMBOL(ip_setsockopt);
1096#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 7ded6e60f43a..dcb7ee6c4858 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
214 spi, IPPROTO_COMP, AF_INET); 214 spi, IPPROTO_COMP, AF_INET);
215 if (!x) 215 if (!x)
216 return; 216 return;
217 NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", 217 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
218 spi, NIPQUAD(iph->daddr))); 218 spi, NIPQUAD(iph->daddr));
219 xfrm_state_put(x); 219 xfrm_state_put(x);
220} 220}
221 221
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a3..63e106605f28 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -393,7 +393,7 @@ static int __init ic_defaults(void)
393 393
394#ifdef IPCONFIG_RARP 394#ifdef IPCONFIG_RARP
395 395
396static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); 396static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
397 397
398static struct packet_type rarp_packet_type __initdata = { 398static struct packet_type rarp_packet_type __initdata = {
399 .type = __constant_htons(ETH_P_RARP), 399 .type = __constant_htons(ETH_P_RARP),
@@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void)
414 * Process received RARP packet. 414 * Process received RARP packet.
415 */ 415 */
416static int __init 416static int __init
417ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 417ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
418{ 418{
419 struct arphdr *rarp; 419 struct arphdr *rarp;
420 unsigned char *rarp_ptr; 420 unsigned char *rarp_ptr;
@@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */
555#define DHCPRELEASE 7 555#define DHCPRELEASE 7
556#define DHCPINFORM 8 556#define DHCPINFORM 8
557 557
558static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); 558static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
559 559
560static struct packet_type bootp_packet_type __initdata = { 560static struct packet_type bootp_packet_type __initdata = {
561 .type = __constant_htons(ETH_P_IP), 561 .type = __constant_htons(ETH_P_IP),
@@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
823/* 823/*
824 * Receive BOOTP reply. 824 * Receive BOOTP reply.
825 */ 825 */
826static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) 826static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
827{ 827{
828 struct bootp_pkt *b; 828 struct bootp_pkt *b;
829 struct iphdr *h; 829 struct iphdr *h;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc806b578427..9dbf5909f3a6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
103 In this case data path is free of exclusive locks at all. 103 In this case data path is free of exclusive locks at all.
104 */ 104 */
105 105
106static kmem_cache_t *mrt_cachep; 106static kmem_cache_t *mrt_cachep __read_mostly;
107 107
108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); 108static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); 109static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd193..6e092dadb388 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
26#include <linux/in.h> 26#include <linux/in.h>
27#include <linux/ip.h> 27#include <linux/ip.h>
28#include <net/protocol.h> 28#include <net/protocol.h>
29#include <net/tcp.h>
29#include <asm/system.h> 30#include <asm/system.h>
30#include <linux/stat.h> 31#include <linux/stat.h>
31#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d0145a8b1551..e11952ea17af 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
40static struct list_head *ip_vs_conn_tab; 40static struct list_head *ip_vs_conn_tab;
41 41
42/* SLAB cache for IPVS connections */ 42/* SLAB cache for IPVS connections */
43static kmem_cache_t *ip_vs_conn_cachep; 43static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
44 44
45/* counter for current IPVS connections */ 45/* counter for current IPVS connections */
46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07cb..3ac7eeca04ac 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
22 * 22 *
23 * Changes: 23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs 24 * Paul `Rusty' Russell properly handle non-linear skbs
25 * Harald Welte don't use nfcache
25 * 26 *
26 */ 27 */
27 28
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
529 const struct net_device *out, 530 const struct net_device *out,
530 int (*okfn)(struct sk_buff *)) 531 int (*okfn)(struct sk_buff *))
531{ 532{
532 if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) 533 if (!((*pskb)->ipvs_property))
533 return NF_ACCEPT; 534 return NF_ACCEPT;
534 535
535 /* The packet was sent from IPVS, exit this chain */ 536 /* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
701 /* do the statistics and put it back */ 702 /* do the statistics and put it back */
702 ip_vs_out_stats(cp, skb); 703 ip_vs_out_stats(cp, skb);
703 704
704 skb->nfcache |= NFC_IPVS_PROPERTY; 705 skb->ipvs_property = 1;
705 verdict = NF_ACCEPT; 706 verdict = NF_ACCEPT;
706 707
707 out: 708 out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
739 740
740 EnterFunction(11); 741 EnterFunction(11);
741 742
742 if (skb->nfcache & NFC_IPVS_PROPERTY) 743 if (skb->ipvs_property)
743 return NF_ACCEPT; 744 return NF_ACCEPT;
744 745
745 iph = skb->nh.iph; 746 iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
821 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 822 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
822 ip_vs_conn_put(cp); 823 ip_vs_conn_put(cp);
823 824
824 skb->nfcache |= NFC_IPVS_PROPERTY; 825 skb->ipvs_property = 1;
825 826
826 LeaveFunction(11); 827 LeaveFunction(11);
827 return NF_ACCEPT; 828 return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7d99ede2ef79..2d66848e7aa0 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = {
1598 { .ctl_name = 0 } 1598 { .ctl_name = 0 }
1599}; 1599};
1600 1600
1601static ctl_table ipv4_table[] = { 1601static ctl_table ipvs_ipv4_table[] = {
1602 { 1602 {
1603 .ctl_name = NET_IPV4, 1603 .ctl_name = NET_IPV4,
1604 .procname = "ipv4", 1604 .procname = "ipv4",
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = {
1613 .ctl_name = CTL_NET, 1613 .ctl_name = CTL_NET,
1614 .procname = "net", 1614 .procname = "net",
1615 .mode = 0555, 1615 .mode = 0555,
1616 .child = ipv4_table, 1616 .child = ipvs_ipv4_table,
1617 }, 1617 },
1618 { .ctl_name = 0 } 1618 { .ctl_name = 0 }
1619}; 1619};
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780a..561cda326fa8 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
131 { .ctl_name = 0 } 131 { .ctl_name = 0 }
132}; 132};
133 133
134static ctl_table ipv4_table[] = { 134static ctl_table ipvs_ipv4_table[] = {
135 { 135 {
136 .ctl_name = NET_IPV4, 136 .ctl_name = NET_IPV4,
137 .procname = "ipv4", 137 .procname = "ipv4",
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
146 .ctl_name = CTL_NET, 146 .ctl_name = CTL_NET,
147 .procname = "net", 147 .procname = "net",
148 .mode = 0555, 148 .mode = 0555,
149 .child = ipv4_table 149 .child = ipvs_ipv4_table
150 }, 150 },
151 { .ctl_name = 0 } 151 { .ctl_name = 0 }
152}; 152};
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d271..ce456dbf09a5 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
320 { .ctl_name = 0 } 320 { .ctl_name = 0 }
321}; 321};
322 322
323static ctl_table ipv4_table[] = { 323static ctl_table ipvs_ipv4_table[] = {
324 { 324 {
325 .ctl_name = NET_IPV4, 325 .ctl_name = NET_IPV4,
326 .procname = "ipv4", 326 .procname = "ipv4",
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
335 .ctl_name = CTL_NET, 335 .ctl_name = CTL_NET,
336 .procname = "net", 336 .procname = "net",
337 .mode = 0555, 337 .mode = 0555,
338 .child = ipv4_table 338 .child = ipvs_ipv4_table
339 }, 339 },
340 { .ctl_name = 0 } 340 { .ctl_name = 0 }
341}; 341};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da74..c19408973c09 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
604} 604}
605 605
606 606
607static void tcp_init(struct ip_vs_protocol *pp) 607static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
608{ 608{
609 IP_VS_INIT_HASH_TABLE(tcp_apps); 609 IP_VS_INIT_HASH_TABLE(tcp_apps);
610 pp->timeout_table = tcp_timeouts; 610 pp->timeout_table = tcp_timeouts;
611} 611}
612 612
613 613
614static void tcp_exit(struct ip_vs_protocol *pp) 614static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
615{ 615{
616} 616}
617 617
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
621 .protocol = IPPROTO_TCP, 621 .protocol = IPPROTO_TCP,
622 .dont_defrag = 0, 622 .dont_defrag = 0,
623 .appcnt = ATOMIC_INIT(0), 623 .appcnt = ATOMIC_INIT(0),
624 .init = tcp_init, 624 .init = ip_vs_tcp_init,
625 .exit = tcp_exit, 625 .exit = ip_vs_tcp_exit,
626 .register_app = tcp_register_app, 626 .register_app = tcp_register_app,
627 .unregister_app = tcp_unregister_app, 627 .unregister_app = tcp_unregister_app,
628 .conn_schedule = tcp_conn_schedule, 628 .conn_schedule = tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08a..3b87482049cf 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->ipvs_property = 1; \
131 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
133 (rt)->u.dst.dev, dst_output); \ 133 (rt)->u.dst.dev, dst_output); \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051d..db67373f9b34 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
107 return NOTIFY_DONE; 107 return NOTIFY_DONE;
108} 108}
109 109
110struct notifier_block drr_dev_notifier = { 110static struct notifier_block drr_dev_notifier = {
111 .notifier_call = drr_dev_event, 111 .notifier_call = drr_dev_event,
112}; 112};
113 113
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 000000000000..ae0779d82c5d
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
1/* IPv4 specific functions of netfilter core */
2
3#include <linux/config.h>
4#ifdef CONFIG_NETFILTER
5
6#include <linux/kernel.h>
7#include <linux/netfilter.h>
8#include <linux/netfilter_ipv4.h>
9
10#include <linux/tcp.h>
11#include <linux/udp.h>
12#include <linux/icmp.h>
13#include <net/route.h>
14#include <linux/ip.h>
15
16/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
17int ip_route_me_harder(struct sk_buff **pskb)
18{
19 struct iphdr *iph = (*pskb)->nh.iph;
20 struct rtable *rt;
21 struct flowi fl = {};
22 struct dst_entry *odst;
23 unsigned int hh_len;
24
25 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
26 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
27 */
28 if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
29 fl.nl_u.ip4_u.daddr = iph->daddr;
30 fl.nl_u.ip4_u.saddr = iph->saddr;
31 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
32 fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
33#ifdef CONFIG_IP_ROUTE_FWMARK
34 fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
35#endif
36 fl.proto = iph->protocol;
37 if (ip_route_output_key(&rt, &fl) != 0)
38 return -1;
39
40 /* Drop old route. */
41 dst_release((*pskb)->dst);
42 (*pskb)->dst = &rt->u.dst;
43 } else {
44 /* non-local src, find valid iif to satisfy
45 * rp-filter when calling ip_route_input. */
46 fl.nl_u.ip4_u.daddr = iph->saddr;
47 if (ip_route_output_key(&rt, &fl) != 0)
48 return -1;
49
50 odst = (*pskb)->dst;
51 if (ip_route_input(*pskb, iph->daddr, iph->saddr,
52 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
53 dst_release(&rt->u.dst);
54 return -1;
55 }
56 dst_release(&rt->u.dst);
57 dst_release(odst);
58 }
59
60 if ((*pskb)->dst->error)
61 return -1;
62
63 /* Change in oif may mean change in hh_len. */
64 hh_len = (*pskb)->dst->dev->hard_header_len;
65 if (skb_headroom(*pskb) < hh_len) {
66 struct sk_buff *nskb;
67
68 nskb = skb_realloc_headroom(*pskb, hh_len);
69 if (!nskb)
70 return -1;
71 if ((*pskb)->sk)
72 skb_set_owner_w(nskb, (*pskb)->sk);
73 kfree_skb(*pskb);
74 *pskb = nskb;
75 }
76
77 return 0;
78}
79EXPORT_SYMBOL(ip_route_me_harder);
80
81/*
82 * Extra routing may be needed on local out, as the QUEUE target never
83 * returns control to the table.
84 */
85
86struct ip_rt_info {
87 u_int32_t daddr;
88 u_int32_t saddr;
89 u_int8_t tos;
90};
91
92static void queue_save(const struct sk_buff *skb, struct nf_info *info)
93{
94 struct ip_rt_info *rt_info = nf_info_reroute(info);
95
96 if (info->hook == NF_IP_LOCAL_OUT) {
97 const struct iphdr *iph = skb->nh.iph;
98
99 rt_info->tos = iph->tos;
100 rt_info->daddr = iph->daddr;
101 rt_info->saddr = iph->saddr;
102 }
103}
104
105static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
106{
107 const struct ip_rt_info *rt_info = nf_info_reroute(info);
108
109 if (info->hook == NF_IP_LOCAL_OUT) {
110 struct iphdr *iph = (*pskb)->nh.iph;
111
112 if (!(iph->tos == rt_info->tos
113 && iph->daddr == rt_info->daddr
114 && iph->saddr == rt_info->saddr))
115 return ip_route_me_harder(pskb);
116 }
117 return 0;
118}
119
120static struct nf_queue_rerouter ip_reroute = {
121 .rer_size = sizeof(struct ip_rt_info),
122 .save = queue_save,
123 .reroute = queue_reroute,
124};
125
126static int init(void)
127{
128 return nf_register_queue_rerouter(PF_INET, &ip_reroute);
129}
130
131static void fini(void)
132{
133 nf_unregister_queue_rerouter(PF_INET);
134}
135
136module_init(init);
137module_exit(fini);
138
139#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f0..e046f5521814 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK
40 of packets, but this mark value is kept in the conntrack session 40 of packets, but this mark value is kept in the conntrack session
41 instead of the individual packets. 41 instead of the individual packets.
42 42
43config IP_NF_CONNTRACK_EVENTS
44 bool "Connection tracking events"
45 depends on IP_NF_CONNTRACK
46 help
47 If this option is enabled, the connection tracking code will
48 provide a notifier chain that can be used by other kernel code
49 to get notified about changes in the connection tracking state.
50
51 If unsure, say `N'.
52
43config IP_NF_CT_PROTO_SCTP 53config IP_NF_CT_PROTO_SCTP
44 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' 54 tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
45 depends on IP_NF_CONNTRACK && EXPERIMENTAL 55 depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -100,11 +110,15 @@ config IP_NF_AMANDA
100 To compile it as a module, choose M here. If unsure, say Y. 110 To compile it as a module, choose M here. If unsure, say Y.
101 111
102config IP_NF_QUEUE 112config IP_NF_QUEUE
103 tristate "Userspace queueing via NETLINK" 113 tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
104 help 114 help
105 Netfilter has the ability to queue packets to user space: the 115 Netfilter has the ability to queue packets to user space: the
106 netlink device can be used to access them using this driver. 116 netlink device can be used to access them using this driver.
107 117
118 This option enables the old IPv4-only "ip_queue" implementation
119 which has been obsoleted by the new "nfnetlink_queue" code (see
120 CONFIG_NETFILTER_NETLINK_QUEUE).
121
108 To compile it as a module, choose M here. If unsure, say N. 122 To compile it as a module, choose M here. If unsure, say N.
109 123
110config IP_NF_IPTABLES 124config IP_NF_IPTABLES
@@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP
340 If you want to compile it as a module, say M here and read 354 If you want to compile it as a module, say M here and read
341 <file:Documentation/modules.txt>. If unsure, say `N'. 355 <file:Documentation/modules.txt>. If unsure, say `N'.
342 356
357config IP_NF_MATCH_DCCP
358 tristate 'DCCP protocol match support'
359 depends on IP_NF_IPTABLES
360 help
361 With this option enabled, you will be able to use the iptables
362 `dccp' match in order to match on DCCP source/destination ports
363 and DCCP flags.
364
365 If you want to compile it as a module, say M here and read
366 <file:Documentation/modules.txt>. If unsure, say `N'.
367
343config IP_NF_MATCH_COMMENT 368config IP_NF_MATCH_COMMENT
344 tristate 'comment match support' 369 tristate 'comment match support'
345 depends on IP_NF_IPTABLES 370 depends on IP_NF_IPTABLES
@@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK
361 <file:Documentation/modules.txt>. The module will be called 386 <file:Documentation/modules.txt>. The module will be called
362 ipt_connmark.o. If unsure, say `N'. 387 ipt_connmark.o. If unsure, say `N'.
363 388
389config IP_NF_MATCH_CONNBYTES
390 tristate 'Connection byte/packet counter match support'
391 depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
392 help
393 This option adds a `connbytes' match, which allows you to match the
394 number of bytes and/or packets for each direction within a connection.
395
396 If you want to compile it as a module, say M here and read
397 <file:Documentation/modules.txt>. If unsure, say `N'.
398
364config IP_NF_MATCH_HASHLIMIT 399config IP_NF_MATCH_HASHLIMIT
365 tristate 'hashlimit match support' 400 tristate 'hashlimit match support'
366 depends on IP_NF_IPTABLES 401 depends on IP_NF_IPTABLES
@@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT
375 destination IP' or `500pps from any given source IP' with a single 410 destination IP' or `500pps from any given source IP' with a single
376 IPtables rule. 411 IPtables rule.
377 412
413config IP_NF_MATCH_STRING
414 tristate 'string match support'
415 depends on IP_NF_IPTABLES
416 select TEXTSEARCH
417 select TEXTSEARCH_KMP
418 select TEXTSEARCH_BM
419 select TEXTSEARCH_FSM
420 help
421 This option adds a `string' match, which allows you to look for
422 pattern matches in packets.
423
424 To compile it as a module, choose M here. If unsure, say N.
425
378# `filter', generic and specific targets 426# `filter', generic and specific targets
379config IP_NF_FILTER 427config IP_NF_FILTER
380 tristate "Packet filtering" 428 tristate "Packet filtering"
@@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY
616 664
617 To compile it as a module, choose M here. If unsure, say N. 665 To compile it as a module, choose M here. If unsure, say N.
618 666
667config IP_NF_TARGET_TTL
668 tristate 'TTL target support'
669 depends on IP_NF_MANGLE
670 help
671 This option adds a `TTL' target, which enables the user to modify
672 the TTL value of the IP header.
673
674 While it is safe to decrement/lower the TTL, this target also enables
675 functionality to increment and set the TTL value of the IP header to
676 arbitrary values. This is EXTREMELY DANGEROUS since you can easily
677 create immortal packets that loop forever on the network.
678
679 To compile it as a module, choose M here. If unsure, say N.
680
619config IP_NF_TARGET_CONNMARK 681config IP_NF_TARGET_CONNMARK
620 tristate 'CONNMARK target support' 682 tristate 'CONNMARK target support'
621 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE 683 depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
@@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE
692 Allows altering the ARP packet payload: source and destination 754 Allows altering the ARP packet payload: source and destination
693 hardware and network addresses. 755 hardware and network addresses.
694 756
757config IP_NF_CONNTRACK_NETLINK
758 tristate 'Connection tracking netlink interface'
759 depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
760 help
761 This option enables support for a netlink-based userspace interface to connection tracking.
762
695endmenu 763endmenu
696 764
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924dd..a7bd38f50522 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe
9# connection tracking 9# connection tracking
10obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o 10obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
11 11
12# conntrack netlink interface
13obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
14
15
12# SCTP protocol connection tracking 16# SCTP protocol connection tracking
13obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o 17obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
14 18
@@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
38obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o 42obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
39obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o 43obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
40obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o 44obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
45obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
41obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o 46obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
42obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o 47obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
43obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o 48obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
54obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o 59obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
55obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o 60obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
56obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o 61obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
62obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
57obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o 63obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
58obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o 64obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
59obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o 65obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
60obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o 66obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
61obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o 67obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
68obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
62 69
63# targets 70# targets
64obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 71obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
78obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o 85obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
79obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o 86obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
80obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 87obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
88obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
81 89
82# generic ARP tables 90# generic ARP tables
83obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o 91obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
87obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o 95obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
88 96
89obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o 97obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
98obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 01e1b58322a9..be4c9eb3243f 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
40static char *conns[] = { "DATA ", "MESG ", "INDEX " }; 40static char *conns[] = { "DATA ", "MESG ", "INDEX " };
41 41
42/* This is slow, but it's simple. --RR */ 42/* This is slow, but it's simple. --RR */
43static char amanda_buffer[65536]; 43static char *amanda_buffer;
44static DEFINE_SPINLOCK(amanda_buffer_lock); 44static DEFINE_SPINLOCK(amanda_buffer_lock);
45 45
46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = {
153static void __exit fini(void) 153static void __exit fini(void)
154{ 154{
155 ip_conntrack_helper_unregister(&amanda_helper); 155 ip_conntrack_helper_unregister(&amanda_helper);
156 kfree(amanda_buffer);
156} 157}
157 158
158static int __init init(void) 159static int __init init(void)
159{ 160{
160 return ip_conntrack_helper_register(&amanda_helper); 161 int ret;
162
163 amanda_buffer = kmalloc(65536, GFP_KERNEL);
164 if (!amanda_buffer)
165 return -ENOMEM;
166
167 ret = ip_conntrack_helper_register(&amanda_helper);
168 if (ret < 0) {
169 kfree(amanda_buffer);
170 return ret;
171 }
172 return 0;
173
174
161} 175}
162 176
163module_init(init); 177module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7f0c821a9b2..a0648600190e 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
37#include <linux/err.h> 37#include <linux/err.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/notifier.h>
40 41
41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected 42/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/ 43 registrations, conntrack timers*/
@@ -49,7 +50,7 @@
49#include <linux/netfilter_ipv4/ip_conntrack_core.h> 50#include <linux/netfilter_ipv4/ip_conntrack_core.h>
50#include <linux/netfilter_ipv4/listhelp.h> 51#include <linux/netfilter_ipv4/listhelp.h>
51 52
52#define IP_CONNTRACK_VERSION "2.1" 53#define IP_CONNTRACK_VERSION "2.3"
53 54
54#if 0 55#if 0
55#define DEBUGP printk 56#define DEBUGP printk
@@ -69,22 +70,81 @@ static LIST_HEAD(helpers);
69unsigned int ip_conntrack_htable_size = 0; 70unsigned int ip_conntrack_htable_size = 0;
70int ip_conntrack_max; 71int ip_conntrack_max;
71struct list_head *ip_conntrack_hash; 72struct list_head *ip_conntrack_hash;
72static kmem_cache_t *ip_conntrack_cachep; 73static kmem_cache_t *ip_conntrack_cachep __read_mostly;
73static kmem_cache_t *ip_conntrack_expect_cachep; 74static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
74struct ip_conntrack ip_conntrack_untracked; 75struct ip_conntrack ip_conntrack_untracked;
75unsigned int ip_ct_log_invalid; 76unsigned int ip_ct_log_invalid;
76static LIST_HEAD(unconfirmed); 77static LIST_HEAD(unconfirmed);
77static int ip_conntrack_vmalloc; 78static int ip_conntrack_vmalloc;
78 79
79DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); 80static unsigned int ip_conntrack_next_id = 1;
81static unsigned int ip_conntrack_expect_next_id = 1;
82#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83struct notifier_block *ip_conntrack_chain;
84struct notifier_block *ip_conntrack_expect_chain;
85
86DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
80 87
81void 88/* deliver cached events and clear cache entry - must be called with locally
82ip_conntrack_put(struct ip_conntrack *ct) 89 * disabled softirqs */
90static inline void
91__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
83{ 92{
84 IP_NF_ASSERT(ct); 93 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
85 nf_conntrack_put(&ct->ct_general); 94 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96 ecache->ct);
97 ecache->events = 0;
98 ip_conntrack_put(ecache->ct);
99 ecache->ct = NULL;
86} 100}
87 101
102/* Deliver all cached events for a particular conntrack. This is called
103 * by code prior to async packet handling or freeing the skb */
104void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105{
106 struct ip_conntrack_ecache *ecache;
107
108 local_bh_disable();
109 ecache = &__get_cpu_var(ip_conntrack_ecache);
110 if (ecache->ct == ct)
111 __ip_ct_deliver_cached_events(ecache);
112 local_bh_enable();
113}
114
115void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116{
117 struct ip_conntrack_ecache *ecache;
118
119 /* take care of delivering potentially old events */
120 ecache = &__get_cpu_var(ip_conntrack_ecache);
121 BUG_ON(ecache->ct == ct);
122 if (ecache->ct)
123 __ip_ct_deliver_cached_events(ecache);
124 /* initialize for this conntrack/packet */
125 ecache->ct = ct;
126 nf_conntrack_get(&ct->ct_general);
127}
128
129/* flush the event cache - touches other CPU's data and must not be called while
130 * packets are still passing through the code */
131static void ip_ct_event_cache_flush(void)
132{
133 struct ip_conntrack_ecache *ecache;
134 int cpu;
135
136 for_each_cpu(cpu) {
137 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138 if (ecache->ct)
139 ip_conntrack_put(ecache->ct);
140 }
141}
142#else
143static inline void ip_ct_event_cache_flush(void) {}
144#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
88static int ip_conntrack_hash_rnd_initted; 148static int ip_conntrack_hash_rnd_initted;
89static unsigned int ip_conntrack_hash_rnd; 149static unsigned int ip_conntrack_hash_rnd;
90 150
@@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp)
144 list_del(&exp->list); 204 list_del(&exp->list);
145 CONNTRACK_STAT_INC(expect_delete); 205 CONNTRACK_STAT_INC(expect_delete);
146 exp->master->expecting--; 206 exp->master->expecting--;
207 ip_conntrack_expect_put(exp);
208}
209
210void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211{
212 unlink_expect(exp);
213 ip_conntrack_expect_put(exp);
147} 214}
148 215
149static void expectation_timed_out(unsigned long ul_expect) 216static void expectation_timed_out(unsigned long ul_expect)
@@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect)
156 ip_conntrack_expect_put(exp); 223 ip_conntrack_expect_put(exp);
157} 224}
158 225
226struct ip_conntrack_expect *
227__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228{
229 struct ip_conntrack_expect *i;
230
231 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233 atomic_inc(&i->use);
234 return i;
235 }
236 }
237 return NULL;
238}
239
240/* Just find a expectation corresponding to a tuple. */
241struct ip_conntrack_expect *
242ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243{
244 struct ip_conntrack_expect *i;
245
246 read_lock_bh(&ip_conntrack_lock);
247 i = __ip_conntrack_expect_find(tuple);
248 read_unlock_bh(&ip_conntrack_lock);
249
250 return i;
251}
252
159/* If an expectation for this connection is found, it gets delete from 253/* If an expectation for this connection is found, it gets delete from
160 * global list then returned. */ 254 * global list then returned. */
161static struct ip_conntrack_expect * 255static struct ip_conntrack_expect *
@@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
180} 274}
181 275
182/* delete all expectations for this conntrack */ 276/* delete all expectations for this conntrack */
183static void remove_expectations(struct ip_conntrack *ct) 277void ip_ct_remove_expectations(struct ip_conntrack *ct)
184{ 278{
185 struct ip_conntrack_expect *i, *tmp; 279 struct ip_conntrack_expect *i, *tmp;
186 280
@@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct)
210 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); 304 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
211 305
212 /* Destroy all pending expectations */ 306 /* Destroy all pending expectations */
213 remove_expectations(ct); 307 ip_ct_remove_expectations(ct);
214} 308}
215 309
216static void 310static void
@@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
223 IP_NF_ASSERT(atomic_read(&nfct->use) == 0); 317 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
224 IP_NF_ASSERT(!timer_pending(&ct->timeout)); 318 IP_NF_ASSERT(!timer_pending(&ct->timeout));
225 319
320 ip_conntrack_event(IPCT_DESTROY, ct);
321 set_bit(IPS_DYING_BIT, &ct->status);
322
226 /* To make sure we don't get any weird locking issues here: 323 /* To make sure we don't get any weird locking issues here:
227 * destroy_conntrack() MUST NOT be called with a write lock 324 * destroy_conntrack() MUST NOT be called with a write lock
228 * to ip_conntrack_lock!!! -HW */ 325 * to ip_conntrack_lock!!! -HW */
229 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); 326 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
230 if (proto && proto->destroy) 327 if (proto && proto->destroy)
231 proto->destroy(ct); 328 proto->destroy(ct);
232 329
@@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
238 * except TFTP can create an expectation on the first packet, 335 * except TFTP can create an expectation on the first packet,
239 * before connection is in the list, so we need to clean here, 336 * before connection is in the list, so we need to clean here,
240 * too. */ 337 * too. */
241 remove_expectations(ct); 338 ip_ct_remove_expectations(ct);
242 339
243 /* We overload first tuple to link into unconfirmed list. */ 340 /* We overload first tuple to link into unconfirmed list. */
244 if (!is_confirmed(ct)) { 341 if (!is_confirmed(ct)) {
@@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
253 ip_conntrack_put(ct->master); 350 ip_conntrack_put(ct->master);
254 351
255 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); 352 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
256 kmem_cache_free(ip_conntrack_cachep, ct); 353 ip_conntrack_free(ct);
257 atomic_dec(&ip_conntrack_count);
258} 354}
259 355
260static void death_by_timeout(unsigned long ul_conntrack) 356static void death_by_timeout(unsigned long ul_conntrack)
@@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
280 && ip_ct_tuple_equal(tuple, &i->tuple); 376 && ip_ct_tuple_equal(tuple, &i->tuple);
281} 377}
282 378
283static struct ip_conntrack_tuple_hash * 379struct ip_conntrack_tuple_hash *
284__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, 380__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
285 const struct ip_conntrack *ignored_conntrack) 381 const struct ip_conntrack *ignored_conntrack)
286{ 382{
@@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
315 return h; 411 return h;
316} 412}
317 413
414static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
415 unsigned int hash,
416 unsigned int repl_hash)
417{
418 ct->id = ++ip_conntrack_next_id;
419 list_prepend(&ip_conntrack_hash[hash],
420 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
421 list_prepend(&ip_conntrack_hash[repl_hash],
422 &ct->tuplehash[IP_CT_DIR_REPLY].list);
423}
424
425void ip_conntrack_hash_insert(struct ip_conntrack *ct)
426{
427 unsigned int hash, repl_hash;
428
429 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
430 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
431
432 write_lock_bh(&ip_conntrack_lock);
433 __ip_conntrack_hash_insert(ct, hash, repl_hash);
434 write_unlock_bh(&ip_conntrack_lock);
435}
436
318/* Confirm a connection given skb; places it in hash table */ 437/* Confirm a connection given skb; places it in hash table */
319int 438int
320__ip_conntrack_confirm(struct sk_buff **pskb) 439__ip_conntrack_confirm(struct sk_buff **pskb)
@@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
361 /* Remove from unconfirmed list */ 480 /* Remove from unconfirmed list */
362 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); 481 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
363 482
364 list_prepend(&ip_conntrack_hash[hash], 483 __ip_conntrack_hash_insert(ct, hash, repl_hash);
365 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
366 list_prepend(&ip_conntrack_hash[repl_hash],
367 &ct->tuplehash[IP_CT_DIR_REPLY]);
368 /* Timer relative to confirmation time, not original 484 /* Timer relative to confirmation time, not original
369 setting time, otherwise we'd get timer wrap in 485 setting time, otherwise we'd get timer wrap in
370 weird delay cases. */ 486 weird delay cases. */
@@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
374 set_bit(IPS_CONFIRMED_BIT, &ct->status); 490 set_bit(IPS_CONFIRMED_BIT, &ct->status);
375 CONNTRACK_STAT_INC(insert); 491 CONNTRACK_STAT_INC(insert);
376 write_unlock_bh(&ip_conntrack_lock); 492 write_unlock_bh(&ip_conntrack_lock);
493 if (ct->helper)
494 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
495#ifdef CONFIG_IP_NF_NAT_NEEDED
496 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
497 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
498 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
499#endif
500 ip_conntrack_event_cache(master_ct(ct) ?
501 IPCT_RELATED : IPCT_NEW, *pskb);
502
377 return NF_ACCEPT; 503 return NF_ACCEPT;
378 } 504 }
379 505
@@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
438 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); 564 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
439} 565}
440 566
441static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) 567static struct ip_conntrack_helper *
568__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
442{ 569{
443 return LIST_FIND(&helpers, helper_cmp, 570 return LIST_FIND(&helpers, helper_cmp,
444 struct ip_conntrack_helper *, 571 struct ip_conntrack_helper *,
445 tuple); 572 tuple);
446} 573}
447 574
448/* Allocate a new conntrack: we return -ENOMEM if classification 575struct ip_conntrack_helper *
449 failed due to stress. Otherwise it really is unclassifiable. */ 576ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
450static struct ip_conntrack_tuple_hash * 577{
451init_conntrack(const struct ip_conntrack_tuple *tuple, 578 struct ip_conntrack_helper *helper;
452 struct ip_conntrack_protocol *protocol, 579
453 struct sk_buff *skb) 580 /* need ip_conntrack_lock to assure that helper exists until
581 * try_module_get() is called */
582 read_lock_bh(&ip_conntrack_lock);
583
584 helper = __ip_conntrack_helper_find(tuple);
585 if (helper) {
586 /* need to increase module usage count to assure helper will
587 * not go away while the caller is e.g. busy putting a
588 * conntrack in the hash that uses the helper */
589 if (!try_module_get(helper->me))
590 helper = NULL;
591 }
592
593 read_unlock_bh(&ip_conntrack_lock);
594
595 return helper;
596}
597
598void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
599{
600 module_put(helper->me);
601}
602
603struct ip_conntrack_protocol *
604__ip_conntrack_proto_find(u_int8_t protocol)
605{
606 return ip_ct_protos[protocol];
607}
608
609/* this is guaranteed to always return a valid protocol helper, since
610 * it falls back to generic_protocol */
611struct ip_conntrack_protocol *
612ip_conntrack_proto_find_get(u_int8_t protocol)
613{
614 struct ip_conntrack_protocol *p;
615
616 preempt_disable();
617 p = __ip_conntrack_proto_find(protocol);
618 if (p) {
619 if (!try_module_get(p->me))
620 p = &ip_conntrack_generic_protocol;
621 }
622 preempt_enable();
623
624 return p;
625}
626
627void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
628{
629 module_put(p->me);
630}
631
632struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
633 struct ip_conntrack_tuple *repl)
454{ 634{
455 struct ip_conntrack *conntrack; 635 struct ip_conntrack *conntrack;
456 struct ip_conntrack_tuple repl_tuple;
457 size_t hash;
458 struct ip_conntrack_expect *exp;
459 636
460 if (!ip_conntrack_hash_rnd_initted) { 637 if (!ip_conntrack_hash_rnd_initted) {
461 get_random_bytes(&ip_conntrack_hash_rnd, 4); 638 get_random_bytes(&ip_conntrack_hash_rnd, 4);
462 ip_conntrack_hash_rnd_initted = 1; 639 ip_conntrack_hash_rnd_initted = 1;
463 } 640 }
464 641
465 hash = hash_conntrack(tuple);
466
467 if (ip_conntrack_max 642 if (ip_conntrack_max
468 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { 643 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
644 unsigned int hash = hash_conntrack(orig);
469 /* Try dropping from this hash chain. */ 645 /* Try dropping from this hash chain. */
470 if (!early_drop(&ip_conntrack_hash[hash])) { 646 if (!early_drop(&ip_conntrack_hash[hash])) {
471 if (net_ratelimit()) 647 if (net_ratelimit())
@@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
476 } 652 }
477 } 653 }
478 654
479 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
480 DEBUGP("Can't invert tuple.\n");
481 return NULL;
482 }
483
484 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); 655 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
485 if (!conntrack) { 656 if (!conntrack) {
486 DEBUGP("Can't allocate conntrack.\n"); 657 DEBUGP("Can't allocate conntrack.\n");
@@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
490 memset(conntrack, 0, sizeof(*conntrack)); 661 memset(conntrack, 0, sizeof(*conntrack));
491 atomic_set(&conntrack->ct_general.use, 1); 662 atomic_set(&conntrack->ct_general.use, 1);
492 conntrack->ct_general.destroy = destroy_conntrack; 663 conntrack->ct_general.destroy = destroy_conntrack;
493 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; 664 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
494 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; 665 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
495 if (!protocol->new(conntrack, skb)) {
496 kmem_cache_free(ip_conntrack_cachep, conntrack);
497 return NULL;
498 }
499 /* Don't set timer yet: wait for confirmation */ 666 /* Don't set timer yet: wait for confirmation */
500 init_timer(&conntrack->timeout); 667 init_timer(&conntrack->timeout);
501 conntrack->timeout.data = (unsigned long)conntrack; 668 conntrack->timeout.data = (unsigned long)conntrack;
502 conntrack->timeout.function = death_by_timeout; 669 conntrack->timeout.function = death_by_timeout;
503 670
671 atomic_inc(&ip_conntrack_count);
672
673 return conntrack;
674}
675
676void
677ip_conntrack_free(struct ip_conntrack *conntrack)
678{
679 atomic_dec(&ip_conntrack_count);
680 kmem_cache_free(ip_conntrack_cachep, conntrack);
681}
682
683/* Allocate a new conntrack: we return -ENOMEM if classification
684 * failed due to stress. Otherwise it really is unclassifiable */
685static struct ip_conntrack_tuple_hash *
686init_conntrack(struct ip_conntrack_tuple *tuple,
687 struct ip_conntrack_protocol *protocol,
688 struct sk_buff *skb)
689{
690 struct ip_conntrack *conntrack;
691 struct ip_conntrack_tuple repl_tuple;
692 struct ip_conntrack_expect *exp;
693
694 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
695 DEBUGP("Can't invert tuple.\n");
696 return NULL;
697 }
698
699 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
700 if (conntrack == NULL || IS_ERR(conntrack))
701 return (struct ip_conntrack_tuple_hash *)conntrack;
702
703 if (!protocol->new(conntrack, skb)) {
704 ip_conntrack_free(conntrack);
705 return NULL;
706 }
707
504 write_lock_bh(&ip_conntrack_lock); 708 write_lock_bh(&ip_conntrack_lock);
505 exp = find_expectation(tuple); 709 exp = find_expectation(tuple);
506 710
@@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
521 nf_conntrack_get(&conntrack->master->ct_general); 725 nf_conntrack_get(&conntrack->master->ct_general);
522 CONNTRACK_STAT_INC(expect_new); 726 CONNTRACK_STAT_INC(expect_new);
523 } else { 727 } else {
524 conntrack->helper = ip_ct_find_helper(&repl_tuple); 728 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
525 729
526 CONNTRACK_STAT_INC(new); 730 CONNTRACK_STAT_INC(new);
527 } 731 }
@@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
529 /* Overload tuple linked list to put us in unconfirmed list. */ 733 /* Overload tuple linked list to put us in unconfirmed list. */
530 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); 734 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
531 735
532 atomic_inc(&ip_conntrack_count);
533 write_unlock_bh(&ip_conntrack_lock); 736 write_unlock_bh(&ip_conntrack_lock);
534 737
535 if (exp) { 738 if (exp) {
@@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
607 struct ip_conntrack *ct; 810 struct ip_conntrack *ct;
608 enum ip_conntrack_info ctinfo; 811 enum ip_conntrack_info ctinfo;
609 struct ip_conntrack_protocol *proto; 812 struct ip_conntrack_protocol *proto;
610 int set_reply; 813 int set_reply = 0;
611 int ret; 814 int ret;
612 815
613 /* Previously seen (loopback or untracked)? Ignore. */ 816 /* Previously seen (loopback or untracked)? Ignore. */
@@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
625 return NF_DROP; 828 return NF_DROP;
626 } 829 }
627 830
628 /* FIXME: Do this right please. --RR */
629 (*pskb)->nfcache |= NFC_UNKNOWN;
630
631/* Doesn't cover locally-generated broadcast, so not worth it. */ 831/* Doesn't cover locally-generated broadcast, so not worth it. */
632#if 0 832#if 0
633 /* Ignore broadcast: no `connection'. */ 833 /* Ignore broadcast: no `connection'. */
@@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
643 } 843 }
644#endif 844#endif
645 845
646 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); 846 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
647 847
648 /* It may be an special packet, error, unclean... 848 /* It may be an special packet, error, unclean...
649 * inverse of the return code tells to the netfilter 849 * inverse of the return code tells to the netfilter
@@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
679 return -ret; 879 return -ret;
680 } 880 }
681 881
682 if (set_reply) 882 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
683 set_bit(IPS_SEEN_REPLY_BIT, &ct->status); 883 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
684 884
685 return ret; 885 return ret;
686} 886}
@@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
689 const struct ip_conntrack_tuple *orig) 889 const struct ip_conntrack_tuple *orig)
690{ 890{
691 return ip_ct_invert_tuple(inverse, orig, 891 return ip_ct_invert_tuple(inverse, orig,
692 ip_ct_find_proto(orig->dst.protonum)); 892 __ip_conntrack_proto_find(orig->dst.protonum));
693} 893}
694 894
695/* Would two expected things clash? */ 895/* Would two expected things clash? */
@@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
769 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; 969 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
770 add_timer(&exp->timeout); 970 add_timer(&exp->timeout);
771 971
972 exp->id = ++ip_conntrack_expect_next_id;
973 atomic_inc(&exp->use);
772 CONNTRACK_STAT_INC(expect_create); 974 CONNTRACK_STAT_INC(expect_create);
773} 975}
774 976
@@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
827 evict_oldest_expect(expect->master); 1029 evict_oldest_expect(expect->master);
828 1030
829 ip_conntrack_expect_insert(expect); 1031 ip_conntrack_expect_insert(expect);
1032 ip_conntrack_expect_event(IPEXP_NEW, expect);
830 ret = 0; 1033 ret = 0;
831out: 1034out:
832 write_unlock_bh(&ip_conntrack_lock); 1035 write_unlock_bh(&ip_conntrack_lock);
@@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
847 1050
848 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 1051 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
849 if (!conntrack->master && conntrack->expecting == 0) 1052 if (!conntrack->master && conntrack->expecting == 0)
850 conntrack->helper = ip_ct_find_helper(newreply); 1053 conntrack->helper = __ip_conntrack_helper_find(newreply);
851 write_unlock_bh(&ip_conntrack_lock); 1054 write_unlock_bh(&ip_conntrack_lock);
852} 1055}
853 1056
@@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
861 return 0; 1064 return 0;
862} 1065}
863 1066
1067struct ip_conntrack_helper *
1068__ip_conntrack_helper_find_byname(const char *name)
1069{
1070 struct ip_conntrack_helper *h;
1071
1072 list_for_each_entry(h, &helpers, list) {
1073 if (!strcmp(h->name, name))
1074 return h;
1075 }
1076
1077 return NULL;
1078}
1079
864static inline int unhelp(struct ip_conntrack_tuple_hash *i, 1080static inline int unhelp(struct ip_conntrack_tuple_hash *i,
865 const struct ip_conntrack_helper *me) 1081 const struct ip_conntrack_helper *me)
866{ 1082{
867 if (tuplehash_to_ctrack(i)->helper == me) 1083 if (tuplehash_to_ctrack(i)->helper == me) {
1084 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
868 tuplehash_to_ctrack(i)->helper = NULL; 1085 tuplehash_to_ctrack(i)->helper = NULL;
1086 }
869 return 0; 1087 return 0;
870} 1088}
871 1089
@@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
927 if (del_timer(&ct->timeout)) { 1145 if (del_timer(&ct->timeout)) {
928 ct->timeout.expires = jiffies + extra_jiffies; 1146 ct->timeout.expires = jiffies + extra_jiffies;
929 add_timer(&ct->timeout); 1147 add_timer(&ct->timeout);
1148 ip_conntrack_event_cache(IPCT_REFRESH, skb);
930 } 1149 }
931 ct_add_counters(ct, ctinfo, skb); 1150 ct_add_counters(ct, ctinfo, skb);
932 write_unlock_bh(&ip_conntrack_lock); 1151 write_unlock_bh(&ip_conntrack_lock);
933 } 1152 }
934} 1153}
935 1154
1155#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1156 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1157/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1158 * in ip_conntrack_core, since we don't want the protocols to autoload
1159 * or depend on ctnetlink */
1160int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1161 const struct ip_conntrack_tuple *tuple)
1162{
1163 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1164 &tuple->src.u.tcp.port);
1165 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1166 &tuple->dst.u.tcp.port);
1167 return 0;
1168
1169nfattr_failure:
1170 return -1;
1171}
1172
1173int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1174 struct ip_conntrack_tuple *t)
1175{
1176 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1177 return -EINVAL;
1178
1179 t->src.u.tcp.port =
1180 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1181 t->dst.u.tcp.port =
1182 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1183
1184 return 0;
1185}
1186#endif
1187
936/* Returns new sk_buff, or NULL */ 1188/* Returns new sk_buff, or NULL */
937struct sk_buff * 1189struct sk_buff *
938ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) 1190ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
943 skb = ip_defrag(skb, user); 1195 skb = ip_defrag(skb, user);
944 local_bh_enable(); 1196 local_bh_enable();
945 1197
946 if (skb) { 1198 if (skb)
947 ip_send_check(skb->nh.iph); 1199 ip_send_check(skb->nh.iph);
948 skb->nfcache |= NFC_ALTERED;
949 }
950 return skb; 1200 return skb;
951} 1201}
952 1202
@@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void)
1096 * ip_conntrack_htable_size)); 1346 * ip_conntrack_htable_size));
1097} 1347}
1098 1348
1099/* Mishearing the voices in his head, our hero wonders how he's 1349void ip_conntrack_flush()
1100 supposed to kill the mall. */
1101void ip_conntrack_cleanup(void)
1102{ 1350{
1103 ip_ct_attach = NULL;
1104 /* This makes sure all current packets have passed through 1351 /* This makes sure all current packets have passed through
1105 netfilter framework. Roll on, two-stage module 1352 netfilter framework. Roll on, two-stage module
1106 delete... */ 1353 delete... */
1107 synchronize_net(); 1354 synchronize_net();
1108 1355
1356 ip_ct_event_cache_flush();
1109 i_see_dead_people: 1357 i_see_dead_people:
1110 ip_ct_iterate_cleanup(kill_all, NULL); 1358 ip_ct_iterate_cleanup(kill_all, NULL);
1111 if (atomic_read(&ip_conntrack_count) != 0) { 1359 if (atomic_read(&ip_conntrack_count) != 0) {
@@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void)
1115 /* wait until all references to ip_conntrack_untracked are dropped */ 1363 /* wait until all references to ip_conntrack_untracked are dropped */
1116 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) 1364 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1117 schedule(); 1365 schedule();
1366}
1118 1367
1368/* Mishearing the voices in his head, our hero wonders how he's
1369 supposed to kill the mall. */
1370void ip_conntrack_cleanup(void)
1371{
1372 ip_ct_attach = NULL;
1373 ip_conntrack_flush();
1119 kmem_cache_destroy(ip_conntrack_cachep); 1374 kmem_cache_destroy(ip_conntrack_cachep);
1120 kmem_cache_destroy(ip_conntrack_expect_cachep); 1375 kmem_cache_destroy(ip_conntrack_expect_cachep);
1121 free_conntrack_hash(); 1376 free_conntrack_hash();
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773be3f9..3a2627db1729 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp connection tracking helper"); 25MODULE_DESCRIPTION("ftp connection tracking helper");
26 26
27/* This is slow, but it's simple. --RR */ 27/* This is slow, but it's simple. --RR */
28static char ftp_buffer[65536]; 28static char *ftp_buffer;
29
30static DEFINE_SPINLOCK(ip_ftp_lock); 29static DEFINE_SPINLOCK(ip_ftp_lock);
31 30
32#define MAX_PORTS 8 31#define MAX_PORTS 8
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
262} 261}
263 262
264/* We don't update if it's older than what we have. */ 263/* We don't update if it's older than what we have. */
265static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) 264static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
265 struct sk_buff *skb)
266{ 266{
267 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; 267 unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
268 268
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
276 oldest = i; 276 oldest = i;
277 } 277 }
278 278
279 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) 279 if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
280 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; 280 info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
281 else if (oldest != NUM_SEQ_TO_REMEMBER) 281 ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
282 } else if (oldest != NUM_SEQ_TO_REMEMBER) {
282 info->seq_aft_nl[dir][oldest] = nl_seq; 283 info->seq_aft_nl[dir][oldest] = nl_seq;
284 ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
285 }
283} 286}
284 287
285static int help(struct sk_buff **pskb, 288static int help(struct sk_buff **pskb,
@@ -439,7 +442,7 @@ out_update_nl:
439 /* Now if this ends in \n, update ftp info. Seq may have been 442 /* Now if this ends in \n, update ftp info. Seq may have been
440 * adjusted by NAT code. */ 443 * adjusted by NAT code. */
441 if (ends_in_nl) 444 if (ends_in_nl)
442 update_nl_seq(seq, ct_ftp_info,dir); 445 update_nl_seq(seq, ct_ftp_info,dir, *pskb);
443 out: 446 out:
444 spin_unlock_bh(&ip_ftp_lock); 447 spin_unlock_bh(&ip_ftp_lock);
445 return ret; 448 return ret;
@@ -457,6 +460,8 @@ static void fini(void)
457 ports[i]); 460 ports[i]);
458 ip_conntrack_helper_unregister(&ftp[i]); 461 ip_conntrack_helper_unregister(&ftp[i]);
459 } 462 }
463
464 kfree(ftp_buffer);
460} 465}
461 466
462static int __init init(void) 467static int __init init(void)
@@ -464,6 +469,10 @@ static int __init init(void)
464 int i, ret; 469 int i, ret;
465 char *tmpname; 470 char *tmpname;
466 471
472 ftp_buffer = kmalloc(65536, GFP_KERNEL);
473 if (!ftp_buffer)
474 return -ENOMEM;
475
467 if (ports_c == 0) 476 if (ports_c == 0)
468 ports[ports_c++] = FTP_PORT; 477 ports[ports_c++] = FTP_PORT;
469 478
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 4a28f297d502..25438eec21a1 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -39,7 +39,7 @@ static int ports_c;
39static int max_dcc_channels = 8; 39static int max_dcc_channels = 8;
40static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
41/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
42static char irc_buffer[65536]; 42static char *irc_buffer;
43static DEFINE_SPINLOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
44 44
45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -257,6 +257,10 @@ static int __init init(void)
257 printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); 257 printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
258 return -EBUSY; 258 return -EBUSY;
259 } 259 }
260
261 irc_buffer = kmalloc(65536, GFP_KERNEL);
262 if (!irc_buffer)
263 return -ENOMEM;
260 264
261 /* If no port given, default to standard irc port */ 265 /* If no port given, default to standard irc port */
262 if (ports_c == 0) 266 if (ports_c == 0)
@@ -304,6 +308,7 @@ static void fini(void)
304 ports[i]); 308 ports[i]);
305 ip_conntrack_helper_unregister(&irc_helpers[i]); 309 ip_conntrack_helper_unregister(&irc_helpers[i]);
306 } 310 }
311 kfree(irc_buffer);
307} 312}
308 313
309module_init(init); 314module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 000000000000..a4e9278db4ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1579 @@
1/* Connection tracking via netlink socket. Allows for user space
2 * protocol helpers and general trouble making from userspace.
3 *
4 * (C) 2001 by Jay Schulist <jschlst@samba.org>
5 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
6 * (C) 2003 by Patrick Mchardy <kaber@trash.net>
7 * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
8 *
9 * I've reworked this stuff to use attributes instead of conntrack
10 * structures. 5.44 am. I need more tea. --pablo 05/07/11.
11 *
12 * Initial connection tracking via netlink development funded and
13 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
14 *
15 * Further development of this code funded by Astaro AG (http://www.astaro.com)
16 *
17 * This software may be used and distributed according to the terms
18 * of the GNU General Public License, incorporated herein by reference.
19 */
20
21#include <linux/init.h>
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/types.h>
25#include <linux/timer.h>
26#include <linux/skbuff.h>
27#include <linux/errno.h>
28#include <linux/netlink.h>
29#include <linux/spinlock.h>
30#include <linux/notifier.h>
31#include <linux/rtnetlink.h>
32
33#include <linux/netfilter.h>
34#include <linux/netfilter_ipv4.h>
35#include <linux/netfilter_ipv4/ip_tables.h>
36#include <linux/netfilter_ipv4/ip_conntrack.h>
37#include <linux/netfilter_ipv4/ip_conntrack_core.h>
38#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
39#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
40#include <linux/netfilter_ipv4/ip_nat_protocol.h>
41
42#include <linux/netfilter/nfnetlink.h>
43#include <linux/netfilter/nfnetlink_conntrack.h>
44
45MODULE_LICENSE("GPL");
46
47static char __initdata version[] = "0.90";
48
49#if 0
50#define DEBUGP printk
51#else
52#define DEBUGP(format, args...)
53#endif
54
55
56static inline int
57ctnetlink_dump_tuples_proto(struct sk_buff *skb,
58 const struct ip_conntrack_tuple *tuple)
59{
60 struct ip_conntrack_protocol *proto;
61
62 NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
63
64 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
65 if (proto && proto->tuple_to_nfattr)
66 return proto->tuple_to_nfattr(skb, tuple);
67
68 return 0;
69
70nfattr_failure:
71 return -1;
72}
73
74static inline int
75ctnetlink_dump_tuples(struct sk_buff *skb,
76 const struct ip_conntrack_tuple *tuple)
77{
78 struct nfattr *nest_parms;
79
80 nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
81 NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
82 NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
83 NFA_NEST_END(skb, nest_parms);
84
85 nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
86 ctnetlink_dump_tuples_proto(skb, tuple);
87 NFA_NEST_END(skb, nest_parms);
88
89 return 0;
90
91nfattr_failure:
92 return -1;
93}
94
95static inline int
96ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
97{
98 u_int32_t status = htonl((u_int32_t) ct->status);
99 NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
100 return 0;
101
102nfattr_failure:
103 return -1;
104}
105
106static inline int
107ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
108{
109 long timeout_l = ct->timeout.expires - jiffies;
110 u_int32_t timeout;
111
112 if (timeout_l < 0)
113 timeout = 0;
114 else
115 timeout = htonl(timeout_l / HZ);
116
117 NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
118 return 0;
119
120nfattr_failure:
121 return -1;
122}
123
124static inline int
125ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
126{
127 struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
128
129 struct nfattr *nest_proto;
130 int ret;
131
132 if (!proto || !proto->to_nfattr)
133 return 0;
134
135 nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
136
137 ret = proto->to_nfattr(skb, nest_proto, ct);
138
139 ip_conntrack_proto_put(proto);
140
141 NFA_NEST_END(skb, nest_proto);
142
143 return ret;
144
145nfattr_failure:
146 return -1;
147}
148
149static inline int
150ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
151{
152 struct nfattr *nest_helper;
153
154 if (!ct->helper)
155 return 0;
156
157 nest_helper = NFA_NEST(skb, CTA_HELP);
158 NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
159
160 if (ct->helper->to_nfattr)
161 ct->helper->to_nfattr(skb, ct);
162
163 NFA_NEST_END(skb, nest_helper);
164
165 return 0;
166
167nfattr_failure:
168 return -1;
169}
170
171#ifdef CONFIG_IP_NF_CT_ACCT
172static inline int
173ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
174 enum ip_conntrack_dir dir)
175{
176 enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
177 struct nfattr *nest_count = NFA_NEST(skb, type);
178 u_int64_t tmp;
179
180 tmp = cpu_to_be64(ct->counters[dir].packets);
181 NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp);
182
183 tmp = cpu_to_be64(ct->counters[dir].bytes);
184 NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp);
185
186 NFA_NEST_END(skb, nest_count);
187
188 return 0;
189
190nfattr_failure:
191 return -1;
192}
193#else
194#define ctnetlink_dump_counters(a, b, c) (0)
195#endif
196
197#ifdef CONFIG_IP_NF_CONNTRACK_MARK
198static inline int
199ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
200{
201 u_int32_t mark = htonl(ct->mark);
202
203 NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
204 return 0;
205
206nfattr_failure:
207 return -1;
208}
209#else
210#define ctnetlink_dump_mark(a, b) (0)
211#endif
212
213static inline int
214ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
215{
216 u_int32_t id = htonl(ct->id);
217 NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
218 return 0;
219
220nfattr_failure:
221 return -1;
222}
223
224static inline int
225ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
226{
227 unsigned int use = htonl(atomic_read(&ct->ct_general.use));
228
229 NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
230 return 0;
231
232nfattr_failure:
233 return -1;
234}
235
236#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
237
238static int
239ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
240 int event, int nowait,
241 const struct ip_conntrack *ct)
242{
243 struct nlmsghdr *nlh;
244 struct nfgenmsg *nfmsg;
245 struct nfattr *nest_parms;
246 unsigned char *b;
247
248 b = skb->tail;
249
250 event |= NFNL_SUBSYS_CTNETLINK << 8;
251 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
252 nfmsg = NLMSG_DATA(nlh);
253
254 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
255 nfmsg->nfgen_family = AF_INET;
256 nfmsg->version = NFNETLINK_V0;
257 nfmsg->res_id = 0;
258
259 nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
260 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
261 goto nfattr_failure;
262 NFA_NEST_END(skb, nest_parms);
263
264 nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
265 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
266 goto nfattr_failure;
267 NFA_NEST_END(skb, nest_parms);
268
269 if (ctnetlink_dump_status(skb, ct) < 0 ||
270 ctnetlink_dump_timeout(skb, ct) < 0 ||
271 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
272 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
273 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
274 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
275 ctnetlink_dump_mark(skb, ct) < 0 ||
276 ctnetlink_dump_id(skb, ct) < 0 ||
277 ctnetlink_dump_use(skb, ct) < 0)
278 goto nfattr_failure;
279
280 nlh->nlmsg_len = skb->tail - b;
281 return skb->len;
282
283nlmsg_failure:
284nfattr_failure:
285 skb_trim(skb, b - skb->data);
286 return -1;
287}
288
289#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
290static int ctnetlink_conntrack_event(struct notifier_block *this,
291 unsigned long events, void *ptr)
292{
293 struct nlmsghdr *nlh;
294 struct nfgenmsg *nfmsg;
295 struct nfattr *nest_parms;
296 struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
297 struct sk_buff *skb;
298 unsigned int type;
299 unsigned char *b;
300 unsigned int flags = 0, group;
301
302 /* ignore our fake conntrack entry */
303 if (ct == &ip_conntrack_untracked)
304 return NOTIFY_DONE;
305
306 if (events & IPCT_DESTROY) {
307 type = IPCTNL_MSG_CT_DELETE;
308 group = NFNLGRP_CONNTRACK_DESTROY;
309 goto alloc_skb;
310 }
311 if (events & (IPCT_NEW | IPCT_RELATED)) {
312 type = IPCTNL_MSG_CT_NEW;
313 flags = NLM_F_CREATE|NLM_F_EXCL;
314 /* dump everything */
315 events = ~0UL;
316 group = NFNLGRP_CONNTRACK_NEW;
317 goto alloc_skb;
318 }
319 if (events & (IPCT_STATUS |
320 IPCT_PROTOINFO |
321 IPCT_HELPER |
322 IPCT_HELPINFO |
323 IPCT_NATINFO)) {
324 type = IPCTNL_MSG_CT_NEW;
325 group = NFNLGRP_CONNTRACK_UPDATE;
326 goto alloc_skb;
327 }
328
329 return NOTIFY_DONE;
330
331alloc_skb:
332 /* FIXME: Check if there are any listeners before, don't hurt performance */
333
334 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
335 if (!skb)
336 return NOTIFY_DONE;
337
338 b = skb->tail;
339
340 type |= NFNL_SUBSYS_CTNETLINK << 8;
341 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
342 nfmsg = NLMSG_DATA(nlh);
343
344 nlh->nlmsg_flags = flags;
345 nfmsg->nfgen_family = AF_INET;
346 nfmsg->version = NFNETLINK_V0;
347 nfmsg->res_id = 0;
348
349 nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
350 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
351 goto nfattr_failure;
352 NFA_NEST_END(skb, nest_parms);
353
354 nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
355 if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
356 goto nfattr_failure;
357 NFA_NEST_END(skb, nest_parms);
358
359 /* NAT stuff is now a status flag */
360 if ((events & IPCT_STATUS || events & IPCT_NATINFO)
361 && ctnetlink_dump_status(skb, ct) < 0)
362 goto nfattr_failure;
363 if (events & IPCT_REFRESH
364 && ctnetlink_dump_timeout(skb, ct) < 0)
365 goto nfattr_failure;
366 if (events & IPCT_PROTOINFO
367 && ctnetlink_dump_protoinfo(skb, ct) < 0)
368 goto nfattr_failure;
369 if (events & IPCT_HELPINFO
370 && ctnetlink_dump_helpinfo(skb, ct) < 0)
371 goto nfattr_failure;
372
373 if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
374 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
375 goto nfattr_failure;
376
377 nlh->nlmsg_len = skb->tail - b;
378 nfnetlink_send(skb, 0, group, 0);
379 return NOTIFY_DONE;
380
381nlmsg_failure:
382nfattr_failure:
383 kfree_skb(skb);
384 return NOTIFY_DONE;
385}
386#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
387
/* Dump completion callback handed to netlink_dump_start(); only traces. */
static int ctnetlink_done(struct netlink_callback *cb)
{
	DEBUGP("entered %s\n", __FUNCTION__);

	return 0;
}
393
394static int
395ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
396{
397 struct ip_conntrack *ct = NULL;
398 struct ip_conntrack_tuple_hash *h;
399 struct list_head *i;
400 u_int32_t *id = (u_int32_t *) &cb->args[1];
401
402 DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
403 cb->args[0], *id);
404
405 read_lock_bh(&ip_conntrack_lock);
406 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
407 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
408 h = (struct ip_conntrack_tuple_hash *) i;
409 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
410 continue;
411 ct = tuplehash_to_ctrack(h);
412 if (ct->id <= *id)
413 continue;
414 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
415 cb->nlh->nlmsg_seq,
416 IPCTNL_MSG_CT_NEW,
417 1, ct) < 0)
418 goto out;
419 *id = ct->id;
420 }
421 }
422out:
423 read_unlock_bh(&ip_conntrack_lock);
424
425 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
426
427 return skb->len;
428}
429
430#ifdef CONFIG_IP_NF_CT_ACCT
431static int
432ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
433{
434 struct ip_conntrack *ct = NULL;
435 struct ip_conntrack_tuple_hash *h;
436 struct list_head *i;
437 u_int32_t *id = (u_int32_t *) &cb->args[1];
438
439 DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__,
440 cb->args[0], *id);
441
442 write_lock_bh(&ip_conntrack_lock);
443 for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
444 list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
445 h = (struct ip_conntrack_tuple_hash *) i;
446 if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
447 continue;
448 ct = tuplehash_to_ctrack(h);
449 if (ct->id <= *id)
450 continue;
451 if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
452 cb->nlh->nlmsg_seq,
453 IPCTNL_MSG_CT_NEW,
454 1, ct) < 0)
455 goto out;
456 *id = ct->id;
457
458 memset(&ct->counters, 0, sizeof(ct->counters));
459 }
460 }
461out:
462 write_unlock_bh(&ip_conntrack_lock);
463
464 DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
465
466 return skb->len;
467}
468#endif
469
470static const int cta_min_ip[CTA_IP_MAX] = {
471 [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
472 [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
473};
474
475static inline int
476ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
477{
478 struct nfattr *tb[CTA_IP_MAX];
479
480 DEBUGP("entered %s\n", __FUNCTION__);
481
482
483 if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
484 goto nfattr_failure;
485
486 if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
487 return -EINVAL;
488
489 if (!tb[CTA_IP_V4_SRC-1])
490 return -EINVAL;
491 tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
492
493 if (!tb[CTA_IP_V4_DST-1])
494 return -EINVAL;
495 tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
496
497 DEBUGP("leaving\n");
498
499 return 0;
500
501nfattr_failure:
502 return -1;
503}
504
505static const int cta_min_proto[CTA_PROTO_MAX] = {
506 [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
507 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
508 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
509 [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
510 [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
511 [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
512};
513
514static inline int
515ctnetlink_parse_tuple_proto(struct nfattr *attr,
516 struct ip_conntrack_tuple *tuple)
517{
518 struct nfattr *tb[CTA_PROTO_MAX];
519 struct ip_conntrack_protocol *proto;
520 int ret = 0;
521
522 DEBUGP("entered %s\n", __FUNCTION__);
523
524 if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
525 goto nfattr_failure;
526
527 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
528 return -EINVAL;
529
530 if (!tb[CTA_PROTO_NUM-1])
531 return -EINVAL;
532 tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
533
534 proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
535
536 if (likely(proto && proto->nfattr_to_tuple)) {
537 ret = proto->nfattr_to_tuple(tb, tuple);
538 ip_conntrack_proto_put(proto);
539 }
540
541 return ret;
542
543nfattr_failure:
544 return -1;
545}
546
547static inline int
548ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
549 enum ctattr_tuple type)
550{
551 struct nfattr *tb[CTA_TUPLE_MAX];
552 int err;
553
554 DEBUGP("entered %s\n", __FUNCTION__);
555
556 memset(tuple, 0, sizeof(*tuple));
557
558 if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
559 goto nfattr_failure;
560
561 if (!tb[CTA_TUPLE_IP-1])
562 return -EINVAL;
563
564 err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
565 if (err < 0)
566 return err;
567
568 if (!tb[CTA_TUPLE_PROTO-1])
569 return -EINVAL;
570
571 err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
572 if (err < 0)
573 return err;
574
575 /* orig and expect tuples get DIR_ORIGINAL */
576 if (type == CTA_TUPLE_REPLY)
577 tuple->dst.dir = IP_CT_DIR_REPLY;
578 else
579 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
580
581 DUMP_TUPLE(tuple);
582
583 DEBUGP("leaving\n");
584
585 return 0;
586
587nfattr_failure:
588 return -1;
589}
590
591#ifdef CONFIG_IP_NF_NAT_NEEDED
592static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
593 [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
594 [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
595};
596
597static int ctnetlink_parse_nat_proto(struct nfattr *attr,
598 const struct ip_conntrack *ct,
599 struct ip_nat_range *range)
600{
601 struct nfattr *tb[CTA_PROTONAT_MAX];
602 struct ip_nat_protocol *npt;
603
604 DEBUGP("entered %s\n", __FUNCTION__);
605
606 if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
607 goto nfattr_failure;
608
609 if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
610 goto nfattr_failure;
611
612 npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
613 if (!npt)
614 return 0;
615
616 if (!npt->nfattr_to_range) {
617 ip_nat_proto_put(npt);
618 return 0;
619 }
620
621 /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
622 if (npt->nfattr_to_range(tb, range) > 0)
623 range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
624
625 ip_nat_proto_put(npt);
626
627 DEBUGP("leaving\n");
628 return 0;
629
630nfattr_failure:
631 return -1;
632}
633
634static inline int
635ctnetlink_parse_nat(struct nfattr *cda[],
636 const struct ip_conntrack *ct, struct ip_nat_range *range)
637{
638 struct nfattr *tb[CTA_NAT_MAX];
639 int err;
640
641 DEBUGP("entered %s\n", __FUNCTION__);
642
643 memset(range, 0, sizeof(*range));
644
645 if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
646 goto nfattr_failure;
647
648 if (tb[CTA_NAT_MINIP-1])
649 range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
650
651 if (!tb[CTA_NAT_MAXIP-1])
652 range->max_ip = range->min_ip;
653 else
654 range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
655
656 if (range->min_ip)
657 range->flags |= IP_NAT_RANGE_MAP_IPS;
658
659 if (!tb[CTA_NAT_PROTO-1])
660 return 0;
661
662 err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
663 if (err < 0)
664 return err;
665
666 DEBUGP("leaving\n");
667 return 0;
668
669nfattr_failure:
670 return -1;
671}
672#endif
673
674static inline int
675ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
676{
677 struct nfattr *tb[CTA_HELP_MAX];
678
679 DEBUGP("entered %s\n", __FUNCTION__);
680
681 if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
682 goto nfattr_failure;
683
684 if (!tb[CTA_HELP_NAME-1])
685 return -EINVAL;
686
687 *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
688
689 return 0;
690
691nfattr_failure:
692 return -1;
693}
694
695static int
696ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
697 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
698{
699 struct ip_conntrack_tuple_hash *h;
700 struct ip_conntrack_tuple tuple;
701 struct ip_conntrack *ct;
702 int err = 0;
703
704 DEBUGP("entered %s\n", __FUNCTION__);
705
706 if (cda[CTA_TUPLE_ORIG-1])
707 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
708 else if (cda[CTA_TUPLE_REPLY-1])
709 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
710 else {
711 /* Flush the whole table */
712 ip_conntrack_flush();
713 return 0;
714 }
715
716 if (err < 0)
717 return err;
718
719 h = ip_conntrack_find_get(&tuple, NULL);
720 if (!h) {
721 DEBUGP("tuple not found in conntrack hash\n");
722 return -ENOENT;
723 }
724
725 ct = tuplehash_to_ctrack(h);
726
727 if (cda[CTA_ID-1]) {
728 u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
729 if (ct->id != id) {
730 ip_conntrack_put(ct);
731 return -ENOENT;
732 }
733 }
734 if (del_timer(&ct->timeout)) {
735 ip_conntrack_put(ct);
736 ct->timeout.function((unsigned long)ct);
737 return 0;
738 }
739 ip_conntrack_put(ct);
740 DEBUGP("leaving\n");
741
742 return 0;
743}
744
745static int
746ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
747 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
748{
749 struct ip_conntrack_tuple_hash *h;
750 struct ip_conntrack_tuple tuple;
751 struct ip_conntrack *ct;
752 struct sk_buff *skb2 = NULL;
753 int err = 0;
754
755 DEBUGP("entered %s\n", __FUNCTION__);
756
757 if (nlh->nlmsg_flags & NLM_F_DUMP) {
758 struct nfgenmsg *msg = NLMSG_DATA(nlh);
759 u32 rlen;
760
761 if (msg->nfgen_family != AF_INET)
762 return -EAFNOSUPPORT;
763
764 if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
765 IPCTNL_MSG_CT_GET_CTRZERO) {
766#ifdef CONFIG_IP_NF_CT_ACCT
767 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
768 ctnetlink_dump_table_w,
769 ctnetlink_done)) != 0)
770 return -EINVAL;
771#else
772 return -ENOTSUPP;
773#endif
774 } else {
775 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
776 ctnetlink_dump_table,
777 ctnetlink_done)) != 0)
778 return -EINVAL;
779 }
780
781 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
782 if (rlen > skb->len)
783 rlen = skb->len;
784 skb_pull(skb, rlen);
785 return 0;
786 }
787
788 if (cda[CTA_TUPLE_ORIG-1])
789 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
790 else if (cda[CTA_TUPLE_REPLY-1])
791 err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
792 else
793 return -EINVAL;
794
795 if (err < 0)
796 return err;
797
798 h = ip_conntrack_find_get(&tuple, NULL);
799 if (!h) {
800 DEBUGP("tuple not found in conntrack hash");
801 return -ENOENT;
802 }
803 DEBUGP("tuple found\n");
804 ct = tuplehash_to_ctrack(h);
805
806 err = -ENOMEM;
807 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
808 if (!skb2) {
809 ip_conntrack_put(ct);
810 return -ENOMEM;
811 }
812 NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
813
814 err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
815 IPCTNL_MSG_CT_NEW, 1, ct);
816 ip_conntrack_put(ct);
817 if (err <= 0)
818 goto out;
819
820 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
821 if (err < 0)
822 goto out;
823
824 DEBUGP("leaving\n");
825 return 0;
826
827out:
828 if (skb2)
829 kfree_skb(skb2);
830 return -1;
831}
832
833static inline int
834ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
835{
836 unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
837 d = ct->status ^ status;
838
839 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
840 /* unchangeable */
841 return -EINVAL;
842
843 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
844 /* SEEN_REPLY bit can only be set */
845 return -EINVAL;
846
847
848 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
849 /* ASSURED bit can only be set */
850 return -EINVAL;
851
852 if (cda[CTA_NAT-1]) {
853#ifndef CONFIG_IP_NF_NAT_NEEDED
854 return -EINVAL;
855#else
856 unsigned int hooknum;
857 struct ip_nat_range range;
858
859 if (ctnetlink_parse_nat(cda, ct, &range) < 0)
860 return -EINVAL;
861
862 DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
863 NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
864 htons(range.min.all), htons(range.max.all));
865
866 /* This is tricky but it works. ip_nat_setup_info needs the
867 * hook number as parameter, so let's do the correct
868 * conversion and run away */
869 if (status & IPS_SRC_NAT_DONE)
870 hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
871 else if (status & IPS_DST_NAT_DONE)
872 hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
873 else
874 return -EINVAL; /* Missing NAT flags */
875
876 DEBUGP("NAT status: %lu\n",
877 status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
878
879 if (ip_nat_initialized(ct, hooknum))
880 return -EEXIST;
881 ip_nat_setup_info(ct, &range, hooknum);
882
883 DEBUGP("NAT status after setup_info: %lu\n",
884 ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
885#endif
886 }
887
888 /* Be careful here, modifying NAT bits can screw up things,
889 * so don't let users modify them directly if they don't pass
890 * ip_nat_range. */
891 ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
892 return 0;
893}
894
895
896static inline int
897ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
898{
899 struct ip_conntrack_helper *helper;
900 char *helpname;
901 int err;
902
903 DEBUGP("entered %s\n", __FUNCTION__);
904
905 /* don't change helper of sibling connections */
906 if (ct->master)
907 return -EINVAL;
908
909 err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
910 if (err < 0)
911 return err;
912
913 helper = __ip_conntrack_helper_find_byname(helpname);
914 if (!helper) {
915 if (!strcmp(helpname, ""))
916 helper = NULL;
917 else
918 return -EINVAL;
919 }
920
921 if (ct->helper) {
922 if (!helper) {
923 /* we had a helper before ... */
924 ip_ct_remove_expectations(ct);
925 ct->helper = NULL;
926 } else {
927 /* need to zero data of old helper */
928 memset(&ct->help, 0, sizeof(ct->help));
929 }
930 }
931
932 ct->helper = helper;
933
934 return 0;
935}
936
937static inline int
938ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
939{
940 u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
941
942 if (!del_timer(&ct->timeout))
943 return -ETIME;
944
945 ct->timeout.expires = jiffies + timeout * HZ;
946 add_timer(&ct->timeout);
947
948 return 0;
949}
950
951static int
952ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
953{
954 int err;
955
956 DEBUGP("entered %s\n", __FUNCTION__);
957
958 if (cda[CTA_HELP-1]) {
959 err = ctnetlink_change_helper(ct, cda);
960 if (err < 0)
961 return err;
962 }
963
964 if (cda[CTA_TIMEOUT-1]) {
965 err = ctnetlink_change_timeout(ct, cda);
966 if (err < 0)
967 return err;
968 }
969
970 if (cda[CTA_STATUS-1]) {
971 err = ctnetlink_change_status(ct, cda);
972 if (err < 0)
973 return err;
974 }
975
976 DEBUGP("all done\n");
977 return 0;
978}
979
980static int
981ctnetlink_create_conntrack(struct nfattr *cda[],
982 struct ip_conntrack_tuple *otuple,
983 struct ip_conntrack_tuple *rtuple)
984{
985 struct ip_conntrack *ct;
986 int err = -EINVAL;
987
988 DEBUGP("entered %s\n", __FUNCTION__);
989
990 ct = ip_conntrack_alloc(otuple, rtuple);
991 if (ct == NULL || IS_ERR(ct))
992 return -ENOMEM;
993
994 if (!cda[CTA_TIMEOUT-1])
995 goto err;
996 ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
997
998 ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
999 ct->status |= IPS_CONFIRMED;
1000
1001 err = ctnetlink_change_status(ct, cda);
1002 if (err < 0)
1003 goto err;
1004
1005 ct->helper = ip_conntrack_helper_find_get(rtuple);
1006
1007 add_timer(&ct->timeout);
1008 ip_conntrack_hash_insert(ct);
1009
1010 if (ct->helper)
1011 ip_conntrack_helper_put(ct->helper);
1012
1013 DEBUGP("conntrack with id %u inserted\n", ct->id);
1014 return 0;
1015
1016err:
1017 ip_conntrack_free(ct);
1018 return err;
1019}
1020
1021static int
1022ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
1023 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1024{
1025 struct ip_conntrack_tuple otuple, rtuple;
1026 struct ip_conntrack_tuple_hash *h = NULL;
1027 int err = 0;
1028
1029 DEBUGP("entered %s\n", __FUNCTION__);
1030
1031 if (cda[CTA_TUPLE_ORIG-1]) {
1032 err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
1033 if (err < 0)
1034 return err;
1035 }
1036
1037 if (cda[CTA_TUPLE_REPLY-1]) {
1038 err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
1039 if (err < 0)
1040 return err;
1041 }
1042
1043 write_lock_bh(&ip_conntrack_lock);
1044 if (cda[CTA_TUPLE_ORIG-1])
1045 h = __ip_conntrack_find(&otuple, NULL);
1046 else if (cda[CTA_TUPLE_REPLY-1])
1047 h = __ip_conntrack_find(&rtuple, NULL);
1048
1049 if (h == NULL) {
1050 write_unlock_bh(&ip_conntrack_lock);
1051 DEBUGP("no such conntrack, create new\n");
1052 err = -ENOENT;
1053 if (nlh->nlmsg_flags & NLM_F_CREATE)
1054 err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
1055 return err;
1056 }
1057 /* implicit 'else' */
1058
1059 /* we only allow nat config for new conntracks */
1060 if (cda[CTA_NAT-1]) {
1061 err = -EINVAL;
1062 goto out_unlock;
1063 }
1064
1065 /* We manipulate the conntrack inside the global conntrack table lock,
1066 * so there's no need to increase the refcount */
1067 DEBUGP("conntrack found\n");
1068 err = -EEXIST;
1069 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1070 err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
1071
1072out_unlock:
1073 write_unlock_bh(&ip_conntrack_lock);
1074 return err;
1075}
1076
1077/***********************************************************************
1078 * EXPECT
1079 ***********************************************************************/
1080
1081static inline int
1082ctnetlink_exp_dump_tuple(struct sk_buff *skb,
1083 const struct ip_conntrack_tuple *tuple,
1084 enum ctattr_expect type)
1085{
1086 struct nfattr *nest_parms = NFA_NEST(skb, type);
1087
1088 if (ctnetlink_dump_tuples(skb, tuple) < 0)
1089 goto nfattr_failure;
1090
1091 NFA_NEST_END(skb, nest_parms);
1092
1093 return 0;
1094
1095nfattr_failure:
1096 return -1;
1097}
1098
1099static inline int
1100ctnetlink_exp_dump_expect(struct sk_buff *skb,
1101 const struct ip_conntrack_expect *exp)
1102{
1103 struct ip_conntrack *master = exp->master;
1104 u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
1105 u_int32_t id = htonl(exp->id);
1106
1107 if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
1108 goto nfattr_failure;
1109 if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
1110 goto nfattr_failure;
1111 if (ctnetlink_exp_dump_tuple(skb,
1112 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1113 CTA_EXPECT_MASTER) < 0)
1114 goto nfattr_failure;
1115
1116 NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
1117 NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
1118
1119 return 0;
1120
1121nfattr_failure:
1122 return -1;
1123}
1124
1125static int
1126ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
1127 int event,
1128 int nowait,
1129 const struct ip_conntrack_expect *exp)
1130{
1131 struct nlmsghdr *nlh;
1132 struct nfgenmsg *nfmsg;
1133 unsigned char *b;
1134
1135 b = skb->tail;
1136
1137 event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
1138 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
1139 nfmsg = NLMSG_DATA(nlh);
1140
1141 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1142 nfmsg->nfgen_family = AF_INET;
1143 nfmsg->version = NFNETLINK_V0;
1144 nfmsg->res_id = 0;
1145
1146 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1147 goto nfattr_failure;
1148
1149 nlh->nlmsg_len = skb->tail - b;
1150 return skb->len;
1151
1152nlmsg_failure:
1153nfattr_failure:
1154 skb_trim(skb, b - skb->data);
1155 return -1;
1156}
1157
1158#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1159static int ctnetlink_expect_event(struct notifier_block *this,
1160 unsigned long events, void *ptr)
1161{
1162 struct nlmsghdr *nlh;
1163 struct nfgenmsg *nfmsg;
1164 struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
1165 struct sk_buff *skb;
1166 unsigned int type;
1167 unsigned char *b;
1168 int flags = 0;
1169 u16 proto;
1170
1171 if (events & IPEXP_NEW) {
1172 type = IPCTNL_MSG_EXP_NEW;
1173 flags = NLM_F_CREATE|NLM_F_EXCL;
1174 } else
1175 return NOTIFY_DONE;
1176
1177 skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
1178 if (!skb)
1179 return NOTIFY_DONE;
1180
1181 b = skb->tail;
1182
1183 type |= NFNL_SUBSYS_CTNETLINK << 8;
1184 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
1185 nfmsg = NLMSG_DATA(nlh);
1186
1187 nlh->nlmsg_flags = flags;
1188 nfmsg->nfgen_family = AF_INET;
1189 nfmsg->version = NFNETLINK_V0;
1190 nfmsg->res_id = 0;
1191
1192 if (ctnetlink_exp_dump_expect(skb, exp) < 0)
1193 goto nfattr_failure;
1194
1195 nlh->nlmsg_len = skb->tail - b;
1196 proto = exp->tuple.dst.protonum;
1197 nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
1198 return NOTIFY_DONE;
1199
1200nlmsg_failure:
1201nfattr_failure:
1202 kfree_skb(skb);
1203 return NOTIFY_DONE;
1204}
1205#endif
1206
1207static int
1208ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
1209{
1210 struct ip_conntrack_expect *exp = NULL;
1211 struct list_head *i;
1212 u_int32_t *id = (u_int32_t *) &cb->args[0];
1213
1214 DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
1215
1216 read_lock_bh(&ip_conntrack_lock);
1217 list_for_each_prev(i, &ip_conntrack_expect_list) {
1218 exp = (struct ip_conntrack_expect *) i;
1219 if (exp->id <= *id)
1220 continue;
1221 if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
1222 cb->nlh->nlmsg_seq,
1223 IPCTNL_MSG_EXP_NEW,
1224 1, exp) < 0)
1225 goto out;
1226 *id = exp->id;
1227 }
1228out:
1229 read_unlock_bh(&ip_conntrack_lock);
1230
1231 DEBUGP("leaving, last id=%llu\n", *id);
1232
1233 return skb->len;
1234}
1235
1236static int
1237ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
1238 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1239{
1240 struct ip_conntrack_tuple tuple;
1241 struct ip_conntrack_expect *exp;
1242 struct sk_buff *skb2;
1243 int err = 0;
1244
1245 DEBUGP("entered %s\n", __FUNCTION__);
1246
1247 if (nlh->nlmsg_flags & NLM_F_DUMP) {
1248 struct nfgenmsg *msg = NLMSG_DATA(nlh);
1249 u32 rlen;
1250
1251 if (msg->nfgen_family != AF_INET)
1252 return -EAFNOSUPPORT;
1253
1254 if ((*errp = netlink_dump_start(ctnl, skb, nlh,
1255 ctnetlink_exp_dump_table,
1256 ctnetlink_done)) != 0)
1257 return -EINVAL;
1258 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
1259 if (rlen > skb->len)
1260 rlen = skb->len;
1261 skb_pull(skb, rlen);
1262 return 0;
1263 }
1264
1265 if (cda[CTA_EXPECT_MASTER-1])
1266 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
1267 else
1268 return -EINVAL;
1269
1270 if (err < 0)
1271 return err;
1272
1273 exp = ip_conntrack_expect_find_get(&tuple);
1274 if (!exp)
1275 return -ENOENT;
1276
1277 err = -ENOMEM;
1278 skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1279 if (!skb2)
1280 goto out;
1281 NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
1282
1283 err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
1284 nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
1285 1, exp);
1286 if (err <= 0)
1287 goto out;
1288
1289 ip_conntrack_expect_put(exp);
1290
1291 err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
1292 if (err < 0)
1293 goto free;
1294
1295 return err;
1296
1297out:
1298 ip_conntrack_expect_put(exp);
1299free:
1300 if (skb2)
1301 kfree_skb(skb2);
1302 return err;
1303}
1304
1305static int
1306ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
1307 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1308{
1309 struct ip_conntrack_expect *exp, *tmp;
1310 struct ip_conntrack_tuple tuple;
1311 struct ip_conntrack_helper *h;
1312 int err;
1313
1314 if (cda[CTA_EXPECT_TUPLE-1]) {
1315 /* delete a single expect by tuple */
1316 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1317 if (err < 0)
1318 return err;
1319
1320 /* bump usage count to 2 */
1321 exp = ip_conntrack_expect_find_get(&tuple);
1322 if (!exp)
1323 return -ENOENT;
1324
1325 if (cda[CTA_EXPECT_ID-1]) {
1326 u_int32_t id =
1327 *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
1328 if (exp->id != ntohl(id)) {
1329 ip_conntrack_expect_put(exp);
1330 return -ENOENT;
1331 }
1332 }
1333
1334 /* after list removal, usage count == 1 */
1335 ip_conntrack_unexpect_related(exp);
1336 /* have to put what we 'get' above.
1337 * after this line usage count == 0 */
1338 ip_conntrack_expect_put(exp);
1339 } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
1340 char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
1341
1342 /* delete all expectations for this helper */
1343 write_lock_bh(&ip_conntrack_lock);
1344 h = __ip_conntrack_helper_find_byname(name);
1345 if (!h) {
1346 write_unlock_bh(&ip_conntrack_lock);
1347 return -EINVAL;
1348 }
1349 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1350 list) {
1351 if (exp->master->helper == h
1352 && del_timer(&exp->timeout))
1353 __ip_ct_expect_unlink_destroy(exp);
1354 }
1355 write_unlock(&ip_conntrack_lock);
1356 } else {
1357 /* This basically means we have to flush everything*/
1358 write_lock_bh(&ip_conntrack_lock);
1359 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
1360 list) {
1361 if (del_timer(&exp->timeout))
1362 __ip_ct_expect_unlink_destroy(exp);
1363 }
1364 write_unlock_bh(&ip_conntrack_lock);
1365 }
1366
1367 return 0;
1368}
/*
 * Modify an existing expectation. Not implemented: both arguments are
 * ignored and -EOPNOTSUPP is always returned.
 */
static int
ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
{
	return -EOPNOTSUPP;
}
1374
1375static int
1376ctnetlink_create_expect(struct nfattr *cda[])
1377{
1378 struct ip_conntrack_tuple tuple, mask, master_tuple;
1379 struct ip_conntrack_tuple_hash *h = NULL;
1380 struct ip_conntrack_expect *exp;
1381 struct ip_conntrack *ct;
1382 int err = 0;
1383
1384 DEBUGP("entered %s\n", __FUNCTION__);
1385
1386 /* caller guarantees that those three CTA_EXPECT_* exist */
1387 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1388 if (err < 0)
1389 return err;
1390 err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
1391 if (err < 0)
1392 return err;
1393 err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
1394 if (err < 0)
1395 return err;
1396
1397 /* Look for master conntrack of this expectation */
1398 h = ip_conntrack_find_get(&master_tuple, NULL);
1399 if (!h)
1400 return -ENOENT;
1401 ct = tuplehash_to_ctrack(h);
1402
1403 if (!ct->helper) {
1404 /* such conntrack hasn't got any helper, abort */
1405 err = -EINVAL;
1406 goto out;
1407 }
1408
1409 exp = ip_conntrack_expect_alloc(ct);
1410 if (!exp) {
1411 err = -ENOMEM;
1412 goto out;
1413 }
1414
1415 exp->expectfn = NULL;
1416 exp->master = ct;
1417 memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
1418 memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
1419
1420 err = ip_conntrack_expect_related(exp);
1421 ip_conntrack_expect_put(exp);
1422
1423out:
1424 ip_conntrack_put(tuplehash_to_ctrack(h));
1425 return err;
1426}
1427
1428static int
1429ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
1430 struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
1431{
1432 struct ip_conntrack_tuple tuple;
1433 struct ip_conntrack_expect *exp;
1434 int err = 0;
1435
1436 DEBUGP("entered %s\n", __FUNCTION__);
1437
1438 if (!cda[CTA_EXPECT_TUPLE-1]
1439 || !cda[CTA_EXPECT_MASK-1]
1440 || !cda[CTA_EXPECT_MASTER-1])
1441 return -EINVAL;
1442
1443 err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
1444 if (err < 0)
1445 return err;
1446
1447 write_lock_bh(&ip_conntrack_lock);
1448 exp = __ip_conntrack_expect_find(&tuple);
1449
1450 if (!exp) {
1451 write_unlock_bh(&ip_conntrack_lock);
1452 err = -ENOENT;
1453 if (nlh->nlmsg_flags & NLM_F_CREATE)
1454 err = ctnetlink_create_expect(cda);
1455 return err;
1456 }
1457
1458 err = -EEXIST;
1459 if (!(nlh->nlmsg_flags & NLM_F_EXCL))
1460 err = ctnetlink_change_expect(exp, cda);
1461 write_unlock_bh(&ip_conntrack_lock);
1462
1463 DEBUGP("leaving\n");
1464
1465 return err;
1466}
1467
1468#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
/* Notifier block whose callback is ctnetlink_conntrack_event(). */
static struct notifier_block ctnl_notifier = {
	.notifier_call = ctnetlink_conntrack_event,
};

/* Notifier block whose callback is ctnetlink_expect_event(). */
static struct notifier_block ctnl_notifier_exp = {
	.notifier_call = ctnetlink_expect_event,
};
1476#endif
1477
/*
 * Message handlers for the conntrack subsystem, indexed by message type.
 * Every entry accepts up to CTA_MAX attributes and lists CAP_NET_ADMIN
 * as the required capability.  IPCTNL_MSG_CT_GET_CTRZERO shares the
 * plain GET handler.
 */
static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_GET]		= { .call = ctnetlink_get_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_DELETE]		= { .call = ctnetlink_del_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_CT_GET_CTRZERO]	= { .call = ctnetlink_get_conntrack,
					    .attr_count = CTA_MAX,
					    .cap_required = CAP_NET_ADMIN },
};

/*
 * Message handlers for the expectation subsystem, indexed by message
 * type.  Every entry accepts up to CTA_EXPECT_MAX attributes and lists
 * CAP_NET_ADMIN as the required capability.
 */
static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
					    .attr_count = CTA_EXPECT_MAX,
					    .cap_required = CAP_NET_ADMIN },
};
1504
/* nfnetlink subsystem descriptor for conntrack messages (ctnl_cb). */
static struct nfnetlink_subsystem ctnl_subsys = {
	.name				= "conntrack",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
	.cb_count			= IPCTNL_MSG_MAX,
	.cb				= ctnl_cb,
};

/* nfnetlink subsystem descriptor for expectation messages (ctnl_exp_cb). */
static struct nfnetlink_subsystem ctnl_exp_subsys = {
	.name				= "conntrack_expect",
	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
	.cb_count			= IPCTNL_MSG_EXP_MAX,
	.cb				= ctnl_exp_cb,
};
1518
/*
 * Module initialisation: register both nfnetlink subsystems and, when
 * CONFIG_IP_NF_CONNTRACK_EVENTS is enabled, the conntrack and expectation
 * event notifiers.
 *
 * Returns 0 on success.  On failure, everything registered so far is
 * unregistered in reverse order and the failing call's negative return
 * value is passed back.
 */
static int __init ctnetlink_init(void)
{
	int ret;

	printk("ctnetlink v%s: registering with nfnetlink.\n", version);
	ret = nfnetlink_subsys_register(&ctnl_subsys);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register with nfnetlink.\n");
		goto err_out;
	}

	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
		goto err_unreg_subsys;
	}

#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
	ret = ip_conntrack_register_notifier(&ctnl_notifier);
	if (ret < 0) {
		printk("ctnetlink_init: cannot register notifier.\n");
		goto err_unreg_exp_subsys;
	}

	ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
	if (ret < 0) {
		printk("ctnetlink_init: cannot expect register notifier.\n");
		goto err_unreg_notifier;
	}
#endif

	return 0;

	/*
	 * Unwind ladder: each label undoes the step registered just before
	 * the failing one, then falls through to the earlier steps.  The
	 * first two labels are only referenced when events are enabled.
	 */
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
err_unreg_notifier:
	ip_conntrack_unregister_notifier(&ctnl_notifier);
err_unreg_exp_subsys:
	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
#endif
err_unreg_subsys:
	nfnetlink_subsys_unregister(&ctnl_subsys);
err_out:
	return ret;
}
1563
1564static void __exit ctnetlink_exit(void)
1565{
1566 printk("ctnetlink: unregistering from nfnetlink.\n");
1567
1568#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1569 ip_conntrack_unregister_notifier(&ctnl_notifier_exp);
1570 ip_conntrack_unregister_notifier(&ctnl_notifier);
1571#endif
1572
1573 nfnetlink_subsys_unregister(&ctnl_exp_subsys);
1574 nfnetlink_subsys_unregister(&ctnl_subsys);
1575 return;
1576}
1577
1578module_init(ctnetlink_init);
1579module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db3252..838d1d69b36e 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
102 ct->timeout.function((unsigned long)ct); 102 ct->timeout.function((unsigned long)ct);
103 } else { 103 } else {
104 atomic_inc(&ct->proto.icmp.count); 104 atomic_inc(&ct->proto.icmp.count);
105 ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
105 ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); 106 ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
106 } 107 }
107 108
108 return NF_ACCEPT; 109 return NF_ACCEPT;
109} 110}
110 111
112static u_int8_t valid_new[] = {
113 [ICMP_ECHO] = 1,
114 [ICMP_TIMESTAMP] = 1,
115 [ICMP_INFO_REQUEST] = 1,
116 [ICMP_ADDRESS] = 1
117};
118
111/* Called when a new connection for this protocol found. */ 119/* Called when a new connection for this protocol found. */
112static int icmp_new(struct ip_conntrack *conntrack, 120static int icmp_new(struct ip_conntrack *conntrack,
113 const struct sk_buff *skb) 121 const struct sk_buff *skb)
114{ 122{
115 static u_int8_t valid_new[]
116 = { [ICMP_ECHO] = 1,
117 [ICMP_TIMESTAMP] = 1,
118 [ICMP_INFO_REQUEST] = 1,
119 [ICMP_ADDRESS] = 1 };
120
121 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) 123 if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
122 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { 124 || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
123 /* Can't create a new ICMP `conn' with this. */ 125 /* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
158 return NF_ACCEPT; 160 return NF_ACCEPT;
159 } 161 }
160 162
161 innerproto = ip_ct_find_proto(inside->ip.protocol); 163 innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
162 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; 164 dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
163 /* Are they talking about one of our connections? */ 165 /* Are they talking about one of our connections? */
164 if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { 166 if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
165 DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); 167 DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
168 ip_conntrack_proto_put(innerproto);
166 return NF_ACCEPT; 169 return NF_ACCEPT;
167 } 170 }
168 171
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
170 been preserved inside the ICMP. */ 173 been preserved inside the ICMP. */
171 if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { 174 if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
172 DEBUGP("icmp_error_track: Can't invert tuple\n"); 175 DEBUGP("icmp_error_track: Can't invert tuple\n");
176 ip_conntrack_proto_put(innerproto);
173 return NF_ACCEPT; 177 return NF_ACCEPT;
174 } 178 }
179 ip_conntrack_proto_put(innerproto);
175 180
176 *ctinfo = IP_CT_RELATED; 181 *ctinfo = IP_CT_RELATED;
177 182
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
212 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); 217 icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
213 if (icmph == NULL) { 218 if (icmph == NULL) {
214 if (LOG_INVALID(IPPROTO_ICMP)) 219 if (LOG_INVALID(IPPROTO_ICMP))
215 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 220 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
216 "ip_ct_icmp: short packet "); 221 "ip_ct_icmp: short packet ");
217 return -NF_ACCEPT; 222 return -NF_ACCEPT;
218 } 223 }
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
226 if (!(u16)csum_fold(skb->csum)) 231 if (!(u16)csum_fold(skb->csum))
227 break; 232 break;
228 if (LOG_INVALID(IPPROTO_ICMP)) 233 if (LOG_INVALID(IPPROTO_ICMP))
229 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 234 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
230 "ip_ct_icmp: bad HW ICMP checksum "); 235 "ip_ct_icmp: bad HW ICMP checksum ");
231 return -NF_ACCEPT; 236 return -NF_ACCEPT;
232 case CHECKSUM_NONE: 237 case CHECKSUM_NONE:
233 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { 238 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
234 if (LOG_INVALID(IPPROTO_ICMP)) 239 if (LOG_INVALID(IPPROTO_ICMP))
235 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 240 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
236 "ip_ct_icmp: bad ICMP checksum "); 241 "ip_ct_icmp: bad ICMP checksum ");
237 return -NF_ACCEPT; 242 return -NF_ACCEPT;
238 } 243 }
@@ -249,7 +254,7 @@ checksum_skipped:
249 */ 254 */
250 if (icmph->type > NR_ICMP_TYPES) { 255 if (icmph->type > NR_ICMP_TYPES) {
251 if (LOG_INVALID(IPPROTO_ICMP)) 256 if (LOG_INVALID(IPPROTO_ICMP))
252 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 257 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
253 "ip_ct_icmp: invalid ICMP type "); 258 "ip_ct_icmp: invalid ICMP type ");
254 return -NF_ACCEPT; 259 return -NF_ACCEPT;
255 } 260 }
@@ -265,6 +270,47 @@ checksum_skipped:
265 return icmp_error_message(skb, ctinfo, hooknum); 270 return icmp_error_message(skb, ctinfo, hooknum);
266} 271}
267 272
273#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
274 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/*
 * Append the ICMP-specific parts of tuple @t to @skb as netlink
 * attributes: the 16-bit id from the source side, and the 8-bit type and
 * code from the destination side.
 *
 * Returns 0 on success, -EINVAL if the type is not one flagged in
 * valid_new[], or -1 via the nfattr_failure label.
 */
static int icmp_tuple_to_nfattr(struct sk_buff *skb,
				const struct ip_conntrack_tuple *t)
{
	NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
		&t->src.u.icmp.id);
	NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
		&t->dst.u.icmp.type);
	NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
		&t->dst.u.icmp.code);

	/*
	 * NOTE(review): this check runs after the three attributes have
	 * already been appended, and it rejects every type not flagged in
	 * valid_new[] -- confirm that is intended for tuples being dumped.
	 */
	if (t->dst.u.icmp.type >= sizeof(valid_new)
	    || !valid_new[t->dst.u.icmp.type])
		return -EINVAL;

	return 0;

	/* Presumably the failure target of NFA_PUT -- nothing else jumps here. */
nfattr_failure:
	return -1;
}
294
295static int icmp_nfattr_to_tuple(struct nfattr *tb[],
296 struct ip_conntrack_tuple *tuple)
297{
298 if (!tb[CTA_PROTO_ICMP_TYPE-1]
299 || !tb[CTA_PROTO_ICMP_CODE-1]
300 || !tb[CTA_PROTO_ICMP_ID-1])
301 return -1;
302
303 tuple->dst.u.icmp.type =
304 *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
305 tuple->dst.u.icmp.code =
306 *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
307 tuple->src.u.icmp.id =
308 *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
309
310 return 0;
311}
312#endif
313
268struct ip_conntrack_protocol ip_conntrack_protocol_icmp = 314struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
269{ 315{
270 .proto = IPPROTO_ICMP, 316 .proto = IPPROTO_ICMP,
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
276 .packet = icmp_packet, 322 .packet = icmp_packet,
277 .new = icmp_new, 323 .new = icmp_new,
278 .error = icmp_error, 324 .error = icmp_error,
325#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
326 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
327 .tuple_to_nfattr = icmp_tuple_to_nfattr,
328 .nfattr_to_tuple = icmp_nfattr_to_tuple,
329#endif
279}; 330};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf12..a875f35e576d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
404 } 404 }
405 405
406 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
407 if (oldsctpstate != newconntrack)
408 ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
407 write_unlock_bh(&sctp_lock); 409 write_unlock_bh(&sctp_lock);
408 } 410 }
409 411
@@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
503 .packet = sctp_packet, 505 .packet = sctp_packet,
504 .new = sctp_new, 506 .new = sctp_new,
505 .destroy = NULL, 507 .destroy = NULL,
506 .me = THIS_MODULE 508 .me = THIS_MODULE,
509#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
510 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
511 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
512 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
513#endif
507}; 514};
508 515
509#ifdef CONFIG_SYSCTL 516#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d4..f23ef1f88c46 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s,
336 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
337} 337}
338 338
339#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
340 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
341static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
342 const struct ip_conntrack *ct)
343{
344 read_lock_bh(&tcp_lock);
345 NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
346 &ct->proto.tcp.state);
347 read_unlock_bh(&tcp_lock);
348
349 return 0;
350
351nfattr_failure:
352 return -1;
353}
354#endif
355
339static unsigned int get_conntrack_index(const struct tcphdr *tcph) 356static unsigned int get_conntrack_index(const struct tcphdr *tcph)
340{ 357{
341 if (tcph->rst) return TCP_RST_SET; 358 if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
699 res = 1; 716 res = 1;
700 } else { 717 } else {
701 if (LOG_INVALID(IPPROTO_TCP)) 718 if (LOG_INVALID(IPPROTO_TCP))
702 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 719 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
703 "ip_ct_tcp: %s ", 720 "ip_ct_tcp: %s ",
704 before(seq, sender->td_maxend + 1) ? 721 before(seq, sender->td_maxend + 1) ?
705 after(end, sender->td_end - receiver->td_maxwin - 1) ? 722 after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb,
798 sizeof(_tcph), &_tcph); 815 sizeof(_tcph), &_tcph);
799 if (th == NULL) { 816 if (th == NULL) {
800 if (LOG_INVALID(IPPROTO_TCP)) 817 if (LOG_INVALID(IPPROTO_TCP))
801 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 818 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
802 "ip_ct_tcp: short packet "); 819 "ip_ct_tcp: short packet ");
803 return -NF_ACCEPT; 820 return -NF_ACCEPT;
804 } 821 }
@@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb,
806 /* Not whole TCP header or malformed packet */ 823 /* Not whole TCP header or malformed packet */
807 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { 824 if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
808 if (LOG_INVALID(IPPROTO_TCP)) 825 if (LOG_INVALID(IPPROTO_TCP))
809 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 826 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
810 "ip_ct_tcp: truncated/malformed packet "); 827 "ip_ct_tcp: truncated/malformed packet ");
811 return -NF_ACCEPT; 828 return -NF_ACCEPT;
812 } 829 }
@@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb,
823 skb->ip_summed == CHECKSUM_HW ? skb->csum 840 skb->ip_summed == CHECKSUM_HW ? skb->csum
824 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { 841 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
825 if (LOG_INVALID(IPPROTO_TCP)) 842 if (LOG_INVALID(IPPROTO_TCP))
826 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 843 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
827 "ip_ct_tcp: bad TCP checksum "); 844 "ip_ct_tcp: bad TCP checksum ");
828 return -NF_ACCEPT; 845 return -NF_ACCEPT;
829 } 846 }
@@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb,
832 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); 849 tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
833 if (!tcp_valid_flags[tcpflags]) { 850 if (!tcp_valid_flags[tcpflags]) {
834 if (LOG_INVALID(IPPROTO_TCP)) 851 if (LOG_INVALID(IPPROTO_TCP))
835 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 852 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
836 "ip_ct_tcp: invalid TCP flag combination "); 853 "ip_ct_tcp: invalid TCP flag combination ");
837 return -NF_ACCEPT; 854 return -NF_ACCEPT;
838 } 855 }
@@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
880 */ 897 */
881 write_unlock_bh(&tcp_lock); 898 write_unlock_bh(&tcp_lock);
882 if (LOG_INVALID(IPPROTO_TCP)) 899 if (LOG_INVALID(IPPROTO_TCP))
883 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 900 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
884 "ip_ct_tcp: killing out of sync session "); 901 NULL, "ip_ct_tcp: "
902 "killing out of sync session ");
885 if (del_timer(&conntrack->timeout)) 903 if (del_timer(&conntrack->timeout))
886 conntrack->timeout.function((unsigned long) 904 conntrack->timeout.function((unsigned long)
887 conntrack); 905 conntrack);
@@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
895 913
896 write_unlock_bh(&tcp_lock); 914 write_unlock_bh(&tcp_lock);
897 if (LOG_INVALID(IPPROTO_TCP)) 915 if (LOG_INVALID(IPPROTO_TCP))
898 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 916 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
899 "ip_ct_tcp: invalid packet ignored "); 917 "ip_ct_tcp: invalid packet ignored ");
900 return NF_ACCEPT; 918 return NF_ACCEPT;
901 case TCP_CONNTRACK_MAX: 919 case TCP_CONNTRACK_MAX:
@@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
905 old_state); 923 old_state);
906 write_unlock_bh(&tcp_lock); 924 write_unlock_bh(&tcp_lock);
907 if (LOG_INVALID(IPPROTO_TCP)) 925 if (LOG_INVALID(IPPROTO_TCP))
908 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 926 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
909 "ip_ct_tcp: invalid state "); 927 "ip_ct_tcp: invalid state ");
910 return -NF_ACCEPT; 928 return -NF_ACCEPT;
911 case TCP_CONNTRACK_SYN_SENT: 929 case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
926 write_unlock_bh(&tcp_lock); 944 write_unlock_bh(&tcp_lock);
927 if (LOG_INVALID(IPPROTO_TCP)) 945 if (LOG_INVALID(IPPROTO_TCP))
928 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 946 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
929 "ip_ct_tcp: invalid SYN"); 947 NULL, "ip_ct_tcp: invalid SYN");
930 return -NF_ACCEPT; 948 return -NF_ACCEPT;
931 } 949 }
932 case TCP_CONNTRACK_CLOSE: 950 case TCP_CONNTRACK_CLOSE:
@@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 991 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
974 write_unlock_bh(&tcp_lock); 992 write_unlock_bh(&tcp_lock);
975 993
994 ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
995 if (new_state != old_state)
996 ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
997
976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 998 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
977 /* If only reply is a RST, we can consider ourselves not to 999 /* If only reply is a RST, we can consider ourselves not to
978 have an established connection: this is a fairly common 1000 have an established connection: this is a fairly common
@@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
1096 .packet = tcp_packet, 1118 .packet = tcp_packet,
1097 .new = tcp_new, 1119 .new = tcp_new,
1098 .error = tcp_error, 1120 .error = tcp_error,
1121#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1122 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1123 .to_nfattr = tcp_to_nfattr,
1124 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
1125 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
1126#endif
1099}; 1127};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d4..f2dcac7c7660 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
73 ip_ct_refresh_acct(conntrack, ctinfo, skb, 73 ip_ct_refresh_acct(conntrack, ctinfo, skb,
74 ip_ct_udp_timeout_stream); 74 ip_ct_udp_timeout_stream);
75 /* Also, more likely to be important, and not a probe */ 75 /* Also, more likely to be important, and not a probe */
76 set_bit(IPS_ASSURED_BIT, &conntrack->status); 76 if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
77 ip_conntrack_event_cache(IPCT_STATUS, skb);
77 } else 78 } else
78 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); 79 ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
79 80
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
97 hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); 98 hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
98 if (hdr == NULL) { 99 if (hdr == NULL) {
99 if (LOG_INVALID(IPPROTO_UDP)) 100 if (LOG_INVALID(IPPROTO_UDP))
100 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 101 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
101 "ip_ct_udp: short packet "); 102 "ip_ct_udp: short packet ");
102 return -NF_ACCEPT; 103 return -NF_ACCEPT;
103 } 104 }
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
105 /* Truncated/malformed packets */ 106 /* Truncated/malformed packets */
106 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { 107 if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
107 if (LOG_INVALID(IPPROTO_UDP)) 108 if (LOG_INVALID(IPPROTO_UDP))
108 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 109 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
109 "ip_ct_udp: truncated/malformed packet "); 110 "ip_ct_udp: truncated/malformed packet ");
110 return -NF_ACCEPT; 111 return -NF_ACCEPT;
111 } 112 }
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
125 skb->ip_summed == CHECKSUM_HW ? skb->csum 126 skb->ip_summed == CHECKSUM_HW ? skb->csum
126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 127 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
127 if (LOG_INVALID(IPPROTO_UDP)) 128 if (LOG_INVALID(IPPROTO_UDP))
128 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 129 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
129 "ip_ct_udp: bad UDP checksum "); 130 "ip_ct_udp: bad UDP checksum ");
130 return -NF_ACCEPT; 131 return -NF_ACCEPT;
131 } 132 }
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
144 .packet = udp_packet, 145 .packet = udp_packet,
145 .new = udp_new, 146 .new = udp_new,
146 .error = udp_error, 147 .error = udp_error,
148#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
149 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
150 .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
151 .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
152#endif
147}; 153};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 61798c46e91d..ee5895afd0c3 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
5*/ 5*/
6 6
7/* (C) 1999-2001 Paul `Rusty' Russell 7/* (C) 1999-2001 Paul `Rusty' Russell
8 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as 11 * it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
147 if (DIRECTION(hash)) 147 if (DIRECTION(hash))
148 return 0; 148 return 0;
149 149
150 proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 150 proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
151 .tuple.dst.protonum);
152 IP_NF_ASSERT(proto); 151 IP_NF_ASSERT(proto);
153 152
154 if (seq_printf(s, "%-8s %u %ld ", 153 if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
185 return -ENOSPC; 184 return -ENOSPC;
186 185
187#if defined(CONFIG_IP_NF_CONNTRACK_MARK) 186#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
188 if (seq_printf(s, "mark=%lu ", conntrack->mark)) 187 if (seq_printf(s, "mark=%u ", conntrack->mark))
189 return -ENOSPC; 188 return -ENOSPC;
190#endif 189#endif
191 190
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
283 seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); 282 seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
284 283
285 print_tuple(s, &expect->tuple, 284 print_tuple(s, &expect->tuple,
286 ip_ct_find_proto(expect->tuple.dst.protonum)); 285 __ip_conntrack_proto_find(expect->tuple.dst.protonum));
287 return seq_putc(s, '\n'); 286 return seq_putc(s, '\n');
288} 287}
289 288
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init)
889 return ret; 888 return ret;
890 889
891 cleanup: 890 cleanup:
891 synchronize_net();
892#ifdef CONFIG_SYSCTL 892#ifdef CONFIG_SYSCTL
893 unregister_sysctl_table(ip_ct_sysctl_header); 893 unregister_sysctl_table(ip_ct_sysctl_header);
894 cleanup_localinops: 894 cleanup_localinops:
@@ -971,6 +971,14 @@ void need_ip_conntrack(void)
971{ 971{
972} 972}
973 973
974#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
975EXPORT_SYMBOL_GPL(ip_conntrack_chain);
976EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
977EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
978EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
979EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
980EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
981#endif
974EXPORT_SYMBOL(ip_conntrack_protocol_register); 982EXPORT_SYMBOL(ip_conntrack_protocol_register);
975EXPORT_SYMBOL(ip_conntrack_protocol_unregister); 983EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
976EXPORT_SYMBOL(ip_ct_get_tuple); 984EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register);
982EXPORT_SYMBOL(ip_conntrack_helper_unregister); 990EXPORT_SYMBOL(ip_conntrack_helper_unregister);
983EXPORT_SYMBOL(ip_ct_iterate_cleanup); 991EXPORT_SYMBOL(ip_ct_iterate_cleanup);
984EXPORT_SYMBOL(ip_ct_refresh_acct); 992EXPORT_SYMBOL(ip_ct_refresh_acct);
985EXPORT_SYMBOL(ip_ct_protos); 993
986EXPORT_SYMBOL(ip_ct_find_proto);
987EXPORT_SYMBOL(ip_conntrack_expect_alloc); 994EXPORT_SYMBOL(ip_conntrack_expect_alloc);
988EXPORT_SYMBOL(ip_conntrack_expect_put); 995EXPORT_SYMBOL(ip_conntrack_expect_put);
996EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
989EXPORT_SYMBOL(ip_conntrack_expect_related); 997EXPORT_SYMBOL(ip_conntrack_expect_related);
990EXPORT_SYMBOL(ip_conntrack_unexpect_related); 998EXPORT_SYMBOL(ip_conntrack_unexpect_related);
999EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
1000EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
1001EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy);
1002
991EXPORT_SYMBOL(ip_conntrack_tuple_taken); 1003EXPORT_SYMBOL(ip_conntrack_tuple_taken);
992EXPORT_SYMBOL(ip_ct_gather_frags); 1004EXPORT_SYMBOL(ip_ct_gather_frags);
993EXPORT_SYMBOL(ip_conntrack_htable_size); 1005EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
995EXPORT_SYMBOL(ip_conntrack_hash); 1007EXPORT_SYMBOL(ip_conntrack_hash);
996EXPORT_SYMBOL(ip_conntrack_untracked); 1008EXPORT_SYMBOL(ip_conntrack_untracked);
997EXPORT_SYMBOL_GPL(ip_conntrack_find_get); 1009EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
998EXPORT_SYMBOL_GPL(ip_conntrack_put);
999#ifdef CONFIG_IP_NF_NAT_NEEDED 1010#ifdef CONFIG_IP_NF_NAT_NEEDED
1000EXPORT_SYMBOL(ip_conntrack_tcp_update); 1011EXPORT_SYMBOL(ip_conntrack_tcp_update);
1001#endif 1012#endif
1013
1014EXPORT_SYMBOL_GPL(ip_conntrack_flush);
1015EXPORT_SYMBOL_GPL(__ip_conntrack_find);
1016
1017EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
1018EXPORT_SYMBOL_GPL(ip_conntrack_free);
1019EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
1020
1021EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
1022
1023EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
1024EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
1025EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
1026
1027EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
1028EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
1029EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
1030#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1031 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1032EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
1033EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
1034#endif
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c82..1adedb743f60 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock);
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
48 48
49static struct list_head *bysource; 49static struct list_head *bysource;
50
51#define MAX_IP_NAT_PROTO 256
50struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; 52struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
51 53
54static inline struct ip_nat_protocol *
55__ip_nat_proto_find(u_int8_t protonum)
56{
57 return ip_nat_protos[protonum];
58}
59
60struct ip_nat_protocol *
61ip_nat_proto_find_get(u_int8_t protonum)
62{
63 struct ip_nat_protocol *p;
64
65 /* we need to disable preemption to make sure 'p' doesn't get
66 * removed until we've grabbed the reference */
67 preempt_disable();
68 p = __ip_nat_proto_find(protonum);
69 if (p) {
70 if (!try_module_get(p->me))
71 p = &ip_nat_unknown_protocol;
72 }
73 preempt_enable();
74
75 return p;
76}
77
78void
79ip_nat_proto_put(struct ip_nat_protocol *p)
80{
81 module_put(p->me);
82}
52 83
53/* We keep an extra hash for each conntrack, for fast searching. */ 84/* We keep an extra hash for each conntrack, for fast searching. */
54static inline unsigned int 85static inline unsigned int
@@ -103,7 +134,8 @@ static int
103in_range(const struct ip_conntrack_tuple *tuple, 134in_range(const struct ip_conntrack_tuple *tuple,
104 const struct ip_nat_range *range) 135 const struct ip_nat_range *range)
105{ 136{
106 struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); 137 struct ip_nat_protocol *proto =
138 __ip_nat_proto_find(tuple->dst.protonum);
107 139
108 /* If we are supposed to map IPs, then we must be in the 140 /* If we are supposed to map IPs, then we must be in the
109 range specified, otherwise let this drag us onto a new src IP. */ 141 range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
216 struct ip_conntrack *conntrack, 248 struct ip_conntrack *conntrack,
217 enum ip_nat_manip_type maniptype) 249 enum ip_nat_manip_type maniptype)
218{ 250{
219 struct ip_nat_protocol *proto 251 struct ip_nat_protocol *proto;
220 = ip_nat_find_proto(orig_tuple->dst.protonum);
221 252
222 /* 1) If this srcip/proto/src-proto-part is currently mapped, 253 /* 1) If this srcip/proto/src-proto-part is currently mapped,
223 and that same mapping gives a unique tuple within the given 254 and that same mapping gives a unique tuple within the given
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
242 /* 3) The per-protocol part of the manip is made to map into 273 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */ 274 the range to make a unique tuple. */
244 275
276 proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
277
245 /* Only bother mapping if it's not already in range and unique */ 278 /* Only bother mapping if it's not already in range and unique */
246 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) 279 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
247 || proto->in_range(tuple, maniptype, &range->min, &range->max)) 280 || proto->in_range(tuple, maniptype, &range->min, &range->max))
248 && !ip_nat_used_tuple(tuple, conntrack)) 281 && !ip_nat_used_tuple(tuple, conntrack)) {
282 ip_nat_proto_put(proto);
249 return; 283 return;
284 }
250 285
251 /* Last change: get protocol to try to obtain unique tuple. */ 286 /* Last change: get protocol to try to obtain unique tuple. */
252 proto->unique_tuple(tuple, range, maniptype, conntrack); 287 proto->unique_tuple(tuple, range, maniptype, conntrack);
288
289 ip_nat_proto_put(proto);
253} 290}
254 291
255unsigned int 292unsigned int
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto,
320 enum ip_nat_manip_type maniptype) 357 enum ip_nat_manip_type maniptype)
321{ 358{
322 struct iphdr *iph; 359 struct iphdr *iph;
360 struct ip_nat_protocol *p;
323 361
324 (*pskb)->nfcache |= NFC_ALTERED; 362 if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
325 if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
326 return 0; 363 return 0;
327 364
328 iph = (void *)(*pskb)->data + iphdroff; 365 iph = (void *)(*pskb)->data + iphdroff;
329 366
330 /* Manipulate protcol part. */ 367 /* Manipulate protcol part. */
331 if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, 368 p = ip_nat_proto_find_get(proto);
332 target, maniptype)) 369 if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
370 ip_nat_proto_put(p);
333 return 0; 371 return 0;
372 }
373 ip_nat_proto_put(p);
334 374
335 iph = (void *)(*pskb)->data + iphdroff; 375 iph = (void *)(*pskb)->data + iphdroff;
336 376
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
391 struct ip_conntrack_tuple inner, target; 431 struct ip_conntrack_tuple inner, target;
392 int hdrlen = (*pskb)->nh.iph->ihl * 4; 432 int hdrlen = (*pskb)->nh.iph->ihl * 4;
393 433
394 if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) 434 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
395 return 0; 435 return 0;
396 436
397 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 437 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
426 466
427 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + 467 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
428 sizeof(struct icmphdr) + inside->ip.ihl*4, 468 sizeof(struct icmphdr) + inside->ip.ihl*4,
429 &inner, ip_ct_find_proto(inside->ip.protocol))) 469 &inner,
470 __ip_conntrack_proto_find(inside->ip.protocol)))
430 return 0; 471 return 0;
431 472
432 /* Change inner back to look like incoming packet. We do the 473 /* Change inner back to look like incoming packet. We do the
@@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
496 synchronize_net(); 537 synchronize_net();
497} 538}
498 539
540#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
541 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
542int
543ip_nat_port_range_to_nfattr(struct sk_buff *skb,
544 const struct ip_nat_range *range)
545{
546 NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
547 &range->min.tcp.port);
548 NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
549 &range->max.tcp.port);
550
551 return 0;
552
553nfattr_failure:
554 return -1;
555}
556
557int
558ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
559{
560 int ret = 0;
561
562 /* we have to return whether we actually parsed something or not */
563
564 if (tb[CTA_PROTONAT_PORT_MIN-1]) {
565 ret = 1;
566 range->min.tcp.port =
567 *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
568 }
569
570 if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
571 if (ret)
572 range->max.tcp.port = range->min.tcp.port;
573 } else {
574 ret = 1;
575 range->max.tcp.port =
576 *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
577 }
578
579 return ret;
580}
581#endif
582
499int __init ip_nat_init(void) 583int __init ip_nat_init(void)
500{ 584{
501 size_t i; 585 size_t i;
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c04..d2dd5d313556 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
168 struct tcphdr *tcph; 168 struct tcphdr *tcph;
169 int datalen; 169 int datalen;
170 170
171 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 171 if (!skb_make_writable(pskb, (*pskb)->len))
172 return 0; 172 return 0;
173 173
174 if (rep_len > match_len 174 if (rep_len > match_len
@@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
228 match_offset + match_len) 228 match_offset + match_len)
229 return 0; 229 return 0;
230 230
231 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 231 if (!skb_make_writable(pskb, (*pskb)->len))
232 return 0; 232 return 0;
233 233
234 if (rep_len > match_len 234 if (rep_len > match_len
@@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
315 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); 315 optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
316 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; 316 optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
317 317
318 if (!skb_ip_make_writable(pskb, optend)) 318 if (!skb_make_writable(pskb, optend))
319 return 0; 319 return 0;
320 320
321 dir = CTINFO2DIR(ctinfo); 321 dir = CTINFO2DIR(ctinfo);
@@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
363 this_way = &ct->nat.info.seq[dir]; 363 this_way = &ct->nat.info.seq[dir];
364 other_way = &ct->nat.info.seq[!dir]; 364 other_way = &ct->nat.info.seq[!dir];
365 365
366 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 366 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
367 return 0; 367 return 0;
368 368
369 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; 369 tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 6596c9ee1655..938719043999 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
62 struct icmphdr *hdr; 62 struct icmphdr *hdr;
63 unsigned int hdroff = iphdroff + iph->ihl*4; 63 unsigned int hdroff = iphdroff + iph->ihl*4;
64 64
65 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) 65 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
66 return 0; 66 return 0;
67 67
68 hdr = (struct icmphdr *)((*pskb)->data + hdroff); 68 hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
106 else return 0; 106 else return 0;
107} 107}
108 108
109struct ip_nat_protocol ip_nat_protocol_icmp 109struct ip_nat_protocol ip_nat_protocol_icmp = {
110= { "ICMP", IPPROTO_ICMP, 110 .name = "ICMP",
111 icmp_manip_pkt, 111 .protonum = IPPROTO_ICMP,
112 icmp_in_range, 112 .me = THIS_MODULE,
113 icmp_unique_tuple, 113 .manip_pkt = icmp_manip_pkt,
114 icmp_print, 114 .in_range = icmp_in_range,
115 icmp_print_range 115 .unique_tuple = icmp_unique_tuple,
116 .print = icmp_print,
117 .print_range = icmp_print_range,
118#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
119 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
120 .range_to_nfattr = ip_nat_port_range_to_nfattr,
121 .nfattr_to_range = ip_nat_port_nfattr_to_range,
122#endif
116}; 123};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a98e36d2b3c6..1d381bf68574 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
12#include <linux/ip.h> 12#include <linux/ip.h>
13#include <linux/tcp.h> 13#include <linux/tcp.h>
14#include <linux/if.h> 14#include <linux/if.h>
15#include <linux/netfilter/nfnetlink_conntrack.h>
15#include <linux/netfilter_ipv4/ip_nat.h> 16#include <linux/netfilter_ipv4/ip_nat.h>
16#include <linux/netfilter_ipv4/ip_nat_rule.h> 17#include <linux/netfilter_ipv4/ip_nat_rule.h>
17#include <linux/netfilter_ipv4/ip_nat_protocol.h> 18#include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
102 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) 103 if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
103 hdrsize = sizeof(struct tcphdr); 104 hdrsize = sizeof(struct tcphdr);
104 105
105 if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) 106 if (!skb_make_writable(pskb, hdroff + hdrsize))
106 return 0; 107 return 0;
107 108
108 iph = (struct iphdr *)((*pskb)->data + iphdroff); 109 iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
169 else return 0; 170 else return 0;
170} 171}
171 172
172struct ip_nat_protocol ip_nat_protocol_tcp 173struct ip_nat_protocol ip_nat_protocol_tcp = {
173= { "TCP", IPPROTO_TCP, 174 .name = "TCP",
174 tcp_manip_pkt, 175 .protonum = IPPROTO_TCP,
175 tcp_in_range, 176 .me = THIS_MODULE,
176 tcp_unique_tuple, 177 .manip_pkt = tcp_manip_pkt,
177 tcp_print, 178 .in_range = tcp_in_range,
178 tcp_print_range 179 .unique_tuple = tcp_unique_tuple,
180 .print = tcp_print,
181 .print_range = tcp_print_range,
182#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
183 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
184 .range_to_nfattr = ip_nat_port_range_to_nfattr,
185 .nfattr_to_range = ip_nat_port_nfattr_to_range,
186#endif
179}; 187};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index 9f66e5625664..c4906e1aa24a 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
94 u32 oldip, newip; 94 u32 oldip, newip;
95 u16 *portptr, newport; 95 u16 *portptr, newport;
96 96
97 if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) 97 if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
98 return 0; 98 return 0;
99 99
100 iph = (struct iphdr *)((*pskb)->data + iphdroff); 100 iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
156 else return 0; 156 else return 0;
157} 157}
158 158
159struct ip_nat_protocol ip_nat_protocol_udp 159struct ip_nat_protocol ip_nat_protocol_udp = {
160= { "UDP", IPPROTO_UDP, 160 .name = "UDP",
161 udp_manip_pkt, 161 .protonum = IPPROTO_UDP,
162 udp_in_range, 162 .me = THIS_MODULE,
163 udp_unique_tuple, 163 .manip_pkt = udp_manip_pkt,
164 udp_print, 164 .in_range = udp_in_range,
165 udp_print_range 165 .unique_tuple = udp_unique_tuple,
166 .print = udp_print,
167 .print_range = udp_print_range,
168#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
169 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
170 .range_to_nfattr = ip_nat_port_range_to_nfattr,
171 .nfattr_to_range = ip_nat_port_nfattr_to_range,
172#endif
166}; 173};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d16..99bbef56f84e 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
61} 61}
62 62
63struct ip_nat_protocol ip_nat_unknown_protocol = { 63struct ip_nat_protocol ip_nat_unknown_protocol = {
64 "unknown", 0, 64 .name = "unknown",
65 unknown_manip_pkt, 65 .me = THIS_MODULE,
66 unknown_in_range, 66 .manip_pkt = unknown_manip_pkt,
67 unknown_unique_tuple, 67 .in_range = unknown_in_range,
68 unknown_print, 68 .unique_tuple = unknown_unique_tuple,
69 unknown_print_range 69 .print = unknown_print,
70 .print_range = unknown_print_range
70}; 71};
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635ae..93b2c5111bb2 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
1275 return NF_DROP; 1275 return NF_DROP;
1276 } 1276 }
1277 1277
1278 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 1278 if (!skb_make_writable(pskb, (*pskb)->len))
1279 return NF_DROP; 1279 return NF_DROP;
1280 1280
1281 spin_lock_bh(&snmp_lock); 1281 spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 91d5ea1dbbc9..89db052add81 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off 73 IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
74 & htons(IP_MF|IP_OFFSET))); 74 & htons(IP_MF|IP_OFFSET)));
75 75
76 (*pskb)->nfcache |= NFC_UNKNOWN;
77
78 /* If we had a hardware checksum before, it's now invalid */ 76 /* If we had a hardware checksum before, it's now invalid */
79 if ((*pskb)->ip_summed == CHECKSUM_HW) 77 if ((*pskb)->ip_summed == CHECKSUM_HW)
80 if (skb_checksum_help(*pskb, (out == NULL))) 78 if (skb_checksum_help(*pskb, (out == NULL)))
@@ -396,6 +394,8 @@ module_exit(fini);
396EXPORT_SYMBOL(ip_nat_setup_info); 394EXPORT_SYMBOL(ip_nat_setup_info);
397EXPORT_SYMBOL(ip_nat_protocol_register); 395EXPORT_SYMBOL(ip_nat_protocol_register);
398EXPORT_SYMBOL(ip_nat_protocol_unregister); 396EXPORT_SYMBOL(ip_nat_protocol_unregister);
397EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
398EXPORT_SYMBOL_GPL(ip_nat_proto_put);
399EXPORT_SYMBOL(ip_nat_cheat_check); 399EXPORT_SYMBOL(ip_nat_cheat_check);
400EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); 400EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
401EXPORT_SYMBOL(ip_nat_mangle_udp_packet); 401EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c6baa8174389..d54f14d926f6 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
43#define NET_IPQ_QMAX 2088 43#define NET_IPQ_QMAX 2088
44#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" 44#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
45 45
46struct ipq_rt_info {
47 __u8 tos;
48 __u32 daddr;
49 __u32 saddr;
50};
51
52struct ipq_queue_entry { 46struct ipq_queue_entry {
53 struct list_head list; 47 struct list_head list;
54 struct nf_info *info; 48 struct nf_info *info;
55 struct sk_buff *skb; 49 struct sk_buff *skb;
56 struct ipq_rt_info rt_info;
57}; 50};
58 51
59typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); 52typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
247 240
248 pmsg->packet_id = (unsigned long )entry; 241 pmsg->packet_id = (unsigned long )entry;
249 pmsg->data_len = data_len; 242 pmsg->data_len = data_len;
250 pmsg->timestamp_sec = entry->skb->stamp.tv_sec; 243 pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
251 pmsg->timestamp_usec = entry->skb->stamp.tv_usec; 244 pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
252 pmsg->mark = entry->skb->nfmark; 245 pmsg->mark = entry->skb->nfmark;
253 pmsg->hook = entry->info->hook; 246 pmsg->hook = entry->info->hook;
254 pmsg->hw_protocol = entry->skb->protocol; 247 pmsg->hw_protocol = entry->skb->protocol;
@@ -287,7 +280,8 @@ nlmsg_failure:
287} 280}
288 281
289static int 282static int
290ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) 283ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
284 unsigned int queuenum, void *data)
291{ 285{
292 int status = -EINVAL; 286 int status = -EINVAL;
293 struct sk_buff *nskb; 287 struct sk_buff *nskb;
@@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
305 entry->info = info; 299 entry->info = info;
306 entry->skb = skb; 300 entry->skb = skb;
307 301
308 if (entry->info->hook == NF_IP_LOCAL_OUT) {
309 struct iphdr *iph = skb->nh.iph;
310
311 entry->rt_info.tos = iph->tos;
312 entry->rt_info.daddr = iph->daddr;
313 entry->rt_info.saddr = iph->saddr;
314 }
315
316 nskb = ipq_build_packet_message(entry, &status); 302 nskb = ipq_build_packet_message(entry, &status);
317 if (nskb == NULL) 303 if (nskb == NULL)
318 goto err_out_free; 304 goto err_out_free;
@@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
388 } 374 }
389 skb_put(e->skb, diff); 375 skb_put(e->skb, diff);
390 } 376 }
391 if (!skb_ip_make_writable(&e->skb, v->data_len)) 377 if (!skb_make_writable(&e->skb, v->data_len))
392 return -ENOMEM; 378 return -ENOMEM;
393 memcpy(e->skb->data, v->payload, v->data_len); 379 memcpy(e->skb->data, v->payload, v->data_len);
394 e->skb->ip_summed = CHECKSUM_NONE; 380 e->skb->ip_summed = CHECKSUM_NONE;
395 e->skb->nfcache |= NFC_ALTERED; 381
396
397 /*
398 * Extra routing may needed on local out, as the QUEUE target never
399 * returns control to the table.
400 */
401 if (e->info->hook == NF_IP_LOCAL_OUT) {
402 struct iphdr *iph = e->skb->nh.iph;
403
404 if (!(iph->tos == e->rt_info.tos
405 && iph->daddr == e->rt_info.daddr
406 && iph->saddr == e->rt_info.saddr))
407 return ip_route_me_harder(&e->skb);
408 }
409 return 0; 382 return 0;
410} 383}
411 384
@@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
683} 656}
684#endif /* CONFIG_PROC_FS */ 657#endif /* CONFIG_PROC_FS */
685 658
659static struct nf_queue_handler nfqh = {
660 .name = "ip_queue",
661 .outfn = &ipq_enqueue_packet,
662};
663
686static int 664static int
687init_or_cleanup(int init) 665init_or_cleanup(int init)
688{ 666{
@@ -693,7 +671,8 @@ init_or_cleanup(int init)
693 goto cleanup; 671 goto cleanup;
694 672
695 netlink_register_notifier(&ipq_nl_notifier); 673 netlink_register_notifier(&ipq_nl_notifier);
696 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); 674 ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
675 THIS_MODULE);
697 if (ipqnl == NULL) { 676 if (ipqnl == NULL) {
698 printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); 677 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
699 goto cleanup_netlink_notifier; 678 goto cleanup_netlink_notifier;
@@ -710,7 +689,7 @@ init_or_cleanup(int init)
710 register_netdevice_notifier(&ipq_dev_notifier); 689 register_netdevice_notifier(&ipq_dev_notifier);
711 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); 690 ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
712 691
713 status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); 692 status = nf_register_queue_handler(PF_INET, &nfqh);
714 if (status < 0) { 693 if (status < 0) {
715 printk(KERN_ERR "ip_queue: failed to register queue handler\n"); 694 printk(KERN_ERR "ip_queue: failed to register queue handler\n");
716 goto cleanup_sysctl; 695 goto cleanup_sysctl;
@@ -718,7 +697,7 @@ init_or_cleanup(int init)
718 return status; 697 return status;
719 698
720cleanup: 699cleanup:
721 nf_unregister_queue_handler(PF_INET); 700 nf_unregister_queue_handlers(&nfqh);
722 synchronize_net(); 701 synchronize_net();
723 ipq_flush(NF_DROP); 702 ipq_flush(NF_DROP);
724 703
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c56..eef99a1b5de6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb,
312 do { 312 do {
313 IP_NF_ASSERT(e); 313 IP_NF_ASSERT(e);
314 IP_NF_ASSERT(back); 314 IP_NF_ASSERT(back);
315 (*pskb)->nfcache |= e->nfcache;
316 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { 315 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
317 struct ipt_entry_target *t; 316 struct ipt_entry_target *t;
318 317
@@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb,
341 back->comefrom); 340 back->comefrom);
342 continue; 341 continue;
343 } 342 }
344 if (table_base + v 343 if (table_base + v != (void *)e + e->next_offset
345 != (void *)e + e->next_offset) { 344 && !(e->ip.flags & IPT_F_GOTO)) {
346 /* Save old back ptr in next entry */ 345 /* Save old back ptr in next entry */
347 struct ipt_entry *next 346 struct ipt_entry *next
348 = (void *)e + e->next_offset; 347 = (void *)e + e->next_offset;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e23184..dab78d8bd494 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
32{ 32{
33 const struct ipt_classify_target_info *clinfo = targinfo; 33 const struct ipt_classify_target_info *clinfo = targinfo;
34 34
35 if((*pskb)->priority != clinfo->priority) { 35 if((*pskb)->priority != clinfo->priority)
36 (*pskb)->priority = clinfo->priority; 36 (*pskb)->priority = clinfo->priority;
37 (*pskb)->nfcache |= NFC_ALTERED;
38 }
39 37
40 return IPT_CONTINUE; 38 return IPT_CONTINUE;
41} 39}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4f..2d05cafec221 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,7 +367,7 @@ target(struct sk_buff **pskb,
367#ifdef DEBUG_CLUSTERP 367#ifdef DEBUG_CLUSTERP
368 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 368 DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
369#endif 369#endif
370 DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); 370 DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
371 if (!clusterip_responsible(cipinfo->config, hash)) { 371 if (!clusterip_responsible(cipinfo->config, hash)) {
372 DEBUGP("not responsible\n"); 372 DEBUGP("not responsible\n");
373 return NF_DROP; 373 return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb7..134638021339 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
40 void *userinfo) 40 void *userinfo)
41{ 41{
42 const struct ipt_connmark_target_info *markinfo = targinfo; 42 const struct ipt_connmark_target_info *markinfo = targinfo;
43 unsigned long diff; 43 u_int32_t diff;
44 unsigned long nfmark; 44 u_int32_t nfmark;
45 unsigned long newmark; 45 u_int32_t newmark;
46 46
47 enum ip_conntrack_info ctinfo; 47 enum ip_conntrack_info ctinfo;
48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); 48 struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
61 case IPT_CONNMARK_RESTORE: 61 case IPT_CONNMARK_RESTORE:
62 nfmark = (*pskb)->nfmark; 62 nfmark = (*pskb)->nfmark;
63 diff = (ct->mark ^ nfmark) & markinfo->mask; 63 diff = (ct->mark ^ nfmark) & markinfo->mask;
64 if (diff != 0) { 64 if (diff != 0)
65 (*pskb)->nfmark = nfmark ^ diff; 65 (*pskb)->nfmark = nfmark ^ diff;
66 (*pskb)->nfcache |= NFC_ALTERED;
67 }
68 break; 66 break;
69 } 67 }
70 } 68 }
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
94 } 92 }
95 } 93 }
96 94
95 if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
96 printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
97 return 0;
98 }
99
97 return 1; 100 return 1;
98} 101}
99 102
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f9..6e319570a28c 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { 39 if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
40 u_int16_t diffs[2]; 40 u_int16_t diffs[2];
41 41
42 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 42 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
43 return NF_DROP; 43 return NF_DROP;
44 44
45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 45 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
51 sizeof(diffs), 51 sizeof(diffs),
52 (*pskb)->nh.iph->check 52 (*pskb)->nh.iph->check
53 ^ 0xFFFF)); 53 ^ 0xFFFF));
54 (*pskb)->nfcache |= NFC_ALTERED;
55 } 54 }
56 return IPT_CONTINUE; 55 return IPT_CONTINUE;
57} 56}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 94a0ce1c1c9d..a1319693f648 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) { 31 != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
32 u_int16_t diffs[2]; 32 u_int16_t diffs[2];
33 33
34 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 34 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
35 return 0; 35 return 0;
36 36
37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 37 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
43 sizeof(diffs), 43 sizeof(diffs),
44 (*pskb)->nh.iph->check 44 (*pskb)->nh.iph->check
45 ^0xFFFF)); 45 ^0xFFFF));
46 (*pskb)->nfcache |= NFC_ALTERED;
47 } 46 }
48 return 1; 47 return 1;
49} 48}
@@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
67 tcph->cwr == einfo->proto.tcp.cwr))) 66 tcph->cwr == einfo->proto.tcp.cwr)))
68 return 1; 67 return 1;
69 68
70 if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) 69 if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
71 return 0; 70 return 0;
72 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; 71 tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
73 72
@@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
87 tcph->check = csum_fold(csum_partial((char *)diffs, 86 tcph->check = csum_fold(csum_partial((char *)diffs,
88 sizeof(diffs), 87 sizeof(diffs),
89 tcph->check^0xFFFF)); 88 tcph->check^0xFFFF));
90 (*pskb)->nfcache |= NFC_ALTERED;
91 return 1; 89 return 1;
92} 90}
93 91
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26da..92ed050fac69 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("iptables syslog logging module"); 28MODULE_DESCRIPTION("iptables syslog logging module");
29 29
30static unsigned int nflog = 1;
31module_param(nflog, int, 0400);
32MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
33
34#if 0 30#if 0
35#define DEBUGP printk 31#define DEBUGP printk
36#else 32#else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
41static DEFINE_SPINLOCK(log_lock); 37static DEFINE_SPINLOCK(log_lock);
42 38
43/* One level of recursion won't kill us */ 39/* One level of recursion won't kill us */
44static void dump_packet(const struct ipt_log_info *info, 40static void dump_packet(const struct nf_loginfo *info,
45 const struct sk_buff *skb, 41 const struct sk_buff *skb,
46 unsigned int iphoff) 42 unsigned int iphoff)
47{ 43{
48 struct iphdr _iph, *ih; 44 struct iphdr _iph, *ih;
45 unsigned int logflags;
46
47 if (info->type == NF_LOG_TYPE_LOG)
48 logflags = info->u.log.logflags;
49 else
50 logflags = NF_LOG_MASK;
49 51
50 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 52 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
51 if (ih == NULL) { 53 if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
76 if (ntohs(ih->frag_off) & IP_OFFSET) 78 if (ntohs(ih->frag_off) & IP_OFFSET)
77 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 79 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
78 80
79 if ((info->logflags & IPT_LOG_IPOPT) 81 if ((logflags & IPT_LOG_IPOPT)
80 && ih->ihl * 4 > sizeof(struct iphdr)) { 82 && ih->ihl * 4 > sizeof(struct iphdr)) {
81 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; 83 unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
82 unsigned int i, optsize; 84 unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
119 printk("SPT=%u DPT=%u ", 121 printk("SPT=%u DPT=%u ",
120 ntohs(th->source), ntohs(th->dest)); 122 ntohs(th->source), ntohs(th->dest));
121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 123 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
122 if (info->logflags & IPT_LOG_TCPSEQ) 124 if (logflags & IPT_LOG_TCPSEQ)
123 printk("SEQ=%u ACK=%u ", 125 printk("SEQ=%u ACK=%u ",
124 ntohl(th->seq), ntohl(th->ack_seq)); 126 ntohl(th->seq), ntohl(th->ack_seq));
125 /* Max length: 13 "WINDOW=65535 " */ 127 /* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
146 /* Max length: 11 "URGP=65535 " */ 148 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr)); 149 printk("URGP=%u ", ntohs(th->urg_ptr));
148 150
149 if ((info->logflags & IPT_LOG_TCPOPT) 151 if ((logflags & IPT_LOG_TCPOPT)
150 && th->doff * 4 > sizeof(struct tcphdr)) { 152 && th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; 153 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 unsigned char *op; 154 unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
328 } 330 }
329 331
330 /* Max length: 15 "UID=4294967295 " */ 332 /* Max length: 15 "UID=4294967295 " */
331 if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 333 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
332 read_lock_bh(&skb->sk->sk_callback_lock); 334 read_lock_bh(&skb->sk->sk_callback_lock);
333 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 335 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
334 printk("UID=%u ", skb->sk->sk_socket->file->f_uid); 336 printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
349 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 351 /* maxlen = 230+ 91 + 230 + 252 = 803 */
350} 352}
351 353
354struct nf_loginfo default_loginfo = {
355 .type = NF_LOG_TYPE_LOG,
356 .u = {
357 .log = {
358 .level = 0,
359 .logflags = NF_LOG_MASK,
360 },
361 },
362};
363
352static void 364static void
353ipt_log_packet(unsigned int hooknum, 365ipt_log_packet(unsigned int pf,
366 unsigned int hooknum,
354 const struct sk_buff *skb, 367 const struct sk_buff *skb,
355 const struct net_device *in, 368 const struct net_device *in,
356 const struct net_device *out, 369 const struct net_device *out,
357 const struct ipt_log_info *loginfo, 370 const struct nf_loginfo *loginfo,
358 const char *level_string,
359 const char *prefix) 371 const char *prefix)
360{ 372{
373 if (!loginfo)
374 loginfo = &default_loginfo;
375
361 spin_lock_bh(&log_lock); 376 spin_lock_bh(&log_lock);
362 printk(level_string); 377 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
363 printk("%sIN=%s OUT=%s ", 378 prefix,
364 prefix == NULL ? loginfo->prefix : prefix,
365 in ? in->name : "", 379 in ? in->name : "",
366 out ? out->name : ""); 380 out ? out->name : "");
367#ifdef CONFIG_BRIDGE_NETFILTER 381#ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
405 void *userinfo) 419 void *userinfo)
406{ 420{
407 const struct ipt_log_info *loginfo = targinfo; 421 const struct ipt_log_info *loginfo = targinfo;
408 char level_string[4] = "< >"; 422 struct nf_loginfo li;
409 423
410 level_string[1] = '0' + (loginfo->level % 8); 424 li.type = NF_LOG_TYPE_LOG;
411 ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); 425 li.u.log.level = loginfo->level;
426 li.u.log.logflags = loginfo->logflags;
412 427
413 return IPT_CONTINUE; 428 nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
414}
415 429
416static void 430 return IPT_CONTINUE;
417ipt_logfn(unsigned int hooknum,
418 const struct sk_buff *skb,
419 const struct net_device *in,
420 const struct net_device *out,
421 const char *prefix)
422{
423 struct ipt_log_info loginfo = {
424 .level = 0,
425 .logflags = IPT_LOG_MASK,
426 .prefix = ""
427 };
428
429 ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
430} 431}
431 432
432static int ipt_log_checkentry(const char *tablename, 433static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
464 .me = THIS_MODULE, 465 .me = THIS_MODULE,
465}; 466};
466 467
468static struct nf_logger ipt_log_logger ={
469 .name = "ipt_LOG",
470 .logfn = &ipt_log_packet,
471 .me = THIS_MODULE,
472};
473
467static int __init init(void) 474static int __init init(void)
468{ 475{
469 if (ipt_register_target(&ipt_log_reg)) 476 if (ipt_register_target(&ipt_log_reg))
470 return -EINVAL; 477 return -EINVAL;
471 if (nflog) 478 if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
472 nf_log_register(PF_INET, &ipt_logfn); 479 printk(KERN_WARNING "ipt_LOG: not logging via system console "
480 "since somebody else already registered for PF_INET\n");
481 /* we cannot make module load fail here, since otherwise
482 * iptables userspace would abort */
483 }
473 484
474 return 0; 485 return 0;
475} 486}
476 487
477static void __exit fini(void) 488static void __exit fini(void)
478{ 489{
479 if (nflog) 490 nf_log_unregister_logger(&ipt_log_logger);
480 nf_log_unregister(PF_INET, &ipt_logfn);
481 ipt_unregister_target(&ipt_log_reg); 491 ipt_unregister_target(&ipt_log_reg);
482} 492}
483 493
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8d..52b4f2c296bf 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
29{ 29{
30 const struct ipt_mark_target_info *markinfo = targinfo; 30 const struct ipt_mark_target_info *markinfo = targinfo;
31 31
32 if((*pskb)->nfmark != markinfo->mark) { 32 if((*pskb)->nfmark != markinfo->mark)
33 (*pskb)->nfmark = markinfo->mark; 33 (*pskb)->nfmark = markinfo->mark;
34 (*pskb)->nfcache |= NFC_ALTERED; 34
35 }
36 return IPT_CONTINUE; 35 return IPT_CONTINUE;
37} 36}
38 37
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
61 break; 60 break;
62 } 61 }
63 62
64 if((*pskb)->nfmark != mark) { 63 if((*pskb)->nfmark != mark)
65 (*pskb)->nfmark = mark; 64 (*pskb)->nfmark = mark;
66 (*pskb)->nfcache |= NFC_ALTERED; 65
67 }
68 return IPT_CONTINUE; 66 return IPT_CONTINUE;
69} 67}
70 68
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
76 unsigned int targinfosize, 74 unsigned int targinfosize,
77 unsigned int hook_mask) 75 unsigned int hook_mask)
78{ 76{
77 struct ipt_mark_target_info *markinfo = targinfo;
78
79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { 79 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", 80 printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
81 targinfosize, 81 targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
88 return 0; 88 return 0;
89 } 89 }
90 90
91 if (markinfo->mark > 0xffffffff) {
92 printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
93 return 0;
94 }
95
91 return 1; 96 return 1;
92} 97}
93 98
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
120 return 0; 125 return 0;
121 } 126 }
122 127
128 if (markinfo->mark > 0xffffffff) {
129 printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
130 return 0;
131 }
132
123 return 1; 133 return 1;
124} 134}
125 135
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d3..2f3e181c8e97 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb,
86 86
87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); 87 IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
88 88
89 /* FIXME: For the moment, don't do local packets, breaks
90 testsuite for 2.3.49 --RR */
91 if ((*pskb)->sk)
92 return NF_ACCEPT;
93
94 ct = ip_conntrack_get(*pskb, &ctinfo); 89 ct = ip_conntrack_get(*pskb, &ctinfo);
95 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 90 IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
96 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 91 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d034..e6e7b6095363 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize); 46 DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
47 return 0; 47 return 0;
48 } 48 }
49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { 49 if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
50 (1 << NF_IP_LOCAL_OUT))) {
50 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); 51 DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
51 return 0; 52 return 0;
52 } 53 }
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
76 struct ip_nat_range newrange; 77 struct ip_nat_range newrange;
77 78
78 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING 79 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
79 || hooknum == NF_IP_POST_ROUTING); 80 || hooknum == NF_IP_POST_ROUTING
81 || hooknum == NF_IP_LOCAL_OUT);
80 ct = ip_conntrack_get(*pskb, &ctinfo); 82 ct = ip_conntrack_get(*pskb, &ctinfo);
81 83
82 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 84 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
83 85
84 if (hooknum == NF_IP_PRE_ROUTING) 86 if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
85 new_ip = (*pskb)->nh.iph->daddr & ~netmask; 87 new_ip = (*pskb)->nh.iph->daddr & ~netmask;
86 else 88 else
87 new_ip = (*pskb)->nh.iph->saddr & ~netmask; 89 new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 000000000000..3cedc9be8807
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
1/* iptables module for using new netfilter netlink queue
2 *
3 * (C) 2005 by Harald Welte <laforge@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13
14#include <linux/netfilter.h>
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("iptables NFQUEUE target");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23target(struct sk_buff **pskb,
24 const struct net_device *in,
25 const struct net_device *out,
26 unsigned int hooknum,
27 const void *targinfo,
28 void *userinfo)
29{
30 const struct ipt_NFQ_info *tinfo = targinfo;
31
32 return NF_QUEUE_NR(tinfo->queuenum);
33}
34
35static int
36checkentry(const char *tablename,
37 const struct ipt_entry *e,
38 void *targinfo,
39 unsigned int targinfosize,
40 unsigned int hook_mask)
41{
42 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
43 printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
44 targinfosize,
45 IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
46 return 0;
47 }
48
49 return 1;
50}
51
52static struct ipt_target ipt_NFQ_reg = {
53 .name = "NFQUEUE",
54 .target = target,
55 .checkentry = checkentry,
56 .me = THIS_MODULE,
57};
58
59static int __init init(void)
60{
61 return ipt_register_target(&ipt_NFQ_reg);
62}
63
64static void __exit fini(void)
65{
66 ipt_unregister_target(&ipt_NFQ_reg);
67}
68
69module_init(init);
70module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 915696446020..f115a84a4ac6 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
156 156
157 /* This packet will not be the same as the other: clear nf fields */ 157 /* This packet will not be the same as the other: clear nf fields */
158 nf_reset(nskb); 158 nf_reset(nskb);
159 nskb->nfcache = 0;
160 nskb->nfmark = 0; 159 nskb->nfmark = 0;
161#ifdef CONFIG_BRIDGE_NETFILTER 160#ifdef CONFIG_BRIDGE_NETFILTER
162 nf_bridge_put(nskb->nf_bridge); 161 nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 7b84a254440e..8db70d6908c3 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb,
58 unsigned int i; 58 unsigned int i;
59 u_int8_t *opt; 59 u_int8_t *opt;
60 60
61 if (!skb_ip_make_writable(pskb, (*pskb)->len)) 61 if (!skb_make_writable(pskb, (*pskb)->len))
62 return NF_DROP; 62 return NF_DROP;
63 63
64 if ((*pskb)->ip_summed == CHECKSUM_HW && 64 if ((*pskb)->ip_summed == CHECKSUM_HW &&
@@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
190 newmss); 190 newmss);
191 191
192 retmodified: 192 retmodified:
193 (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
194 return IPT_CONTINUE; 193 return IPT_CONTINUE;
195} 194}
196 195
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8b..deadb36d4428 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { 33 if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
34 u_int16_t diffs[2]; 34 u_int16_t diffs[2];
35 35
36 if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) 36 if (!skb_make_writable(pskb, sizeof(struct iphdr)))
37 return NF_DROP; 37 return NF_DROP;
38 38
39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; 39 diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
46 sizeof(diffs), 46 sizeof(diffs),
47 (*pskb)->nh.iph->check 47 (*pskb)->nh.iph->check
48 ^0xFFFF)); 48 ^0xFFFF));
49 (*pskb)->nfcache |= NFC_ALTERED;
50 } 49 }
51 return IPT_CONTINUE; 50 return IPT_CONTINUE;
52} 51}
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 000000000000..b9ae6a9382f3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
1/* TTL modification target for IP tables
2 * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 */
9
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/ip.h>
13#include <net/checksum.h>
14
15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/netfilter_ipv4/ipt_TTL.h>
17
18MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
19MODULE_DESCRIPTION("IP tables TTL modification module");
20MODULE_LICENSE("GPL");
21
22static unsigned int
23ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
24 const struct net_device *out, unsigned int hooknum,
25 const void *targinfo, void *userinfo)
26{
27 struct iphdr *iph;
28 const struct ipt_TTL_info *info = targinfo;
29 u_int16_t diffs[2];
30 int new_ttl;
31
32 if (!skb_make_writable(pskb, (*pskb)->len))
33 return NF_DROP;
34
35 iph = (*pskb)->nh.iph;
36
37 switch (info->mode) {
38 case IPT_TTL_SET:
39 new_ttl = info->ttl;
40 break;
41 case IPT_TTL_INC:
42 new_ttl = iph->ttl + info->ttl;
43 if (new_ttl > 255)
44 new_ttl = 255;
45 break;
46 case IPT_TTL_DEC:
47 new_ttl = iph->ttl - info->ttl;
48 if (new_ttl < 0)
49 new_ttl = 0;
50 break;
51 default:
52 new_ttl = iph->ttl;
53 break;
54 }
55
56 if (new_ttl != iph->ttl) {
57 diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
58 iph->ttl = new_ttl;
59 diffs[1] = htons(((unsigned)iph->ttl) << 8);
60 iph->check = csum_fold(csum_partial((char *)diffs,
61 sizeof(diffs),
62 iph->check^0xFFFF));
63 }
64
65 return IPT_CONTINUE;
66}
67
68static int ipt_ttl_checkentry(const char *tablename,
69 const struct ipt_entry *e,
70 void *targinfo,
71 unsigned int targinfosize,
72 unsigned int hook_mask)
73{
74 struct ipt_TTL_info *info = targinfo;
75
76 if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
77 printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
78 targinfosize,
79 IPT_ALIGN(sizeof(struct ipt_TTL_info)));
80 return 0;
81 }
82
83 if (strcmp(tablename, "mangle")) {
84 printk(KERN_WARNING "ipt_TTL: can only be called from "
85 "\"mangle\" table, not \"%s\"\n", tablename);
86 return 0;
87 }
88
89 if (info->mode > IPT_TTL_MAXMODE) {
90 printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
91 info->mode);
92 return 0;
93 }
94
95 if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
96 return 0;
97
98 return 1;
99}
100
101static struct ipt_target ipt_TTL = {
102 .name = "TTL",
103 .target = ipt_ttl_target,
104 .checkentry = ipt_ttl_checkentry,
105 .me = THIS_MODULE,
106};
107
108static int __init init(void)
109{
110 return ipt_register_target(&ipt_TTL);
111}
112
113static void __exit fini(void)
114{
115 ipt_unregister_target(&ipt_TTL);
116}
117
118module_init(init);
119module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a7..e2c14f3cb2fc 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
62MODULE_LICENSE("GPL"); 62MODULE_LICENSE("GPL");
63MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); 63MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
64MODULE_DESCRIPTION("iptables userspace logging module"); 64MODULE_DESCRIPTION("iptables userspace logging module");
65MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
65 66
66#define ULOG_NL_EVENT 111 /* Harald's favorite number */ 67#define ULOG_NL_EVENT 111 /* Harald's favorite number */
67#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ 68#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
115 if (ub->qlen > 1) 116 if (ub->qlen > 1)
116 ub->lastnlh->nlmsg_type = NLMSG_DONE; 117 ub->lastnlh->nlmsg_type = NLMSG_DONE;
117 118
118 NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); 119 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
119 DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", 120 DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
120 ub->qlen, nlgroupnum); 121 ub->qlen, nlgroupnum + 1);
121 netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); 122 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
122 123
123 ub->qlen = 0; 124 ub->qlen = 0;
124 ub->skb = NULL; 125 ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
219 pm = NLMSG_DATA(nlh); 220 pm = NLMSG_DATA(nlh);
220 221
221 /* We might not have a timestamp, get one */ 222 /* We might not have a timestamp, get one */
222 if (skb->stamp.tv_sec == 0) 223 if (skb->tstamp.off_sec == 0)
223 do_gettimeofday((struct timeval *)&skb->stamp); 224 __net_timestamp((struct sk_buff *)skb);
224 225
225 /* copy hook, prefix, timestamp, payload, etc. */ 226 /* copy hook, prefix, timestamp, payload, etc. */
226 pm->data_len = copy_len; 227 pm->data_len = copy_len;
227 pm->timestamp_sec = skb->stamp.tv_sec; 228 pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
228 pm->timestamp_usec = skb->stamp.tv_usec; 229 pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
229 pm->mark = skb->nfmark; 230 pm->mark = skb->nfmark;
230 pm->hook = hooknum; 231 pm->hook = hooknum;
231 if (prefix != NULL) 232 if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
303 return IPT_CONTINUE; 304 return IPT_CONTINUE;
304} 305}
305 306
306static void ipt_logfn(unsigned int hooknum, 307static void ipt_logfn(unsigned int pf,
308 unsigned int hooknum,
307 const struct sk_buff *skb, 309 const struct sk_buff *skb,
308 const struct net_device *in, 310 const struct net_device *in,
309 const struct net_device *out, 311 const struct net_device *out,
312 const struct nf_loginfo *li,
310 const char *prefix) 313 const char *prefix)
311{ 314{
312 struct ipt_ulog_info loginfo = { 315 struct ipt_ulog_info loginfo;
313 .nl_group = ULOG_DEFAULT_NLGROUP, 316
314 .copy_range = 0, 317 if (!li || li->type != NF_LOG_TYPE_ULOG) {
315 .qthreshold = ULOG_DEFAULT_QTHRESHOLD, 318 loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
316 .prefix = "" 319 loginfo.copy_range = 0;
317 }; 320 loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
321 loginfo.prefix[0] = '\0';
322 } else {
323 loginfo.nl_group = li->u.ulog.group;
324 loginfo.copy_range = li->u.ulog.copy_len;
325 loginfo.qthreshold = li->u.ulog.qthreshold;
326 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
327 }
318 328
319 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 329 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
320} 330}
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
354 .me = THIS_MODULE, 364 .me = THIS_MODULE,
355}; 365};
356 366
367static struct nf_logger ipt_ulog_logger = {
368 .name = "ipt_ULOG",
369 .logfn = &ipt_logfn,
370 .me = THIS_MODULE,
371};
372
357static int __init init(void) 373static int __init init(void)
358{ 374{
359 int i; 375 int i;
@@ -372,7 +388,8 @@ static int __init init(void)
372 ulog_buffers[i].timer.data = i; 388 ulog_buffers[i].timer.data = i;
373 } 389 }
374 390
375 nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); 391 nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
392 THIS_MODULE);
376 if (!nflognl) 393 if (!nflognl)
377 return -ENOMEM; 394 return -ENOMEM;
378 395
@@ -381,7 +398,7 @@ static int __init init(void)
381 return -EINVAL; 398 return -EINVAL;
382 } 399 }
383 if (nflog) 400 if (nflog)
384 nf_log_register(PF_INET, &ipt_logfn); 401 nf_log_register(PF_INET, &ipt_ulog_logger);
385 402
386 return 0; 403 return 0;
387} 404}
@@ -394,7 +411,7 @@ static void __exit fini(void)
394 DEBUGP("ipt_ULOG: cleanup_module\n"); 411 DEBUGP("ipt_ULOG: cleanup_module\n");
395 412
396 if (nflog) 413 if (nflog)
397 nf_log_unregister(PF_INET, &ipt_logfn); 414 nf_log_unregister_logger(&ipt_ulog_logger);
398 ipt_unregister_target(&ipt_ulog_reg); 415 ipt_unregister_target(&ipt_ulog_reg);
399 sock_release(nflognl->sk_socket); 416 sock_release(nflognl->sk_socket);
400 417
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 000000000000..df4a42c6da22
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
1/* Kernel module to match connection tracking byte counter.
2 * GPL (C) 2002 Martin Devera (devik@cdi.cz).
3 *
4 * 2004-07-20 Harald Welte <laforge@netfilter.org>
5 * - reimplemented to use per-connection accounting counters
6 * - add functionality to match number of packets
7 * - add functionality to match average packet size
8 * - add support to match directions separately
9 *
10 */
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netfilter_ipv4/ip_conntrack.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_connbytes.h>
16
17#include <asm/div64.h>
18#include <asm/bitops.h>
19
20MODULE_LICENSE("GPL");
21MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
22MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
23
/*
 * 64bit divisor, dividend and result. dynamic precision
 *
 * When the divisor does not fit in 32 bits, both operands are shifted
 * right by the same amount until it does, and the division is then done
 * with do_div() on the reduced values.
 *
 * A zero divisor yields 0 instead of being passed to do_div(): match()
 * in this file divides by per-direction packet counters, and nothing
 * visible here guarantees those are non-zero when a rule is evaluated.
 */
static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
{
	u_int32_t d = divisor;

	if (divisor == 0)
		return 0;

	if (divisor > 0xffffffffULL) {
		/* Number of significant bits above bit 31. */
		unsigned int shift = fls(divisor >> 32);

		d = divisor >> shift;
		dividend >>= shift;
	}

	do_div(dividend, d);
	return dividend;
}
39
40static int
41match(const struct sk_buff *skb,
42 const struct net_device *in,
43 const struct net_device *out,
44 const void *matchinfo,
45 int offset,
46 int *hotdrop)
47{
48 const struct ipt_connbytes_info *sinfo = matchinfo;
49 enum ip_conntrack_info ctinfo;
50 struct ip_conntrack *ct;
51 u_int64_t what = 0; /* initialize to make gcc happy */
52
53 if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
54 return 0; /* no match */
55
56 switch (sinfo->what) {
57 case IPT_CONNBYTES_PKTS:
58 switch (sinfo->direction) {
59 case IPT_CONNBYTES_DIR_ORIGINAL:
60 what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
61 break;
62 case IPT_CONNBYTES_DIR_REPLY:
63 what = ct->counters[IP_CT_DIR_REPLY].packets;
64 break;
65 case IPT_CONNBYTES_DIR_BOTH:
66 what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
67 what += ct->counters[IP_CT_DIR_REPLY].packets;
68 break;
69 }
70 break;
71 case IPT_CONNBYTES_BYTES:
72 switch (sinfo->direction) {
73 case IPT_CONNBYTES_DIR_ORIGINAL:
74 what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
75 break;
76 case IPT_CONNBYTES_DIR_REPLY:
77 what = ct->counters[IP_CT_DIR_REPLY].bytes;
78 break;
79 case IPT_CONNBYTES_DIR_BOTH:
80 what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
81 what += ct->counters[IP_CT_DIR_REPLY].bytes;
82 break;
83 }
84 break;
85 case IPT_CONNBYTES_AVGPKT:
86 switch (sinfo->direction) {
87 case IPT_CONNBYTES_DIR_ORIGINAL:
88 what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
89 ct->counters[IP_CT_DIR_ORIGINAL].packets);
90 break;
91 case IPT_CONNBYTES_DIR_REPLY:
92 what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
93 ct->counters[IP_CT_DIR_REPLY].packets);
94 break;
95 case IPT_CONNBYTES_DIR_BOTH:
96 {
97 u_int64_t bytes;
98 u_int64_t pkts;
99 bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
100 ct->counters[IP_CT_DIR_REPLY].bytes;
101 pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
102 ct->counters[IP_CT_DIR_REPLY].packets;
103
104 /* FIXME_THEORETICAL: what to do if sum
105 * overflows ? */
106
107 what = div64_64(bytes, pkts);
108 }
109 break;
110 }
111 break;
112 }
113
114 if (sinfo->count.to)
115 return (what <= sinfo->count.to && what >= sinfo->count.from);
116 else
117 return (what >= sinfo->count.from);
118}
119
120static int check(const char *tablename,
121 const struct ipt_ip *ip,
122 void *matchinfo,
123 unsigned int matchsize,
124 unsigned int hook_mask)
125{
126 const struct ipt_connbytes_info *sinfo = matchinfo;
127
128 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
129 return 0;
130
131 if (sinfo->what != IPT_CONNBYTES_PKTS &&
132 sinfo->what != IPT_CONNBYTES_BYTES &&
133 sinfo->what != IPT_CONNBYTES_AVGPKT)
134 return 0;
135
136 if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
137 sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
138 sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
139 return 0;
140
141 return 1;
142}
143
144static struct ipt_match state_match = {
145 .name = "connbytes",
146 .match = &match,
147 .checkentry = &check,
148 .me = THIS_MODULE
149};
150
151static int __init init(void)
152{
153 return ipt_register_match(&state_match);
154}
155
156static void __exit fini(void)
157{
158 ipt_unregister_match(&state_match);
159}
160
161module_init(init);
162module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea55..bf8de47ce004 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
54 unsigned int matchsize, 54 unsigned int matchsize,
55 unsigned int hook_mask) 55 unsigned int hook_mask)
56{ 56{
57 struct ipt_connmark_info *cm =
58 (struct ipt_connmark_info *)matchinfo;
57 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) 59 if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
58 return 0; 60 return 0;
59 61
62 if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
63 printk(KERN_WARNING "connmark: only support 32bit mark\n");
64 return 0;
65 }
66
60 return 1; 67 return 1;
61} 68}
62 69
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 000000000000..ad3278bba6c1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
1/*
2 * iptables module for DCCP protocol header matching
3 *
4 * (C) 2005 by Harald Welte <laforge@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/spinlock.h>
14#include <net/ip.h>
15#include <linux/dccp.h>
16
17#include <linux/netfilter_ipv4/ip_tables.h>
18#include <linux/netfilter_ipv4/ipt_dccp.h>
19
/*
 * DCCHECK - evaluate one optional, invertible match criterion.
 *
 * @cond:    result of the actual comparison (any non-zero value is "true")
 * @option:  flag bit identifying this criterion
 * @flag:    set of criteria enabled in the rule
 * @invflag: set of criteria whose sense is inverted
 *
 * Evaluates to non-zero when the criterion is disabled, or when the
 * comparison result differs from the inversion bit.  @cond is normalised
 * with !! before the XOR: XORing a raw value such as 4 with an inversion
 * bit of 1 would yield 5 (true) although an inverted, satisfied condition
 * must not match.
 */
#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
				  || (!!((invflag) & (option)) ^ !!(cond)))
22
23static unsigned char *dccp_optbuf;
24static DEFINE_SPINLOCK(dccp_buflock);
25
26static inline int
27dccp_find_option(u_int8_t option,
28 const struct sk_buff *skb,
29 const struct dccp_hdr *dh,
30 int *hotdrop)
31{
32 /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
33 unsigned char *op;
34 unsigned int optoff = __dccp_hdr_len(dh);
35 unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
36 unsigned int i;
37
38 if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
39 *hotdrop = 1;
40 return 0;
41 }
42
43 if (!optlen)
44 return 0;
45
46 spin_lock_bh(&dccp_buflock);
47 op = skb_header_pointer(skb,
48 skb->nh.iph->ihl*4 + optoff,
49 optlen, dccp_optbuf);
50 if (op == NULL) {
51 /* If we don't have the whole header, drop packet. */
52 spin_unlock_bh(&dccp_buflock);
53 *hotdrop = 1;
54 return 0;
55 }
56
57 for (i = 0; i < optlen; ) {
58 if (op[i] == option) {
59 spin_unlock_bh(&dccp_buflock);
60 return 1;
61 }
62
63 if (op[i] < 2)
64 i++;
65 else
66 i += op[i+1]?:1;
67 }
68
69 spin_unlock_bh(&dccp_buflock);
70 return 0;
71}
72
73
74static inline int
75match_types(const struct dccp_hdr *dh, u_int16_t typemask)
76{
77 return (typemask & (1 << dh->dccph_type));
78}
79
80static inline int
81match_option(u_int8_t option, const struct sk_buff *skb,
82 const struct dccp_hdr *dh, int *hotdrop)
83{
84 return dccp_find_option(option, skb, dh, hotdrop);
85}
86
87static int
88match(const struct sk_buff *skb,
89 const struct net_device *in,
90 const struct net_device *out,
91 const void *matchinfo,
92 int offset,
93 int *hotdrop)
94{
95 const struct ipt_dccp_info *info =
96 (const struct ipt_dccp_info *)matchinfo;
97 struct dccp_hdr _dh, *dh;
98
99 if (offset)
100 return 0;
101
102 dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
103 if (dh == NULL) {
104 *hotdrop = 1;
105 return 0;
106 }
107
108 return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
109 && (ntohs(dh->dccph_sport) <= info->spts[1])),
110 IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
111 && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0])
112 && (ntohs(dh->dccph_dport) <= info->dpts[1])),
113 IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
114 && DCCHECK(match_types(dh, info->typemask),
115 IPT_DCCP_TYPE, info->flags, info->invflags)
116 && DCCHECK(match_option(info->option, skb, dh, hotdrop),
117 IPT_DCCP_OPTION, info->flags, info->invflags);
118}
119
120static int
121checkentry(const char *tablename,
122 const struct ipt_ip *ip,
123 void *matchinfo,
124 unsigned int matchsize,
125 unsigned int hook_mask)
126{
127 const struct ipt_dccp_info *info;
128
129 info = (const struct ipt_dccp_info *)matchinfo;
130
131 return ip->proto == IPPROTO_DCCP
132 && !(ip->invflags & IPT_INV_PROTO)
133 && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
134 && !(info->flags & ~IPT_DCCP_VALID_FLAGS)
135 && !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
136 && !(info->invflags & ~info->flags);
137}
138
139static struct ipt_match dccp_match =
140{
141 .name = "dccp",
142 .match = &match,
143 .checkentry = &checkentry,
144 .me = THIS_MODULE,
145};
146
147static int __init init(void)
148{
149 int ret;
150
151 /* doff is 8 bits, so the maximum option size is (4*256). Don't put
152 * this in BSS since DaveM is worried about locked TLB's for kernel
153 * BSS. */
154 dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
155 if (!dccp_optbuf)
156 return -ENOMEM;
157 ret = ipt_register_match(&dccp_match);
158 if (ret)
159 kfree(dccp_optbuf);
160
161 return ret;
162}
163
164static void __exit fini(void)
165{
166 ipt_unregister_match(&dccp_match);
167 kfree(dccp_optbuf);
168}
169
170module_init(init);
171module_exit(fini);
172
173MODULE_LICENSE("GPL");
174MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
175MODULE_DESCRIPTION("Match for DCCP protocol packets");
176
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebcf..2dd1cccbdab9 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
96static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
97static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep __read_mostly;
98 98
99static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) 99static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
100{ 100{
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b9..00bef6cdd3f8 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
37 unsigned int matchsize, 37 unsigned int matchsize,
38 unsigned int hook_mask) 38 unsigned int hook_mask)
39{ 39{
40 struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
41
40 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) 42 if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
41 return 0; 43 return 0;
42 44
45 if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
46 printk(KERN_WARNING "mark: only supports 32bit mark\n");
47 return 0;
48 }
49
43 return 1; 50 return 1;
44} 51}
45 52
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e06381..c1889f88262b 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
21MODULE_DESCRIPTION("iptables owner match"); 21MODULE_DESCRIPTION("iptables owner match");
22 22
23static int 23static int
24match_comm(const struct sk_buff *skb, const char *comm)
25{
26 struct task_struct *g, *p;
27 struct files_struct *files;
28 int i;
29
30 read_lock(&tasklist_lock);
31 do_each_thread(g, p) {
32 if(strncmp(p->comm, comm, sizeof(p->comm)))
33 continue;
34
35 task_lock(p);
36 files = p->files;
37 if(files) {
38 spin_lock(&files->file_lock);
39 for (i=0; i < files->max_fds; i++) {
40 if (fcheck_files(files, i) ==
41 skb->sk->sk_socket->file) {
42 spin_unlock(&files->file_lock);
43 task_unlock(p);
44 read_unlock(&tasklist_lock);
45 return 1;
46 }
47 }
48 spin_unlock(&files->file_lock);
49 }
50 task_unlock(p);
51 } while_each_thread(g, p);
52 read_unlock(&tasklist_lock);
53 return 0;
54}
55
56static int
57match_pid(const struct sk_buff *skb, pid_t pid)
58{
59 struct task_struct *p;
60 struct files_struct *files;
61 int i;
62
63 read_lock(&tasklist_lock);
64 p = find_task_by_pid(pid);
65 if (!p)
66 goto out;
67 task_lock(p);
68 files = p->files;
69 if(files) {
70 spin_lock(&files->file_lock);
71 for (i=0; i < files->max_fds; i++) {
72 if (fcheck_files(files, i) ==
73 skb->sk->sk_socket->file) {
74 spin_unlock(&files->file_lock);
75 task_unlock(p);
76 read_unlock(&tasklist_lock);
77 return 1;
78 }
79 }
80 spin_unlock(&files->file_lock);
81 }
82 task_unlock(p);
83out:
84 read_unlock(&tasklist_lock);
85 return 0;
86}
87
88static int
89match_sid(const struct sk_buff *skb, pid_t sid)
90{
91 struct task_struct *g, *p;
92 struct file *file = skb->sk->sk_socket->file;
93 int i, found=0;
94
95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) {
97 struct files_struct *files;
98 if (p->signal->session != sid)
99 continue;
100
101 task_lock(p);
102 files = p->files;
103 if (files) {
104 spin_lock(&files->file_lock);
105 for (i=0; i < files->max_fds; i++) {
106 if (fcheck_files(files, i) == file) {
107 found = 1;
108 break;
109 }
110 }
111 spin_unlock(&files->file_lock);
112 }
113 task_unlock(p);
114 if (found)
115 goto out;
116 } while_each_thread(g, p);
117out:
118 read_unlock(&tasklist_lock);
119
120 return found;
121}
122
123static int
124match(const struct sk_buff *skb, 24match(const struct sk_buff *skb,
125 const struct net_device *in, 25 const struct net_device *in,
126 const struct net_device *out, 26 const struct net_device *out,
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb,
145 return 0; 45 return 0;
146 } 46 }
147 47
148 if(info->match & IPT_OWNER_PID) {
149 if (!match_pid(skb, info->pid) ^
150 !!(info->invert & IPT_OWNER_PID))
151 return 0;
152 }
153
154 if(info->match & IPT_OWNER_SID) {
155 if (!match_sid(skb, info->sid) ^
156 !!(info->invert & IPT_OWNER_SID))
157 return 0;
158 }
159
160 if(info->match & IPT_OWNER_COMM) {
161 if (!match_comm(skb, info->comm) ^
162 !!(info->invert & IPT_OWNER_COMM))
163 return 0;
164 }
165
166 return 1; 48 return 1;
167} 49}
168 50
@@ -173,6 +55,8 @@ checkentry(const char *tablename,
173 unsigned int matchsize, 55 unsigned int matchsize,
174 unsigned int hook_mask) 56 unsigned int hook_mask)
175{ 57{
58 const struct ipt_owner_info *info = matchinfo;
59
176 if (hook_mask 60 if (hook_mask
177 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { 61 & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
178 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); 62 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +68,13 @@ checkentry(const char *tablename,
184 IPT_ALIGN(sizeof(struct ipt_owner_info))); 68 IPT_ALIGN(sizeof(struct ipt_owner_info)));
185 return 0; 69 return 0;
186 } 70 }
187#ifdef CONFIG_SMP 71
188 /* files->file_lock can not be used in a BH */ 72 if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
189 if (((struct ipt_owner_info *)matchinfo)->match 73 printk("ipt_owner: pid, sid and command matching "
190 & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { 74 "not supported anymore\n");
191 printk("ipt_owner: pid, sid and command matching is broken "
192 "on SMP.\n");
193 return 0; 75 return 0;
194 } 76 }
195#endif 77
196 return 1; 78 return 1;
197} 79}
198 80
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 000000000000..b5def204d798
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
1/* String matching match for iptables
2 *
3 * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/skbuff.h>
14#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/netfilter_ipv4/ipt_string.h>
16#include <linux/textsearch.h>
17
18MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
19MODULE_DESCRIPTION("IP tables string match module");
20MODULE_LICENSE("GPL");
21
22static int match(const struct sk_buff *skb,
23 const struct net_device *in,
24 const struct net_device *out,
25 const void *matchinfo,
26 int offset,
27 int *hotdrop)
28{
29 struct ts_state state;
30 struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
31
32 memset(&state, 0, sizeof(struct ts_state));
33
34 return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
35 conf->to_offset, conf->config, &state)
36 != UINT_MAX) && !conf->invert;
37}
38
/* View an opaque matchinfo pointer as the string match's rule data.
 * The argument is parenthesised so expressions such as (p + n) are cast
 * as a whole instead of having the cast bind to the first operand. */
#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) (m))
40
41static int checkentry(const char *tablename,
42 const struct ipt_ip *ip,
43 void *matchinfo,
44 unsigned int matchsize,
45 unsigned int hook_mask)
46{
47 struct ipt_string_info *conf = matchinfo;
48 struct ts_config *ts_conf;
49
50 if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
51 return 0;
52
53 /* Damn, can't handle this case properly with iptables... */
54 if (conf->from_offset > conf->to_offset)
55 return 0;
56
57 ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
58 GFP_KERNEL, TS_AUTOLOAD);
59 if (IS_ERR(ts_conf))
60 return 0;
61
62 conf->config = ts_conf;
63
64 return 1;
65}
66
67static void destroy(void *matchinfo, unsigned int matchsize)
68{
69 textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
70}
71
72static struct ipt_match string_match = {
73 .name = "string",
74 .match = match,
75 .checkentry = checkentry,
76 .destroy = destroy,
77 .me = THIS_MODULE
78};
79
80static int __init init(void)
81{
82 return ipt_register_match(&string_match);
83}
84
85static void __exit fini(void)
86{
87 ipt_unregister_match(&string_match);
88}
89
90module_init(init);
91module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f415..f7943ba1f43c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
59 */ 59 */
60static int sockstat_seq_show(struct seq_file *seq, void *v) 60static int sockstat_seq_show(struct seq_file *seq, void *v)
61{ 61{
62 /* From net/socket.c */
63 extern void socket_seq_show(struct seq_file *seq);
64
65 socket_seq_show(seq); 62 socket_seq_show(seq);
66 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
67 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), 64 fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
68 tcp_tw_count, atomic_read(&tcp_sockets_allocated), 65 tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
69 atomic_read(&tcp_memory_allocated)); 66 atomic_read(&tcp_memory_allocated));
70 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); 67 seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
71 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); 68 seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f2..291831e792af 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <net/ip.h> 41#include <net/ip.h>
42#include <net/protocol.h> 42#include <net/protocol.h>
43#include <net/tcp.h>
44#include <linux/skbuff.h> 43#include <linux/skbuff.h>
45#include <net/sock.h> 44#include <net/sock.h>
46#include <net/icmp.h> 45#include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c4..304bb0a1d4f0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
59#include <linux/netdevice.h> 59#include <linux/netdevice.h>
60#include <linux/in_route.h> 60#include <linux/in_route.h>
61#include <linux/route.h> 61#include <linux/route.h>
62#include <linux/tcp.h>
63#include <linux/skbuff.h> 62#include <linux/skbuff.h>
64#include <net/dst.h> 63#include <net/dst.h>
65#include <net/sock.h> 64#include <net/sock.h>
@@ -71,6 +70,7 @@
71#include <net/udp.h> 70#include <net/udp.h>
72#include <net/raw.h> 71#include <net/raw.h>
73#include <net/snmp.h> 72#include <net/snmp.h>
73#include <net/tcp_states.h>
74#include <net/inet_common.h> 74#include <net/inet_common.h>
75#include <net/checksum.h> 75#include <net/checksum.h>
76#include <net/xfrm.h> 76#include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
150 * RFC 1122: SHOULD pass TOS value up to the transport layer. 150 * RFC 1122: SHOULD pass TOS value up to the transport layer.
151 * -> It does. And not only TOS, but all IP header. 151 * -> It does. And not only TOS, but all IP header.
152 */ 152 */
153void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 153int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
154{ 154{
155 struct sock *sk; 155 struct sock *sk;
156 struct hlist_head *head; 156 struct hlist_head *head;
157 int delivered = 0;
157 158
158 read_lock(&raw_v4_lock); 159 read_lock(&raw_v4_lock);
159 head = &raw_v4_htable[hash]; 160 head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
164 skb->dev->ifindex); 165 skb->dev->ifindex);
165 166
166 while (sk) { 167 while (sk) {
168 delivered = 1;
167 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { 169 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
168 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 170 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
169 171
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
177 } 179 }
178out: 180out:
179 read_unlock(&raw_v4_lock); 181 read_unlock(&raw_v4_lock);
182 return delivered;
180} 183}
181 184
182void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) 185void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d675ff80b04d..8c0b14e3beec 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned rt_hash_mask;
240static int rt_hash_log; 240static int rt_hash_log;
241static unsigned int rt_hash_rnd; 241static unsigned int rt_hash_rnd;
242 242
243struct rt_cache_stat *rt_cache_stat; 243static struct rt_cache_stat *rt_cache_stat;
244#define RT_CACHE_STAT_INC(field) \
245 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
244 246
245static int rt_intern_hash(unsigned hash, struct rtable *rth, 247static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 struct rtable **res); 248 struct rtable **res);
@@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2600 return ip_route_output_slow(rp, flp); 2602 return ip_route_output_slow(rp, flp);
2601} 2603}
2602 2604
2605EXPORT_SYMBOL_GPL(__ip_route_output_key);
2606
2603int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) 2607int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604{ 2608{
2605 int err; 2609 int err;
@@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
2618 return 0; 2622 return 0;
2619} 2623}
2620 2624
2625EXPORT_SYMBOL_GPL(ip_route_output_flow);
2626
2621int ip_route_output_key(struct rtable **rp, struct flowi *flp) 2627int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622{ 2628{
2623 return ip_route_output_flow(rp, flp, NULL, 0); 2629 return ip_route_output_flow(rp, flp, NULL, 0);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d014442185..a34e60ea48a1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 170}
171 171
172extern struct request_sock_ops tcp_request_sock_ops;
173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 172static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct request_sock *req, 173 struct request_sock *req,
176 struct dst_entry *dst) 174 struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
180 178
181 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); 179 child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
182 if (child) 180 if (child)
183 tcp_acceptq_queue(sk, req, child); 181 inet_csk_reqsk_queue_add(sk, req, child);
184 else 182 else
185 reqsk_free(req); 183 reqsk_free(req);
186 184
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e32894532416..652685623519 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sysctl.h> 12#include <linux/sysctl.h>
13#include <linux/config.h> 13#include <linux/config.h>
14#include <linux/igmp.h>
14#include <net/snmp.h> 15#include <net/snmp.h>
16#include <net/icmp.h>
15#include <net/ip.h> 17#include <net/ip.h>
16#include <net/route.h> 18#include <net/route.h>
17#include <net/tcp.h> 19#include <net/tcp.h>
@@ -19,36 +21,6 @@
19/* From af_inet.c */ 21/* From af_inet.c */
20extern int sysctl_ip_nonlocal_bind; 22extern int sysctl_ip_nonlocal_bind;
21 23
22/* From icmp.c */
23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses;
26extern int sysctl_icmp_errors_use_inbound_ifaddr;
27
28/* From ip_fragment.c */
29extern int sysctl_ipfrag_low_thresh;
30extern int sysctl_ipfrag_high_thresh;
31extern int sysctl_ipfrag_time;
32extern int sysctl_ipfrag_secret_interval;
33
34/* From ip_output.c */
35extern int sysctl_ip_dynaddr;
36
37/* From icmp.c */
38extern int sysctl_icmp_ratelimit;
39extern int sysctl_icmp_ratemask;
40
41/* From igmp.c */
42extern int sysctl_igmp_max_memberships;
43extern int sysctl_igmp_max_msf;
44
45/* From inetpeer.c */
46extern int inet_peer_threshold;
47extern int inet_peer_minttl;
48extern int inet_peer_maxttl;
49extern int inet_peer_gc_mintime;
50extern int inet_peer_gc_maxtime;
51
52#ifdef CONFIG_SYSCTL 24#ifdef CONFIG_SYSCTL
53static int tcp_retr1_max = 255; 25static int tcp_retr1_max = 255;
54static int ip_local_port_range_min[] = { 1, 1 }; 26static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
57 29
58struct ipv4_config ipv4_config; 30struct ipv4_config ipv4_config;
59 31
60extern ctl_table ipv4_route_table[];
61
62#ifdef CONFIG_SYSCTL 32#ifdef CONFIG_SYSCTL
63 33
64static 34static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
136 return ret; 106 return ret;
137} 107}
138 108
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, 109static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
140 void __user *oldval, size_t __user *oldlenp, 110 int nlen, void __user *oldval,
141 void __user *newval, size_t newlen, 111 size_t __user *oldlenp,
142 void **context) 112 void __user *newval, size_t newlen,
113 void **context)
143{ 114{
144 char val[TCP_CA_NAME_MAX]; 115 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = { 116 ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
259 { 230 {
260 .ctl_name = NET_TCP_MAX_TW_BUCKETS, 231 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
261 .procname = "tcp_max_tw_buckets", 232 .procname = "tcp_max_tw_buckets",
262 .data = &sysctl_tcp_max_tw_buckets, 233 .data = &tcp_death_row.sysctl_max_tw_buckets,
263 .maxlen = sizeof(int), 234 .maxlen = sizeof(int),
264 .mode = 0644, 235 .mode = 0644,
265 .proc_handler = &proc_dointvec 236 .proc_handler = &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
363 { 334 {
364 .ctl_name = NET_TCP_TW_RECYCLE, 335 .ctl_name = NET_TCP_TW_RECYCLE,
365 .procname = "tcp_tw_recycle", 336 .procname = "tcp_tw_recycle",
366 .data = &sysctl_tcp_tw_recycle, 337 .data = &tcp_death_row.sysctl_tw_recycle,
367 .maxlen = sizeof(int), 338 .maxlen = sizeof(int),
368 .mode = 0644, 339 .mode = 0644,
369 .proc_handler = &proc_dointvec 340 .proc_handler = &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 69b1fcf70077..02fdda68718d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
269 269
270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 270int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); 272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
273
274kmem_cache_t *tcp_bucket_cachep;
275kmem_cache_t *tcp_timewait_cachep;
276 273
277atomic_t tcp_orphan_count = ATOMIC_INIT(0); 274atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 275
276EXPORT_SYMBOL_GPL(tcp_orphan_count);
277
279int sysctl_tcp_mem[3]; 278int sysctl_tcp_mem[3];
280int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; 279int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; 280int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
311EXPORT_SYMBOL(tcp_enter_memory_pressure); 310EXPORT_SYMBOL(tcp_enter_memory_pressure);
312 311
313/* 312/*
314 * LISTEN is a special case for poll..
315 */
316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317 poll_table *wait)
318{
319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320}
321
322/*
323 * Wait for a TCP event. 313 * Wait for a TCP event.
324 * 314 *
325 * Note that we don't need to lock the socket, as the upper poll layers 315 * Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
334 324
335 poll_wait(file, sk->sk_sleep, wait); 325 poll_wait(file, sk->sk_sleep, wait);
336 if (sk->sk_state == TCP_LISTEN) 326 if (sk->sk_state == TCP_LISTEN)
337 return tcp_listen_poll(sk, wait); 327 return inet_csk_listen_poll(sk);
338 328
339 /* Socket is not locked. We are protected from async events 329 /* Socket is not locked. We are protected from async events
340 by poll logic and correct handling of state changes 330 by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
457 return put_user(answ, (int __user *)arg); 447 return put_user(answ, (int __user *)arg);
458} 448}
459 449
460
461int tcp_listen_start(struct sock *sk)
462{
463 struct inet_sock *inet = inet_sk(sk);
464 struct tcp_sock *tp = tcp_sk(sk);
465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
469
470 sk->sk_max_ack_backlog = 0;
471 sk->sk_ack_backlog = 0;
472 tcp_delack_init(tp);
473
474 /* There is race window here: we announce ourselves listening,
475 * but this transition is still not validated by get_port().
476 * It is OK, because this socket enters to hash table only
477 * after validation is complete.
478 */
479 sk->sk_state = TCP_LISTEN;
480 if (!sk->sk_prot->get_port(sk, inet->num)) {
481 inet->sport = htons(inet->num);
482
483 sk_dst_reset(sk);
484 sk->sk_prot->hash(sk);
485
486 return 0;
487 }
488
489 sk->sk_state = TCP_CLOSE;
490 reqsk_queue_destroy(&tp->accept_queue);
491 return -EADDRINUSE;
492}
493
494/*
495 * This routine closes sockets which have been at least partially
496 * opened, but not yet accepted.
497 */
498
499static void tcp_listen_stop (struct sock *sk)
500{
501 struct tcp_sock *tp = tcp_sk(sk);
502 struct listen_sock *lopt;
503 struct request_sock *acc_req;
504 struct request_sock *req;
505 int i;
506
507 tcp_delete_keepalive_timer(sk);
508
509 /* make all the listen_opt local to us */
510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513 if (lopt->qlen) {
514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515 while ((req = lopt->syn_table[i]) != NULL) {
516 lopt->syn_table[i] = req->dl_next;
517 lopt->qlen--;
518 reqsk_free(req);
519
520 /* Following specs, it would be better either to send FIN
521 * (and enter FIN-WAIT-1, it is normal close)
522 * or to send active reset (abort).
523 * Certainly, it is pretty dangerous while synflood, but it is
524 * bad justification for our negligence 8)
525 * To be honest, we are not able to make either
526 * of the variants now. --ANK
527 */
528 }
529 }
530 }
531 BUG_TRAP(!lopt->qlen);
532
533 kfree(lopt);
534
535 while ((req = acc_req) != NULL) {
536 struct sock *child = req->sk;
537
538 acc_req = req->dl_next;
539
540 local_bh_disable();
541 bh_lock_sock(child);
542 BUG_TRAP(!sock_owned_by_user(child));
543 sock_hold(child);
544
545 tcp_disconnect(child, O_NONBLOCK);
546
547 sock_orphan(child);
548
549 atomic_inc(&tcp_orphan_count);
550
551 tcp_destroy_sock(child);
552
553 bh_unlock_sock(child);
554 local_bh_enable();
555 sock_put(child);
556
557 sk_acceptq_removed(sk);
558 __reqsk_free(req);
559 }
560 BUG_TRAP(!sk->sk_ack_backlog);
561}
562
563static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 450static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564{ 451{
565 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 452 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -975,7 +862,7 @@ do_fault:
975 if (!skb->len) { 862 if (!skb->len) {
976 if (sk->sk_send_head == skb) 863 if (sk->sk_send_head == skb)
977 sk->sk_send_head = NULL; 864 sk->sk_send_head = NULL;
978 __skb_unlink(skb, skb->list); 865 __skb_unlink(skb, &sk->sk_write_queue);
979 sk_stream_free_skb(sk, skb); 866 sk_stream_free_skb(sk, skb);
980 } 867 }
981 868
@@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
1057 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); 944 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1058#endif 945#endif
1059 946
1060 if (tcp_ack_scheduled(tp)) { 947 if (inet_csk_ack_scheduled(sk)) {
948 const struct inet_connection_sock *icsk = inet_csk(sk);
1061 /* Delayed ACKs frequently hit locked sockets during bulk 949 /* Delayed ACKs frequently hit locked sockets during bulk
1062 * receive. */ 950 * receive. */
1063 if (tp->ack.blocked || 951 if (icsk->icsk_ack.blocked ||
1064 /* Once-per-two-segments ACK was not sent by tcp_input.c */ 952 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1065 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || 953 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1066 /* 954 /*
1067 * If this read emptied read buffer, we send ACK, if 955 * If this read emptied read buffer, we send ACK, if
1068 * connection is not bidirectional, user drained 956 * connection is not bidirectional, user drained
1069 * receive buffer and there was a small segment 957 * receive buffer and there was a small segment
1070 * in queue. 958 * in queue.
1071 */ 959 */
1072 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && 960 (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1073 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) 961 !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1074 time_to_ack = 1; 962 time_to_ack = 1;
1075 } 963 }
1076 964
@@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how)
1572 } 1460 }
1573} 1461}
1574 1462
1575/*
1576 * At this point, there should be no process reference to this
1577 * socket, and thus no user references at all. Therefore we
1578 * can assume the socket waitqueue is inactive and nobody will
1579 * try to jump onto it.
1580 */
1581void tcp_destroy_sock(struct sock *sk)
1582{
1583 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1584 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1585
1586 /* It cannot be in hash table! */
1587 BUG_TRAP(sk_unhashed(sk));
1588
1589 /* If it has not 0 inet_sk(sk)->num, it must be bound */
1590 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1591
1592 sk->sk_prot->destroy(sk);
1593
1594 sk_stream_kill_queues(sk);
1595
1596 xfrm_sk_free_policy(sk);
1597
1598#ifdef INET_REFCNT_DEBUG
1599 if (atomic_read(&sk->sk_refcnt) != 1) {
1600 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1601 sk, atomic_read(&sk->sk_refcnt));
1602 }
1603#endif
1604
1605 atomic_dec(&tcp_orphan_count);
1606 sock_put(sk);
1607}
1608
1609void tcp_close(struct sock *sk, long timeout) 1463void tcp_close(struct sock *sk, long timeout)
1610{ 1464{
1611 struct sk_buff *skb; 1465 struct sk_buff *skb;
@@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout)
1618 tcp_set_state(sk, TCP_CLOSE); 1472 tcp_set_state(sk, TCP_CLOSE);
1619 1473
1620 /* Special case. */ 1474 /* Special case. */
1621 tcp_listen_stop(sk); 1475 inet_csk_listen_stop(sk);
1622 1476
1623 goto adjudge_to_death; 1477 goto adjudge_to_death;
1624 } 1478 }
@@ -1721,12 +1575,12 @@ adjudge_to_death:
1721 tcp_send_active_reset(sk, GFP_ATOMIC); 1575 tcp_send_active_reset(sk, GFP_ATOMIC);
1722 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); 1576 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1723 } else { 1577 } else {
1724 int tmo = tcp_fin_time(tp); 1578 const int tmo = tcp_fin_time(sk);
1725 1579
1726 if (tmo > TCP_TIMEWAIT_LEN) { 1580 if (tmo > TCP_TIMEWAIT_LEN) {
1727 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); 1581 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1728 } else { 1582 } else {
1729 atomic_inc(&tcp_orphan_count); 1583 atomic_inc(sk->sk_prot->orphan_count);
1730 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 1584 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1731 goto out; 1585 goto out;
1732 } 1586 }
@@ -1734,7 +1588,7 @@ adjudge_to_death:
1734 } 1588 }
1735 if (sk->sk_state != TCP_CLOSE) { 1589 if (sk->sk_state != TCP_CLOSE) {
1736 sk_stream_mem_reclaim(sk); 1590 sk_stream_mem_reclaim(sk);
1737 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || 1591 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1738 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && 1592 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1739 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { 1593 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1740 if (net_ratelimit()) 1594 if (net_ratelimit())
@@ -1745,10 +1599,10 @@ adjudge_to_death:
1745 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); 1599 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1746 } 1600 }
1747 } 1601 }
1748 atomic_inc(&tcp_orphan_count); 1602 atomic_inc(sk->sk_prot->orphan_count);
1749 1603
1750 if (sk->sk_state == TCP_CLOSE) 1604 if (sk->sk_state == TCP_CLOSE)
1751 tcp_destroy_sock(sk); 1605 inet_csk_destroy_sock(sk);
1752 /* Otherwise, socket is reprieved until protocol close. */ 1606 /* Otherwise, socket is reprieved until protocol close. */
1753 1607
1754out: 1608out:
@@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state)
1769int tcp_disconnect(struct sock *sk, int flags) 1623int tcp_disconnect(struct sock *sk, int flags)
1770{ 1624{
1771 struct inet_sock *inet = inet_sk(sk); 1625 struct inet_sock *inet = inet_sk(sk);
1626 struct inet_connection_sock *icsk = inet_csk(sk);
1772 struct tcp_sock *tp = tcp_sk(sk); 1627 struct tcp_sock *tp = tcp_sk(sk);
1773 int err = 0; 1628 int err = 0;
1774 int old_state = sk->sk_state; 1629 int old_state = sk->sk_state;
@@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1778 1633
1779 /* ABORT function of RFC793 */ 1634 /* ABORT function of RFC793 */
1780 if (old_state == TCP_LISTEN) { 1635 if (old_state == TCP_LISTEN) {
1781 tcp_listen_stop(sk); 1636 inet_csk_listen_stop(sk);
1782 } else if (tcp_need_reset(old_state) || 1637 } else if (tcp_need_reset(old_state) ||
1783 (tp->snd_nxt != tp->write_seq && 1638 (tp->snd_nxt != tp->write_seq &&
1784 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 1639 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags)
1805 tp->srtt = 0; 1660 tp->srtt = 0;
1806 if ((tp->write_seq += tp->max_window + 2) == 0) 1661 if ((tp->write_seq += tp->max_window + 2) == 0)
1807 tp->write_seq = 1; 1662 tp->write_seq = 1;
1808 tp->backoff = 0; 1663 icsk->icsk_backoff = 0;
1809 tp->snd_cwnd = 2; 1664 tp->snd_cwnd = 2;
1810 tp->probes_out = 0; 1665 icsk->icsk_probes_out = 0;
1811 tp->packets_out = 0; 1666 tp->packets_out = 0;
1812 tp->snd_ssthresh = 0x7fffffff; 1667 tp->snd_ssthresh = 0x7fffffff;
1813 tp->snd_cwnd_cnt = 0; 1668 tp->snd_cwnd_cnt = 0;
1814 tcp_set_ca_state(tp, TCP_CA_Open); 1669 tcp_set_ca_state(sk, TCP_CA_Open);
1815 tcp_clear_retrans(tp); 1670 tcp_clear_retrans(tp);
1816 tcp_delack_init(tp); 1671 inet_csk_delack_init(sk);
1817 sk->sk_send_head = NULL; 1672 sk->sk_send_head = NULL;
1818 tp->rx_opt.saw_tstamp = 0; 1673 tp->rx_opt.saw_tstamp = 0;
1819 tcp_sack_reset(&tp->rx_opt); 1674 tcp_sack_reset(&tp->rx_opt);
1820 __sk_dst_reset(sk); 1675 __sk_dst_reset(sk);
1821 1676
1822 BUG_TRAP(!inet->num || tp->bind_hash); 1677 BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1823 1678
1824 sk->sk_error_report(sk); 1679 sk->sk_error_report(sk);
1825 return err; 1680 return err;
1826} 1681}
1827 1682
1828/* 1683/*
1829 * Wait for an incoming connection, avoid race
1830 * conditions. This must be called with the socket locked.
1831 */
1832static int wait_for_connect(struct sock *sk, long timeo)
1833{
1834 struct tcp_sock *tp = tcp_sk(sk);
1835 DEFINE_WAIT(wait);
1836 int err;
1837
1838 /*
1839 * True wake-one mechanism for incoming connections: only
1840 * one process gets woken up, not the 'whole herd'.
1841 * Since we do not 'race & poll' for established sockets
1842 * anymore, the common case will execute the loop only once.
1843 *
1844 * Subtle issue: "add_wait_queue_exclusive()" will be added
1845 * after any current non-exclusive waiters, and we know that
1846 * it will always _stay_ after any new non-exclusive waiters
1847 * because all non-exclusive waiters are added at the
1848 * beginning of the wait-queue. As such, it's ok to "drop"
1849 * our exclusiveness temporarily when we get woken up without
1850 * having to remove and re-insert us on the wait queue.
1851 */
1852 for (;;) {
1853 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1854 TASK_INTERRUPTIBLE);
1855 release_sock(sk);
1856 if (reqsk_queue_empty(&tp->accept_queue))
1857 timeo = schedule_timeout(timeo);
1858 lock_sock(sk);
1859 err = 0;
1860 if (!reqsk_queue_empty(&tp->accept_queue))
1861 break;
1862 err = -EINVAL;
1863 if (sk->sk_state != TCP_LISTEN)
1864 break;
1865 err = sock_intr_errno(timeo);
1866 if (signal_pending(current))
1867 break;
1868 err = -EAGAIN;
1869 if (!timeo)
1870 break;
1871 }
1872 finish_wait(sk->sk_sleep, &wait);
1873 return err;
1874}
1875
1876/*
1877 * This will accept the next outstanding connection.
1878 */
1879
1880struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1881{
1882 struct tcp_sock *tp = tcp_sk(sk);
1883 struct sock *newsk;
1884 int error;
1885
1886 lock_sock(sk);
1887
1888 /* We need to make sure that this socket is listening,
1889 * and that it has something pending.
1890 */
1891 error = -EINVAL;
1892 if (sk->sk_state != TCP_LISTEN)
1893 goto out_err;
1894
1895 /* Find already established connection */
1896 if (reqsk_queue_empty(&tp->accept_queue)) {
1897 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1898
1899 /* If this is a non blocking socket don't sleep */
1900 error = -EAGAIN;
1901 if (!timeo)
1902 goto out_err;
1903
1904 error = wait_for_connect(sk, timeo);
1905 if (error)
1906 goto out_err;
1907 }
1908
1909 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1910 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1911out:
1912 release_sock(sk);
1913 return newsk;
1914out_err:
1915 newsk = NULL;
1916 *err = error;
1917 goto out;
1918}
1919
1920/*
1921 * Socket option code for TCP. 1684 * Socket option code for TCP.
1922 */ 1685 */
1923int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 1686int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1924 int optlen) 1687 int optlen)
1925{ 1688{
1926 struct tcp_sock *tp = tcp_sk(sk); 1689 struct tcp_sock *tp = tcp_sk(sk);
1690 struct inet_connection_sock *icsk = inet_csk(sk);
1927 int val; 1691 int val;
1928 int err = 0; 1692 int err = 0;
1929 1693
@@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1945 name[val] = 0; 1709 name[val] = 0;
1946 1710
1947 lock_sock(sk); 1711 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name); 1712 err = tcp_set_congestion_control(sk, name);
1949 release_sock(sk); 1713 release_sock(sk);
1950 return err; 1714 return err;
1951 } 1715 }
@@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2022 elapsed = tp->keepalive_time - elapsed; 1786 elapsed = tp->keepalive_time - elapsed;
2023 else 1787 else
2024 elapsed = 0; 1788 elapsed = 0;
2025 tcp_reset_keepalive_timer(sk, elapsed); 1789 inet_csk_reset_keepalive_timer(sk, elapsed);
2026 } 1790 }
2027 } 1791 }
2028 break; 1792 break;
@@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2042 if (val < 1 || val > MAX_TCP_SYNCNT) 1806 if (val < 1 || val > MAX_TCP_SYNCNT)
2043 err = -EINVAL; 1807 err = -EINVAL;
2044 else 1808 else
2045 tp->syn_retries = val; 1809 icsk->icsk_syn_retries = val;
2046 break; 1810 break;
2047 1811
2048 case TCP_LINGER2: 1812 case TCP_LINGER2:
@@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2055 break; 1819 break;
2056 1820
2057 case TCP_DEFER_ACCEPT: 1821 case TCP_DEFER_ACCEPT:
2058 tp->defer_accept = 0; 1822 icsk->icsk_accept_queue.rskq_defer_accept = 0;
2059 if (val > 0) { 1823 if (val > 0) {
2060 /* Translate value in seconds to number of 1824 /* Translate value in seconds to number of
2061 * retransmits */ 1825 * retransmits */
2062 while (tp->defer_accept < 32 && 1826 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2063 val > ((TCP_TIMEOUT_INIT / HZ) << 1827 val > ((TCP_TIMEOUT_INIT / HZ) <<
2064 tp->defer_accept)) 1828 icsk->icsk_accept_queue.rskq_defer_accept))
2065 tp->defer_accept++; 1829 icsk->icsk_accept_queue.rskq_defer_accept++;
2066 tp->defer_accept++; 1830 icsk->icsk_accept_queue.rskq_defer_accept++;
2067 } 1831 }
2068 break; 1832 break;
2069 1833
@@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2081 1845
2082 case TCP_QUICKACK: 1846 case TCP_QUICKACK:
2083 if (!val) { 1847 if (!val) {
2084 tp->ack.pingpong = 1; 1848 icsk->icsk_ack.pingpong = 1;
2085 } else { 1849 } else {
2086 tp->ack.pingpong = 0; 1850 icsk->icsk_ack.pingpong = 0;
2087 if ((1 << sk->sk_state) & 1851 if ((1 << sk->sk_state) &
2088 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && 1852 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2089 tcp_ack_scheduled(tp)) { 1853 inet_csk_ack_scheduled(sk)) {
2090 tp->ack.pending |= TCP_ACK_PUSHED; 1854 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2091 cleanup_rbuf(sk, 1); 1855 cleanup_rbuf(sk, 1);
2092 if (!(val & 1)) 1856 if (!(val & 1))
2093 tp->ack.pingpong = 1; 1857 icsk->icsk_ack.pingpong = 1;
2094 } 1858 }
2095 } 1859 }
2096 break; 1860 break;
@@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2107void tcp_get_info(struct sock *sk, struct tcp_info *info) 1871void tcp_get_info(struct sock *sk, struct tcp_info *info)
2108{ 1872{
2109 struct tcp_sock *tp = tcp_sk(sk); 1873 struct tcp_sock *tp = tcp_sk(sk);
1874 const struct inet_connection_sock *icsk = inet_csk(sk);
2110 u32 now = tcp_time_stamp; 1875 u32 now = tcp_time_stamp;
2111 1876
2112 memset(info, 0, sizeof(*info)); 1877 memset(info, 0, sizeof(*info));
2113 1878
2114 info->tcpi_state = sk->sk_state; 1879 info->tcpi_state = sk->sk_state;
2115 info->tcpi_ca_state = tp->ca_state; 1880 info->tcpi_ca_state = icsk->icsk_ca_state;
2116 info->tcpi_retransmits = tp->retransmits; 1881 info->tcpi_retransmits = icsk->icsk_retransmits;
2117 info->tcpi_probes = tp->probes_out; 1882 info->tcpi_probes = icsk->icsk_probes_out;
2118 info->tcpi_backoff = tp->backoff; 1883 info->tcpi_backoff = icsk->icsk_backoff;
2119 1884
2120 if (tp->rx_opt.tstamp_ok) 1885 if (tp->rx_opt.tstamp_ok)
2121 info->tcpi_options |= TCPI_OPT_TIMESTAMPS; 1886 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2130 if (tp->ecn_flags&TCP_ECN_OK) 1895 if (tp->ecn_flags&TCP_ECN_OK)
2131 info->tcpi_options |= TCPI_OPT_ECN; 1896 info->tcpi_options |= TCPI_OPT_ECN;
2132 1897
2133 info->tcpi_rto = jiffies_to_usecs(tp->rto); 1898 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 1899 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2135 info->tcpi_snd_mss = tp->mss_cache; 1900 info->tcpi_snd_mss = tp->mss_cache;
2136 info->tcpi_rcv_mss = tp->ack.rcv_mss; 1901 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2137 1902
2138 info->tcpi_unacked = tp->packets_out; 1903 info->tcpi_unacked = tp->packets_out;
2139 info->tcpi_sacked = tp->sacked_out; 1904 info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2142 info->tcpi_fackets = tp->fackets_out; 1907 info->tcpi_fackets = tp->fackets_out;
2143 1908
2144 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); 1909 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2145 info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); 1910 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2146 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); 1911 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2147 1912
2148 info->tcpi_pmtu = tp->pmtu_cookie; 1913 info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
2165int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, 1930int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2166 int __user *optlen) 1931 int __user *optlen)
2167{ 1932{
1933 struct inet_connection_sock *icsk = inet_csk(sk);
2168 struct tcp_sock *tp = tcp_sk(sk); 1934 struct tcp_sock *tp = tcp_sk(sk);
2169 int val, len; 1935 int val, len;
2170 1936
@@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2202 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; 1968 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2203 break; 1969 break;
2204 case TCP_SYNCNT: 1970 case TCP_SYNCNT:
2205 val = tp->syn_retries ? : sysctl_tcp_syn_retries; 1971 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2206 break; 1972 break;
2207 case TCP_LINGER2: 1973 case TCP_LINGER2:
2208 val = tp->linger2; 1974 val = tp->linger2;
@@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2210 val = (val ? : sysctl_tcp_fin_timeout) / HZ; 1976 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2211 break; 1977 break;
2212 case TCP_DEFER_ACCEPT: 1978 case TCP_DEFER_ACCEPT:
2213 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << 1979 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
2214 (tp->defer_accept - 1)); 1980 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
2215 break; 1981 break;
2216 case TCP_WINDOW_CLAMP: 1982 case TCP_WINDOW_CLAMP:
2217 val = tp->window_clamp; 1983 val = tp->window_clamp;
@@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2232 return 0; 1998 return 0;
2233 } 1999 }
2234 case TCP_QUICKACK: 2000 case TCP_QUICKACK:
2235 val = !tp->ack.pingpong; 2001 val = !icsk->icsk_ack.pingpong;
2236 break; 2002 break;
2237 2003
2238 case TCP_CONGESTION: 2004 case TCP_CONGESTION:
@@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX); 2007 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen)) 2008 if (put_user(len, optlen))
2243 return -EFAULT; 2009 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len)) 2010 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2245 return -EFAULT; 2011 return -EFAULT;
2246 return 0; 2012 return 0;
2247 default: 2013 default:
@@ -2278,79 +2044,72 @@ void __init tcp_init(void)
2278 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 2044 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2279 sizeof(skb->cb)); 2045 sizeof(skb->cb));
2280 2046
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 2047 tcp_hashinfo.bind_bucket_cachep =
2282 sizeof(struct tcp_bind_bucket), 2048 kmem_cache_create("tcp_bind_bucket",
2283 0, SLAB_HWCACHE_ALIGN, 2049 sizeof(struct inet_bind_bucket), 0,
2284 NULL, NULL); 2050 SLAB_HWCACHE_ALIGN, NULL, NULL);
2285 if (!tcp_bucket_cachep) 2051 if (!tcp_hashinfo.bind_bucket_cachep)
2286 panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); 2052 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287 2053
2288 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289 sizeof(struct tcp_tw_bucket),
2290 0, SLAB_HWCACHE_ALIGN,
2291 NULL, NULL);
2292 if (!tcp_timewait_cachep)
2293 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295 /* Size and allocate the main established and bind bucket 2054 /* Size and allocate the main established and bind bucket
2296 * hash tables. 2055 * hash tables.
2297 * 2056 *
2298 * The methodology is similar to that of the buffer cache. 2057 * The methodology is similar to that of the buffer cache.
2299 */ 2058 */
2300 tcp_ehash = (struct tcp_ehash_bucket *) 2059 tcp_hashinfo.ehash =
2301 alloc_large_system_hash("TCP established", 2060 alloc_large_system_hash("TCP established",
2302 sizeof(struct tcp_ehash_bucket), 2061 sizeof(struct inet_ehash_bucket),
2303 thash_entries, 2062 thash_entries,
2304 (num_physpages >= 128 * 1024) ? 2063 (num_physpages >= 128 * 1024) ?
2305 (25 - PAGE_SHIFT) : 2064 (25 - PAGE_SHIFT) :
2306 (27 - PAGE_SHIFT), 2065 (27 - PAGE_SHIFT),
2307 HASH_HIGHMEM, 2066 HASH_HIGHMEM,
2308 &tcp_ehash_size, 2067 &tcp_hashinfo.ehash_size,
2309 NULL, 2068 NULL,
2310 0); 2069 0);
2311 tcp_ehash_size = (1 << tcp_ehash_size) >> 1; 2070 tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2312 for (i = 0; i < (tcp_ehash_size << 1); i++) { 2071 for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2313 rwlock_init(&tcp_ehash[i].lock); 2072 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2314 INIT_HLIST_HEAD(&tcp_ehash[i].chain); 2073 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2315 } 2074 }
2316 2075
2317 tcp_bhash = (struct tcp_bind_hashbucket *) 2076 tcp_hashinfo.bhash =
2318 alloc_large_system_hash("TCP bind", 2077 alloc_large_system_hash("TCP bind",
2319 sizeof(struct tcp_bind_hashbucket), 2078 sizeof(struct inet_bind_hashbucket),
2320 tcp_ehash_size, 2079 tcp_hashinfo.ehash_size,
2321 (num_physpages >= 128 * 1024) ? 2080 (num_physpages >= 128 * 1024) ?
2322 (25 - PAGE_SHIFT) : 2081 (25 - PAGE_SHIFT) :
2323 (27 - PAGE_SHIFT), 2082 (27 - PAGE_SHIFT),
2324 HASH_HIGHMEM, 2083 HASH_HIGHMEM,
2325 &tcp_bhash_size, 2084 &tcp_hashinfo.bhash_size,
2326 NULL, 2085 NULL,
2327 64 * 1024); 2086 64 * 1024);
2328 tcp_bhash_size = 1 << tcp_bhash_size; 2087 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2329 for (i = 0; i < tcp_bhash_size; i++) { 2088 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2330 spin_lock_init(&tcp_bhash[i].lock); 2089 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2331 INIT_HLIST_HEAD(&tcp_bhash[i].chain); 2090 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2332 } 2091 }
2333 2092
2334 /* Try to be a bit smarter and adjust defaults depending 2093 /* Try to be a bit smarter and adjust defaults depending
2335 * on available memory. 2094 * on available memory.
2336 */ 2095 */
2337 for (order = 0; ((1 << order) << PAGE_SHIFT) < 2096 for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); 2097 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2339 order++) 2098 order++)
2340 ; 2099 ;
2341 if (order >= 4) { 2100 if (order >= 4) {
2342 sysctl_local_port_range[0] = 32768; 2101 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000; 2102 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000; 2103 tcp_death_row.sysctl_max_tw_buckets = 180000;
2345 sysctl_tcp_max_orphans = 4096 << (order - 4); 2104 sysctl_tcp_max_orphans = 4096 << (order - 4);
2346 sysctl_max_syn_backlog = 1024; 2105 sysctl_max_syn_backlog = 1024;
2347 } else if (order < 3) { 2106 } else if (order < 3) {
2348 sysctl_local_port_range[0] = 1024 * (3 - order); 2107 sysctl_local_port_range[0] = 1024 * (3 - order);
2349 sysctl_tcp_max_tw_buckets >>= (3 - order); 2108 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2350 sysctl_tcp_max_orphans >>= (3 - order); 2109 sysctl_tcp_max_orphans >>= (3 - order);
2351 sysctl_max_syn_backlog = 128; 2110 sysctl_max_syn_backlog = 128;
2352 } 2111 }
2353 tcp_port_rover = sysctl_local_port_range[0] - 1; 2112 tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
2354 2113
2355 sysctl_tcp_mem[0] = 768 << order; 2114 sysctl_tcp_mem[0] = 768 << order;
2356 sysctl_tcp_mem[1] = 1024 << order; 2115 sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,14 +2124,12 @@ void __init tcp_init(void)
2365 2124
2366 printk(KERN_INFO "TCP: Hash tables configured " 2125 printk(KERN_INFO "TCP: Hash tables configured "
2367 "(established %d bind %d)\n", 2126 "(established %d bind %d)\n",
2368 tcp_ehash_size << 1, tcp_bhash_size); 2127 tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2369 2128
2370 tcp_register_congestion_control(&tcp_reno); 2129 tcp_register_congestion_control(&tcp_reno);
2371} 2130}
2372 2131
2373EXPORT_SYMBOL(tcp_accept);
2374EXPORT_SYMBOL(tcp_close); 2132EXPORT_SYMBOL(tcp_close);
2375EXPORT_SYMBOL(tcp_destroy_sock);
2376EXPORT_SYMBOL(tcp_disconnect); 2133EXPORT_SYMBOL(tcp_disconnect);
2377EXPORT_SYMBOL(tcp_getsockopt); 2134EXPORT_SYMBOL(tcp_getsockopt);
2378EXPORT_SYMBOL(tcp_ioctl); 2135EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage);
2384EXPORT_SYMBOL(tcp_setsockopt); 2141EXPORT_SYMBOL(tcp_setsockopt);
2385EXPORT_SYMBOL(tcp_shutdown); 2142EXPORT_SYMBOL(tcp_shutdown);
2386EXPORT_SYMBOL(tcp_statistics); 2143EXPORT_SYMBOL(tcp_statistics);
2387EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d6649..b940346de4e7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT; 86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87} 87}
88 88
89static void bictcp_init(struct tcp_sock *tp) 89static void bictcp_init(struct sock *sk)
90{ 90{
91 bictcp_reset(tcp_ca(tp)); 91 bictcp_reset(inet_csk_ca(sk));
92 if (initial_ssthresh) 92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh; 93 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
94} 94}
95 95
96/* 96/*
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
156 156
157 157
158/* Detect low utilization in congestion avoidance */ 158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) 159static inline void bictcp_low_utilization(struct sock *sk, int flag)
160{ 160{
161 struct bictcp *ca = tcp_ca(tp); 161 const struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
162 u32 dist, delay; 163 u32 dist, delay;
163 164
164 /* No time stamp */ 165 /* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
208 209
209} 210}
210 211
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, 212static void bictcp_cong_avoid(struct sock *sk, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked) 213 u32 seq_rtt, u32 in_flight, int data_acked)
213{ 214{
214 struct bictcp *ca = tcp_ca(tp); 215 struct tcp_sock *tp = tcp_sk(sk);
216 struct bictcp *ca = inet_csk_ca(sk);
215 217
216 bictcp_low_utilization(tp, data_acked); 218 bictcp_low_utilization(sk, data_acked);
217 219
218 if (in_flight < tp->snd_cwnd) 220 if (in_flight < tp->snd_cwnd)
219 return; 221 return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
242 * behave like Reno until low_window is reached, 244 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly 245 * then increase congestion window slowly
244 */ 246 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) 247static u32 bictcp_recalc_ssthresh(struct sock *sk)
246{ 248{
247 struct bictcp *ca = tcp_ca(tp); 249 const struct tcp_sock *tp = tcp_sk(sk);
250 struct bictcp *ca = inet_csk_ca(sk);
248 251
249 ca->epoch_start = 0; /* end of epoch */ 252 ca->epoch_start = 0; /* end of epoch */
250 253
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 272 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270} 273}
271 274
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp) 275static u32 bictcp_undo_cwnd(struct sock *sk)
273{ 276{
274 struct bictcp *ca = tcp_ca(tp); 277 const struct tcp_sock *tp = tcp_sk(sk);
275 278 const struct bictcp *ca = inet_csk_ca(sk);
276 return max(tp->snd_cwnd, ca->last_max_cwnd); 279 return max(tp->snd_cwnd, ca->last_max_cwnd);
277} 280}
278 281
279static u32 bictcp_min_cwnd(struct tcp_sock *tp) 282static u32 bictcp_min_cwnd(struct sock *sk)
280{ 283{
284 const struct tcp_sock *tp = tcp_sk(sk);
281 return tp->snd_ssthresh; 285 return tp->snd_ssthresh;
282} 286}
283 287
284static void bictcp_state(struct tcp_sock *tp, u8 new_state) 288static void bictcp_state(struct sock *sk, u8 new_state)
285{ 289{
286 if (new_state == TCP_CA_Loss) 290 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp)); 291 bictcp_reset(inet_csk_ca(sk));
288} 292}
289 293
290/* Track delayed acknowledgement ratio using sliding window 294/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16 295 * ratio = (15*ratio + sample) / 16
292 */ 296 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt) 297static void bictcp_acked(struct sock *sk, u32 cnt)
294{ 298{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) { 299 const struct inet_connection_sock *icsk = inet_csk(sk);
296 struct bictcp *ca = tcp_ca(tp); 300
301 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
302 struct bictcp *ca = inet_csk_ca(sk);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 303 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt; 304 ca->delayed_ack += cnt;
299 } 305 }
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
314 320
315static int __init bictcp_register(void) 321static int __init bictcp_register(void)
316{ 322{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); 323 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp); 324 return tcp_register_congestion_control(&bictcp);
319} 325}
320 326
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a7785..bbf2d6624e89 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74 74
75/* Assign choice of congestion control. */ 75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp) 76void tcp_init_congestion_control(struct sock *sk)
77{ 77{
78 struct inet_connection_sock *icsk = inet_csk(sk);
78 struct tcp_congestion_ops *ca; 79 struct tcp_congestion_ops *ca;
79 80
80 if (tp->ca_ops != &tcp_init_congestion_ops) 81 if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
81 return; 82 return;
82 83
83 rcu_read_lock(); 84 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 85 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) { 86 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca; 87 icsk->icsk_ca_ops = ca;
87 break; 88 break;
88 } 89 }
89 90
90 } 91 }
91 rcu_read_unlock(); 92 rcu_read_unlock();
92 93
93 if (tp->ca_ops->init) 94 if (icsk->icsk_ca_ops->init)
94 tp->ca_ops->init(tp); 95 icsk->icsk_ca_ops->init(sk);
95} 96}
96 97
97/* Manage refcounts on socket close. */ 98/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp) 99void tcp_cleanup_congestion_control(struct sock *sk)
99{ 100{
100 if (tp->ca_ops->release) 101 struct inet_connection_sock *icsk = inet_csk(sk);
101 tp->ca_ops->release(tp); 102
102 module_put(tp->ca_ops->owner); 103 if (icsk->icsk_ca_ops->release)
104 icsk->icsk_ca_ops->release(sk);
105 module_put(icsk->icsk_ca_ops->owner);
103} 106}
104 107
105/* Used by sysctl to change default congestion control */ 108/* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
143} 146}
144 147
145/* Change congestion control for socket */ 148/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) 149int tcp_set_congestion_control(struct sock *sk, const char *name)
147{ 150{
151 struct inet_connection_sock *icsk = inet_csk(sk);
148 struct tcp_congestion_ops *ca; 152 struct tcp_congestion_ops *ca;
149 int err = 0; 153 int err = 0;
150 154
151 rcu_read_lock(); 155 rcu_read_lock();
152 ca = tcp_ca_find(name); 156 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops) 157 if (ca == icsk->icsk_ca_ops)
154 goto out; 158 goto out;
155 159
156 if (!ca) 160 if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
160 err = -EBUSY; 164 err = -EBUSY;
161 165
162 else { 166 else {
163 tcp_cleanup_congestion_control(tp); 167 tcp_cleanup_congestion_control(sk);
164 tp->ca_ops = ca; 168 icsk->icsk_ca_ops = ca;
165 if (tp->ca_ops->init) 169 if (icsk->icsk_ca_ops->init)
166 tp->ca_ops->init(tp); 170 icsk->icsk_ca_ops->init(sk);
167 } 171 }
168 out: 172 out:
169 rcu_read_unlock(); 173 rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
177/* This is Jacobson's slow start and congestion avoidance. 181/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328. 182 * SIGCOMM '88, p. 328.
179 */ 183 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, 184void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
181 int flag) 185 int flag)
182{ 186{
187 struct tcp_sock *tp = tcp_sk(sk);
188
183 if (in_flight < tp->snd_cwnd) 189 if (in_flight < tp->snd_cwnd)
184 return; 190 return;
185 191
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 208EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203 209
204/* Slow start threshold is half the congestion window (min 2) */ 210/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp) 211u32 tcp_reno_ssthresh(struct sock *sk)
206{ 212{
213 const struct tcp_sock *tp = tcp_sk(sk);
207 return max(tp->snd_cwnd >> 1U, 2U); 214 return max(tp->snd_cwnd >> 1U, 2U);
208} 215}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 216EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210 217
211/* Lower bound on congestion window. */ 218/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp) 219u32 tcp_reno_min_cwnd(struct sock *sk)
213{ 220{
221 const struct tcp_sock *tp = tcp_sk(sk);
214 return tp->snd_ssthresh/2; 222 return tp->snd_ssthresh/2;
215} 223}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); 224EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158f..c148c1081880 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * tcp_diag.c Module for monitoring TCP sockets. 2 * tcp_diag.c Module for monitoring TCP transport protocols sockets.
3 * 3 *
4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ 4 * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
5 * 5 *
@@ -12,779 +12,43 @@
12 */ 12 */
13 13
14#include <linux/config.h> 14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/fcntl.h>
18#include <linux/random.h>
19#include <linux/cache.h>
20#include <linux/init.h>
21#include <linux/time.h>
22
23#include <net/icmp.h>
24#include <net/tcp.h>
25#include <net/ipv6.h>
26#include <net/inet_common.h>
27
28#include <linux/inet.h>
29#include <linux/stddef.h>
30
31#include <linux/tcp_diag.h>
32 15
33struct tcpdiag_entry 16#include <linux/module.h>
34{ 17#include <linux/inet_diag.h>
35 u32 *saddr;
36 u32 *daddr;
37 u16 sport;
38 u16 dport;
39 u16 family;
40 u16 userlocks;
41};
42 18
43static struct sock *tcpnl; 19#include <linux/tcp.h>
44 20
45#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 21#include <net/tcp.h>
46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
47 22
48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 23static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
49 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 24 void *_info)
50{ 25{
51 struct inet_sock *inet = inet_sk(sk); 26 const struct tcp_sock *tp = tcp_sk(sk);
52 struct tcp_sock *tp = tcp_sk(sk); 27 struct tcp_info *info = _info;
53 struct tcpdiagmsg *r;
54 struct nlmsghdr *nlh;
55 struct tcp_info *info = NULL;
56 struct tcpdiag_meminfo *minfo = NULL;
57 unsigned char *b = skb->tail;
58
59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
60 nlh->nlmsg_flags = nlmsg_flags;
61 r = NLMSG_DATA(nlh);
62 if (sk->sk_state != TCP_TIME_WAIT) {
63 if (ext & (1<<(TCPDIAG_MEMINFO-1)))
64 minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
65 if (ext & (1<<(TCPDIAG_INFO-1)))
66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
67
68 if (ext & (1<<(TCPDIAG_CONG-1))) {
69 size_t len = strlen(tp->ca_ops->name);
70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
73 }
74 r->tcpdiag_family = sk->sk_family;
75 r->tcpdiag_state = sk->sk_state;
76 r->tcpdiag_timer = 0;
77 r->tcpdiag_retrans = 0;
78
79 r->id.tcpdiag_if = sk->sk_bound_dev_if;
80 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
81 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
82
83 if (r->tcpdiag_state == TCP_TIME_WAIT) {
84 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
85 long tmo = tw->tw_ttd - jiffies;
86 if (tmo < 0)
87 tmo = 0;
88
89 r->id.tcpdiag_sport = tw->tw_sport;
90 r->id.tcpdiag_dport = tw->tw_dport;
91 r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
92 r->id.tcpdiag_dst[0] = tw->tw_daddr;
93 r->tcpdiag_state = tw->tw_substate;
94 r->tcpdiag_timer = 3;
95 r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
96 r->tcpdiag_rqueue = 0;
97 r->tcpdiag_wqueue = 0;
98 r->tcpdiag_uid = 0;
99 r->tcpdiag_inode = 0;
100#ifdef CONFIG_IP_TCPDIAG_IPV6
101 if (r->tcpdiag_family == AF_INET6) {
102 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
103 &tw->tw_v6_rcv_saddr);
104 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
105 &tw->tw_v6_daddr);
106 }
107#endif
108 nlh->nlmsg_len = skb->tail - b;
109 return skb->len;
110 }
111
112 r->id.tcpdiag_sport = inet->sport;
113 r->id.tcpdiag_dport = inet->dport;
114 r->id.tcpdiag_src[0] = inet->rcv_saddr;
115 r->id.tcpdiag_dst[0] = inet->daddr;
116
117#ifdef CONFIG_IP_TCPDIAG_IPV6
118 if (r->tcpdiag_family == AF_INET6) {
119 struct ipv6_pinfo *np = inet6_sk(sk);
120
121 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
122 &np->rcv_saddr);
123 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
124 &np->daddr);
125 }
126#endif
127
128#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
129
130 if (tp->pending == TCP_TIME_RETRANS) {
131 r->tcpdiag_timer = 1;
132 r->tcpdiag_retrans = tp->retransmits;
133 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
134 } else if (tp->pending == TCP_TIME_PROBE0) {
135 r->tcpdiag_timer = 4;
136 r->tcpdiag_retrans = tp->probes_out;
137 r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
138 } else if (timer_pending(&sk->sk_timer)) {
139 r->tcpdiag_timer = 2;
140 r->tcpdiag_retrans = tp->probes_out;
141 r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
142 } else {
143 r->tcpdiag_timer = 0;
144 r->tcpdiag_expires = 0;
145 }
146#undef EXPIRES_IN_MS
147 28
148 r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; 29 r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
149 r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; 30 r->idiag_wqueue = tp->write_seq - tp->snd_una;
150 r->tcpdiag_uid = sock_i_uid(sk); 31 if (info != NULL)
151 r->tcpdiag_inode = sock_i_ino(sk);
152
153 if (minfo) {
154 minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
155 minfo->tcpdiag_wmem = sk->sk_wmem_queued;
156 minfo->tcpdiag_fmem = sk->sk_forward_alloc;
157 minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
158 }
159
160 if (info)
161 tcp_get_info(sk, info); 32 tcp_get_info(sk, info);
162
163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
164 tp->ca_ops->get_info(tp, ext, skb);
165
166 nlh->nlmsg_len = skb->tail - b;
167 return skb->len;
168
169rtattr_failure:
170nlmsg_failure:
171 skb_trim(skb, b - skb->data);
172 return -1;
173}
174
175extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
176 int dif);
177#ifdef CONFIG_IP_TCPDIAG_IPV6
178extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
179 struct in6_addr *daddr, u16 dport,
180 int dif);
181#else
182static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
183 struct in6_addr *daddr, u16 dport,
184 int dif)
185{
186 return NULL;
187}
188#endif
189
190static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
191{
192 int err;
193 struct sock *sk;
194 struct tcpdiagreq *req = NLMSG_DATA(nlh);
195 struct sk_buff *rep;
196
197 if (req->tcpdiag_family == AF_INET) {
198 sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
199 req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
200 req->id.tcpdiag_if);
201 }
202#ifdef CONFIG_IP_TCPDIAG_IPV6
203 else if (req->tcpdiag_family == AF_INET6) {
204 sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
205 (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
206 req->id.tcpdiag_if);
207 }
208#endif
209 else {
210 return -EINVAL;
211 }
212
213 if (sk == NULL)
214 return -ENOENT;
215
216 err = -ESTALE;
217 if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
218 req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
219 ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
220 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
221 goto out;
222
223 err = -ENOMEM;
224 rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
225 sizeof(struct tcpdiag_meminfo)+
226 sizeof(struct tcp_info)+64), GFP_KERNEL);
227 if (!rep)
228 goto out;
229
230 if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
231 NETLINK_CB(in_skb).pid,
232 nlh->nlmsg_seq, 0) <= 0)
233 BUG();
234
235 err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
236 if (err > 0)
237 err = 0;
238
239out:
240 if (sk) {
241 if (sk->sk_state == TCP_TIME_WAIT)
242 tcp_tw_put((struct tcp_tw_bucket*)sk);
243 else
244 sock_put(sk);
245 }
246 return err;
247}
248
249static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
250{
251 int words = bits >> 5;
252
253 bits &= 0x1f;
254
255 if (words) {
256 if (memcmp(a1, a2, words << 2))
257 return 0;
258 }
259 if (bits) {
260 __u32 w1, w2;
261 __u32 mask;
262
263 w1 = a1[words];
264 w2 = a2[words];
265
266 mask = htonl((0xffffffff) << (32 - bits));
267
268 if ((w1 ^ w2) & mask)
269 return 0;
270 }
271
272 return 1;
273}
274
275
276static int tcpdiag_bc_run(const void *bc, int len,
277 const struct tcpdiag_entry *entry)
278{
279 while (len > 0) {
280 int yes = 1;
281 const struct tcpdiag_bc_op *op = bc;
282
283 switch (op->code) {
284 case TCPDIAG_BC_NOP:
285 break;
286 case TCPDIAG_BC_JMP:
287 yes = 0;
288 break;
289 case TCPDIAG_BC_S_GE:
290 yes = entry->sport >= op[1].no;
291 break;
292 case TCPDIAG_BC_S_LE:
293 yes = entry->dport <= op[1].no;
294 break;
295 case TCPDIAG_BC_D_GE:
296 yes = entry->dport >= op[1].no;
297 break;
298 case TCPDIAG_BC_D_LE:
299 yes = entry->dport <= op[1].no;
300 break;
301 case TCPDIAG_BC_AUTO:
302 yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
303 break;
304 case TCPDIAG_BC_S_COND:
305 case TCPDIAG_BC_D_COND:
306 {
307 struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
308 u32 *addr;
309
310 if (cond->port != -1 &&
311 cond->port != (op->code == TCPDIAG_BC_S_COND ?
312 entry->sport : entry->dport)) {
313 yes = 0;
314 break;
315 }
316
317 if (cond->prefix_len == 0)
318 break;
319
320 if (op->code == TCPDIAG_BC_S_COND)
321 addr = entry->saddr;
322 else
323 addr = entry->daddr;
324
325 if (bitstring_match(addr, cond->addr, cond->prefix_len))
326 break;
327 if (entry->family == AF_INET6 &&
328 cond->family == AF_INET) {
329 if (addr[0] == 0 && addr[1] == 0 &&
330 addr[2] == htonl(0xffff) &&
331 bitstring_match(addr+3, cond->addr, cond->prefix_len))
332 break;
333 }
334 yes = 0;
335 break;
336 }
337 }
338
339 if (yes) {
340 len -= op->yes;
341 bc += op->yes;
342 } else {
343 len -= op->no;
344 bc += op->no;
345 }
346 }
347 return (len == 0);
348}
349
350static int valid_cc(const void *bc, int len, int cc)
351{
352 while (len >= 0) {
353 const struct tcpdiag_bc_op *op = bc;
354
355 if (cc > len)
356 return 0;
357 if (cc == len)
358 return 1;
359 if (op->yes < 4)
360 return 0;
361 len -= op->yes;
362 bc += op->yes;
363 }
364 return 0;
365}
366
367static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
368{
369 const unsigned char *bc = bytecode;
370 int len = bytecode_len;
371
372 while (len > 0) {
373 struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
374
375//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
376 switch (op->code) {
377 case TCPDIAG_BC_AUTO:
378 case TCPDIAG_BC_S_COND:
379 case TCPDIAG_BC_D_COND:
380 case TCPDIAG_BC_S_GE:
381 case TCPDIAG_BC_S_LE:
382 case TCPDIAG_BC_D_GE:
383 case TCPDIAG_BC_D_LE:
384 if (op->yes < 4 || op->yes > len+4)
385 return -EINVAL;
386 case TCPDIAG_BC_JMP:
387 if (op->no < 4 || op->no > len+4)
388 return -EINVAL;
389 if (op->no < len &&
390 !valid_cc(bytecode, bytecode_len, len-op->no))
391 return -EINVAL;
392 break;
393 case TCPDIAG_BC_NOP:
394 if (op->yes < 4 || op->yes > len+4)
395 return -EINVAL;
396 break;
397 default:
398 return -EINVAL;
399 }
400 bc += op->yes;
401 len -= op->yes;
402 }
403 return len == 0 ? 0 : -EINVAL;
404}
405
406static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
407 struct netlink_callback *cb)
408{
409 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
410
411 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
412 struct tcpdiag_entry entry;
413 struct rtattr *bc = (struct rtattr *)(r + 1);
414 struct inet_sock *inet = inet_sk(sk);
415
416 entry.family = sk->sk_family;
417#ifdef CONFIG_IP_TCPDIAG_IPV6
418 if (entry.family == AF_INET6) {
419 struct ipv6_pinfo *np = inet6_sk(sk);
420
421 entry.saddr = np->rcv_saddr.s6_addr32;
422 entry.daddr = np->daddr.s6_addr32;
423 } else
424#endif
425 {
426 entry.saddr = &inet->rcv_saddr;
427 entry.daddr = &inet->daddr;
428 }
429 entry.sport = inet->num;
430 entry.dport = ntohs(inet->dport);
431 entry.userlocks = sk->sk_userlocks;
432
433 if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
434 return 0;
435 }
436
437 return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
438 cb->nlh->nlmsg_seq, NLM_F_MULTI);
439} 33}
440 34
441static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, 35static struct inet_diag_handler tcp_diag_handler = {
442 struct request_sock *req, 36 .idiag_hashinfo = &tcp_hashinfo,
443 u32 pid, u32 seq) 37 .idiag_get_info = tcp_diag_get_info,
444{ 38 .idiag_type = TCPDIAG_GETSOCK,
445 const struct inet_request_sock *ireq = inet_rsk(req); 39 .idiag_info_size = sizeof(struct tcp_info),
446 struct inet_sock *inet = inet_sk(sk); 40};
447 unsigned char *b = skb->tail;
448 struct tcpdiagmsg *r;
449 struct nlmsghdr *nlh;
450 long tmo;
451
452 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
453 nlh->nlmsg_flags = NLM_F_MULTI;
454 r = NLMSG_DATA(nlh);
455
456 r->tcpdiag_family = sk->sk_family;
457 r->tcpdiag_state = TCP_SYN_RECV;
458 r->tcpdiag_timer = 1;
459 r->tcpdiag_retrans = req->retrans;
460
461 r->id.tcpdiag_if = sk->sk_bound_dev_if;
462 r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
463 r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
464
465 tmo = req->expires - jiffies;
466 if (tmo < 0)
467 tmo = 0;
468
469 r->id.tcpdiag_sport = inet->sport;
470 r->id.tcpdiag_dport = ireq->rmt_port;
471 r->id.tcpdiag_src[0] = ireq->loc_addr;
472 r->id.tcpdiag_dst[0] = ireq->rmt_addr;
473 r->tcpdiag_expires = jiffies_to_msecs(tmo),
474 r->tcpdiag_rqueue = 0;
475 r->tcpdiag_wqueue = 0;
476 r->tcpdiag_uid = sock_i_uid(sk);
477 r->tcpdiag_inode = 0;
478#ifdef CONFIG_IP_TCPDIAG_IPV6
479 if (r->tcpdiag_family == AF_INET6) {
480 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
481 &tcp6_rsk(req)->loc_addr);
482 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
483 &tcp6_rsk(req)->rmt_addr);
484 }
485#endif
486 nlh->nlmsg_len = skb->tail - b;
487
488 return skb->len;
489
490nlmsg_failure:
491 skb_trim(skb, b - skb->data);
492 return -1;
493}
494
495static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
496 struct netlink_callback *cb)
497{
498 struct tcpdiag_entry entry;
499 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
500 struct tcp_sock *tp = tcp_sk(sk);
501 struct listen_sock *lopt;
502 struct rtattr *bc = NULL;
503 struct inet_sock *inet = inet_sk(sk);
504 int j, s_j;
505 int reqnum, s_reqnum;
506 int err = 0;
507
508 s_j = cb->args[3];
509 s_reqnum = cb->args[4];
510
511 if (s_j > 0)
512 s_j--;
513
514 entry.family = sk->sk_family;
515
516 read_lock_bh(&tp->accept_queue.syn_wait_lock);
517
518 lopt = tp->accept_queue.listen_opt;
519 if (!lopt || !lopt->qlen)
520 goto out;
521
522 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
523 bc = (struct rtattr *)(r + 1);
524 entry.sport = inet->num;
525 entry.userlocks = sk->sk_userlocks;
526 }
527
528 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
529 struct request_sock *req, *head = lopt->syn_table[j];
530
531 reqnum = 0;
532 for (req = head; req; reqnum++, req = req->dl_next) {
533 struct inet_request_sock *ireq = inet_rsk(req);
534
535 if (reqnum < s_reqnum)
536 continue;
537 if (r->id.tcpdiag_dport != ireq->rmt_port &&
538 r->id.tcpdiag_dport)
539 continue;
540
541 if (bc) {
542 entry.saddr =
543#ifdef CONFIG_IP_TCPDIAG_IPV6
544 (entry.family == AF_INET6) ?
545 tcp6_rsk(req)->loc_addr.s6_addr32 :
546#endif
547 &ireq->loc_addr;
548 entry.daddr =
549#ifdef CONFIG_IP_TCPDIAG_IPV6
550 (entry.family == AF_INET6) ?
551 tcp6_rsk(req)->rmt_addr.s6_addr32 :
552#endif
553 &ireq->rmt_addr;
554 entry.dport = ntohs(ireq->rmt_port);
555
556 if (!tcpdiag_bc_run(RTA_DATA(bc),
557 RTA_PAYLOAD(bc), &entry))
558 continue;
559 }
560
561 err = tcpdiag_fill_req(skb, sk, req,
562 NETLINK_CB(cb->skb).pid,
563 cb->nlh->nlmsg_seq);
564 if (err < 0) {
565 cb->args[3] = j + 1;
566 cb->args[4] = reqnum;
567 goto out;
568 }
569 }
570
571 s_reqnum = 0;
572 }
573
574out:
575 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
576
577 return err;
578}
579
580static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
581{
582 int i, num;
583 int s_i, s_num;
584 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
585
586 s_i = cb->args[1];
587 s_num = num = cb->args[2];
588
589 if (cb->args[0] == 0) {
590 if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
591 goto skip_listen_ht;
592 tcp_listen_lock();
593 for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
594 struct sock *sk;
595 struct hlist_node *node;
596
597 num = 0;
598 sk_for_each(sk, node, &tcp_listening_hash[i]) {
599 struct inet_sock *inet = inet_sk(sk);
600
601 if (num < s_num) {
602 num++;
603 continue;
604 }
605
606 if (r->id.tcpdiag_sport != inet->sport &&
607 r->id.tcpdiag_sport)
608 goto next_listen;
609
610 if (!(r->tcpdiag_states&TCPF_LISTEN) ||
611 r->id.tcpdiag_dport ||
612 cb->args[3] > 0)
613 goto syn_recv;
614
615 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
616 tcp_listen_unlock();
617 goto done;
618 }
619
620syn_recv:
621 if (!(r->tcpdiag_states&TCPF_SYN_RECV))
622 goto next_listen;
623
624 if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
625 tcp_listen_unlock();
626 goto done;
627 }
628
629next_listen:
630 cb->args[3] = 0;
631 cb->args[4] = 0;
632 ++num;
633 }
634
635 s_num = 0;
636 cb->args[3] = 0;
637 cb->args[4] = 0;
638 }
639 tcp_listen_unlock();
640skip_listen_ht:
641 cb->args[0] = 1;
642 s_i = num = s_num = 0;
643 }
644
645 if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
646 return skb->len;
647
648 for (i = s_i; i < tcp_ehash_size; i++) {
649 struct tcp_ehash_bucket *head = &tcp_ehash[i];
650 struct sock *sk;
651 struct hlist_node *node;
652
653 if (i > s_i)
654 s_num = 0;
655
656 read_lock_bh(&head->lock);
657
658 num = 0;
659 sk_for_each(sk, node, &head->chain) {
660 struct inet_sock *inet = inet_sk(sk);
661
662 if (num < s_num)
663 goto next_normal;
664 if (!(r->tcpdiag_states & (1 << sk->sk_state)))
665 goto next_normal;
666 if (r->id.tcpdiag_sport != inet->sport &&
667 r->id.tcpdiag_sport)
668 goto next_normal;
669 if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
670 goto next_normal;
671 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
672 read_unlock_bh(&head->lock);
673 goto done;
674 }
675next_normal:
676 ++num;
677 }
678
679 if (r->tcpdiag_states&TCPF_TIME_WAIT) {
680 sk_for_each(sk, node,
681 &tcp_ehash[i + tcp_ehash_size].chain) {
682 struct inet_sock *inet = inet_sk(sk);
683
684 if (num < s_num)
685 goto next_dying;
686 if (r->id.tcpdiag_sport != inet->sport &&
687 r->id.tcpdiag_sport)
688 goto next_dying;
689 if (r->id.tcpdiag_dport != inet->dport &&
690 r->id.tcpdiag_dport)
691 goto next_dying;
692 if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
693 read_unlock_bh(&head->lock);
694 goto done;
695 }
696next_dying:
697 ++num;
698 }
699 }
700 read_unlock_bh(&head->lock);
701 }
702
703done:
704 cb->args[1] = i;
705 cb->args[2] = num;
706 return skb->len;
707}
708
709static int tcpdiag_dump_done(struct netlink_callback *cb)
710{
711 return 0;
712}
713
714
715static __inline__ int
716tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
717{
718 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
719 return 0;
720
721 if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
722 goto err_inval;
723
724 if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
725 goto err_inval;
726
727 if (nlh->nlmsg_flags&NLM_F_DUMP) {
728 if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
729 struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
730 if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
731 rta->rta_len < 8 ||
732 rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
733 goto err_inval;
734 if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
735 goto err_inval;
736 }
737 return netlink_dump_start(tcpnl, skb, nlh,
738 tcpdiag_dump,
739 tcpdiag_dump_done);
740 } else {
741 return tcpdiag_get_exact(skb, nlh);
742 }
743
744err_inval:
745 return -EINVAL;
746}
747
748
749static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
750{
751 int err;
752 struct nlmsghdr * nlh;
753
754 if (skb->len >= NLMSG_SPACE(0)) {
755 nlh = (struct nlmsghdr *)skb->data;
756 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
757 return;
758 err = tcpdiag_rcv_msg(skb, nlh);
759 if (err || nlh->nlmsg_flags & NLM_F_ACK)
760 netlink_ack(skb, nlh, err);
761 }
762}
763
764static void tcpdiag_rcv(struct sock *sk, int len)
765{
766 struct sk_buff *skb;
767 unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
768
769 while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
770 tcpdiag_rcv_skb(skb);
771 kfree_skb(skb);
772 }
773}
774 41
775static int __init tcpdiag_init(void) 42static int __init tcp_diag_init(void)
776{ 43{
777 tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); 44 return inet_diag_register(&tcp_diag_handler);
778 if (tcpnl == NULL)
779 return -ENOMEM;
780 return 0;
781} 45}
782 46
783static void __exit tcpdiag_exit(void) 47static void __exit tcp_diag_exit(void)
784{ 48{
785 sock_release(tcpnl->sk_socket); 49 inet_diag_unregister(&tcp_diag_handler);
786} 50}
787 51
788module_init(tcpdiag_init); 52module_init(tcp_diag_init);
789module_exit(tcpdiag_exit); 53module_exit(tcp_diag_exit);
790MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136bf..6acc04bde080 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
98 u32 ai; 98 u32 ai;
99}; 99};
100 100
101static void hstcp_init(struct tcp_sock *tp) 101static void hstcp_init(struct sock *sk)
102{ 102{
103 struct hstcp *ca = tcp_ca(tp); 103 struct tcp_sock *tp = tcp_sk(sk);
104 struct hstcp *ca = inet_csk_ca(sk);
104 105
105 ca->ai = 0; 106 ca->ai = 0;
106 107
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 110 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110} 111}
111 112
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, 113static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
113 u32 in_flight, int good) 114 u32 in_flight, int good)
114{ 115{
115 struct hstcp *ca = tcp_ca(tp); 116 struct tcp_sock *tp = tcp_sk(sk);
117 struct hstcp *ca = inet_csk_ca(sk);
116 118
117 if (in_flight < tp->snd_cwnd) 119 if (in_flight < tp->snd_cwnd)
118 return; 120 return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
143 } 145 }
144} 146}
145 147
146static u32 hstcp_ssthresh(struct tcp_sock *tp) 148static u32 hstcp_ssthresh(struct sock *sk)
147{ 149{
148 struct hstcp *ca = tcp_ca(tp); 150 const struct tcp_sock *tp = tcp_sk(sk);
151 const struct hstcp *ca = inet_csk_ca(sk);
149 152
150 /* Do multiplicative decrease */ 153 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); 154 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
164 167
165static int __init hstcp_register(void) 168static int __init hstcp_register(void)
166{ 169{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); 170 BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed); 171 return tcp_register_congestion_control(&tcp_highspeed);
169} 172}
170 173
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf9..e47b37984e95 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
55 ca->snd_cwnd_cnt2 = 0; 55 ca->snd_cwnd_cnt2 = 0;
56} 56}
57 57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp) 58static u32 htcp_cwnd_undo(struct sock *sk)
59{ 59{
60 struct htcp *ca = tcp_ca(tp); 60 const struct tcp_sock *tp = tcp_sk(sk);
61 struct htcp *ca = inet_csk_ca(sk);
61 ca->ccount = ca->undo_ccount; 62 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT; 63 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB; 64 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); 65 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65} 66}
66 67
67static inline void measure_rtt(struct tcp_sock *tp) 68static inline void measure_rtt(struct sock *sk)
68{ 69{
69 struct htcp *ca = tcp_ca(tp); 70 const struct inet_connection_sock *icsk = inet_csk(sk);
71 const struct tcp_sock *tp = tcp_sk(sk);
72 struct htcp *ca = inet_csk_ca(sk);
70 u32 srtt = tp->srtt>>3; 73 u32 srtt = tp->srtt>>3;
71 74
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */ 75 /* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
74 ca->minRTT = srtt; 77 ca->minRTT = srtt;
75 78
76 /* max RTT */ 79 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { 80 if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT) 81 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT; 82 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) 83 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
82 } 85 }
83} 86}
84 87
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) 88static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
86{ 89{
87 struct htcp *ca = tcp_ca(tp); 90 const struct inet_connection_sock *icsk = inet_csk(sk);
91 const struct tcp_sock *tp = tcp_sk(sk);
92 struct htcp *ca = inet_csk_ca(sk);
88 u32 now = tcp_time_stamp; 93 u32 now = tcp_time_stamp;
89 94
90 /* achieved throughput calculations */ 95 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { 96 if (icsk->icsk_ca_state != TCP_CA_Open &&
97 icsk->icsk_ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0; 98 ca->packetcount = 0;
93 ca->lasttime = now; 99 ca->lasttime = now;
94 return; 100 return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
173 * that point do we really have a real sense of maxRTT (the queues en route 179 * that point do we really have a real sense of maxRTT (the queues en route
174 * were getting just too full now). 180 * were getting just too full now).
175 */ 181 */
176static void htcp_param_update(struct tcp_sock *tp) 182static void htcp_param_update(struct sock *sk)
177{ 183{
178 struct htcp *ca = tcp_ca(tp); 184 struct htcp *ca = inet_csk_ca(sk);
179 u32 minRTT = ca->minRTT; 185 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT; 186 u32 maxRTT = ca->maxRTT;
181 187
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; 193 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188} 194}
189 195
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) 196static u32 htcp_recalc_ssthresh(struct sock *sk)
191{ 197{
192 struct htcp *ca = tcp_ca(tp); 198 const struct tcp_sock *tp = tcp_sk(sk);
193 htcp_param_update(tp); 199 const struct htcp *ca = inet_csk_ca(sk);
200 htcp_param_update(sk);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U); 201 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195} 202}
196 203
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 204static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked) 205 u32 in_flight, int data_acked)
199{ 206{
200 struct htcp *ca = tcp_ca(tp); 207 struct tcp_sock *tp = tcp_sk(sk);
208 struct htcp *ca = inet_csk_ca(sk);
201 209
202 if (in_flight < tp->snd_cwnd) 210 if (in_flight < tp->snd_cwnd)
203 return; 211 return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 215 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++; 216 tp->snd_cwnd++;
209 } else { 217 } else {
210 measure_rtt(tp); 218 measure_rtt(sk);
211 219
212 /* keep track of number of round-trip times since last backoff event */ 220 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { 221 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
229} 237}
230 238
231/* Lower bound on congestion window. */ 239/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp) 240static u32 htcp_min_cwnd(struct sock *sk)
233{ 241{
242 const struct tcp_sock *tp = tcp_sk(sk);
234 return tp->snd_ssthresh; 243 return tp->snd_ssthresh;
235} 244}
236 245
237 246
238static void htcp_init(struct tcp_sock *tp) 247static void htcp_init(struct sock *sk)
239{ 248{
240 struct htcp *ca = tcp_ca(tp); 249 struct htcp *ca = inet_csk_ca(sk);
241 250
242 memset(ca, 0, sizeof(struct htcp)); 251 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE; 252 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN; 253 ca->beta = BETA_MIN;
245} 254}
246 255
247static void htcp_state(struct tcp_sock *tp, u8 new_state) 256static void htcp_state(struct sock *sk, u8 new_state)
248{ 257{
249 switch (new_state) { 258 switch (new_state) {
250 case TCP_CA_CWR: 259 case TCP_CA_CWR:
251 case TCP_CA_Recovery: 260 case TCP_CA_Recovery:
252 case TCP_CA_Loss: 261 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp)); 262 htcp_reset(inet_csk_ca(sk));
254 break; 263 break;
255 } 264 }
256} 265}
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
269 278
270static int __init htcp_register(void) 279static int __init htcp_register(void)
271{ 280{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); 281 BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX); 282 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch) 283 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL; 284 htcp.pkts_acked = NULL;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c304..77add63623df 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
33 33
34 34
35/* This is called to refresh values for hybla parameters */ 35/* This is called to refresh values for hybla parameters */
36static inline void hybla_recalc_param (struct tcp_sock *tp) 36static inline void hybla_recalc_param (struct sock *sk)
37{ 37{
38 struct hybla *ca = tcp_ca(tp); 38 struct hybla *ca = inet_csk_ca(sk);
39 39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); 40 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3; 41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7; 43 ca->rho2 = ca->rho2_7ls >>7;
44} 44}
45 45
46static void hybla_init(struct tcp_sock *tp) 46static void hybla_init(struct sock *sk)
47{ 47{
48 struct hybla *ca = tcp_ca(tp); 48 struct tcp_sock *tp = tcp_sk(sk);
49 struct hybla *ca = inet_csk_ca(sk);
49 50
50 ca->rho = 0; 51 ca->rho = 0;
51 ca->rho2 = 0; 52 ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
57 tp->snd_cwnd_clamp = 65535; 58 tp->snd_cwnd_clamp = 65535;
58 59
59 /* 1st Rho measurement based on initial srtt */ 60 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp); 61 hybla_recalc_param(sk);
61 62
62 /* set minimum rtt as this is the 1st ever seen */ 63 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt; 64 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho; 65 tp->snd_cwnd = ca->rho;
65} 66}
66 67
67static void hybla_state(struct tcp_sock *tp, u8 ca_state) 68static void hybla_state(struct sock *sk, u8 ca_state)
68{ 69{
69 struct hybla *ca = tcp_ca(tp); 70 struct hybla *ca = inet_csk_ca(sk);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open); 71 ca->hybla_en = (ca_state == TCP_CA_Open);
72} 72}
73 73
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
86 * o Give cwnd a new value based on the model proposed 86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1 87 * o remember increments <1
88 */ 88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 89static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
90 u32 in_flight, int flag) 90 u32 in_flight, int flag)
91{ 91{
92 struct hybla *ca = tcp_ca(tp); 92 struct tcp_sock *tp = tcp_sk(sk);
93 struct hybla *ca = inet_csk_ca(sk);
93 u32 increment, odd, rho_fractions; 94 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0; 95 int is_slowstart = 0;
95 96
96 /* Recalculate rho only if this srtt is the lowest */ 97 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){ 98 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp); 99 hybla_recalc_param(sk);
99 ca->minrtt = tp->srtt; 100 ca->minrtt = tp->srtt;
100 } 101 }
101 102
102 if (!ca->hybla_en) 103 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); 104 return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
104 105
105 if (in_flight < tp->snd_cwnd) 106 if (in_flight < tp->snd_cwnd)
106 return; 107 return;
107 108
108 if (ca->rho == 0) 109 if (ca->rho == 0)
109 hybla_recalc_param(tp); 110 hybla_recalc_param(sk);
110 111
111 rho_fractions = ca->rho_3ls - (ca->rho << 3); 112 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112 113
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
170 171
171static int __init hybla_register(void) 172static int __init hybla_register(void)
172{ 173{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); 174 BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla); 175 return tcp_register_congestion_control(&tcp_hybla);
175} 176}
176 177
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1e..1afb080bdf0c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
114/* Adapt the MSS value used to make delayed ack decision to the 114/* Adapt the MSS value used to make delayed ack decision to the
115 * real world. 115 * real world.
116 */ 116 */
117static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, 117static inline void tcp_measure_rcv_mss(struct sock *sk,
118 struct sk_buff *skb) 118 const struct sk_buff *skb)
119{ 119{
120 unsigned int len, lss; 120 struct inet_connection_sock *icsk = inet_csk(sk);
121 const unsigned int lss = icsk->icsk_ack.last_seg_size;
122 unsigned int len;
121 123
122 lss = tp->ack.last_seg_size; 124 icsk->icsk_ack.last_seg_size = 0;
123 tp->ack.last_seg_size = 0;
124 125
125 /* skb->len may jitter because of SACKs, even if peer 126 /* skb->len may jitter because of SACKs, even if peer
126 * sends good full-sized frames. 127 * sends good full-sized frames.
127 */ 128 */
128 len = skb->len; 129 len = skb->len;
129 if (len >= tp->ack.rcv_mss) { 130 if (len >= icsk->icsk_ack.rcv_mss) {
130 tp->ack.rcv_mss = len; 131 icsk->icsk_ack.rcv_mss = len;
131 } else { 132 } else {
132 /* Otherwise, we make more careful check taking into account, 133 /* Otherwise, we make more careful check taking into account,
133 * that SACKs block is variable. 134 * that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
147 * tcp header plus fixed timestamp option length. 148 * tcp header plus fixed timestamp option length.
148 * Resulting "len" is MSS free of SACK jitter. 149 * Resulting "len" is MSS free of SACK jitter.
149 */ 150 */
150 len -= tp->tcp_header_len; 151 len -= tcp_sk(sk)->tcp_header_len;
151 tp->ack.last_seg_size = len; 152 icsk->icsk_ack.last_seg_size = len;
152 if (len == lss) { 153 if (len == lss) {
153 tp->ack.rcv_mss = len; 154 icsk->icsk_ack.rcv_mss = len;
154 return; 155 return;
155 } 156 }
156 } 157 }
157 tp->ack.pending |= TCP_ACK_PUSHED; 158 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
158 } 159 }
159} 160}
160 161
161static void tcp_incr_quickack(struct tcp_sock *tp) 162static void tcp_incr_quickack(struct sock *sk)
162{ 163{
163 unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); 164 struct inet_connection_sock *icsk = inet_csk(sk);
165 unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
164 166
165 if (quickacks==0) 167 if (quickacks==0)
166 quickacks=2; 168 quickacks=2;
167 if (quickacks > tp->ack.quick) 169 if (quickacks > icsk->icsk_ack.quick)
168 tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 170 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
169} 171}
170 172
171void tcp_enter_quickack_mode(struct tcp_sock *tp) 173void tcp_enter_quickack_mode(struct sock *sk)
172{ 174{
173 tcp_incr_quickack(tp); 175 struct inet_connection_sock *icsk = inet_csk(sk);
174 tp->ack.pingpong = 0; 176 tcp_incr_quickack(sk);
175 tp->ack.ato = TCP_ATO_MIN; 177 icsk->icsk_ack.pingpong = 0;
178 icsk->icsk_ack.ato = TCP_ATO_MIN;
176} 179}
177 180
178/* Send ACKs quickly, if "quick" count is not exhausted 181/* Send ACKs quickly, if "quick" count is not exhausted
179 * and the session is not interactive. 182 * and the session is not interactive.
180 */ 183 */
181 184
182static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) 185static inline int tcp_in_quickack_mode(const struct sock *sk)
183{ 186{
184 return (tp->ack.quick && !tp->ack.pingpong); 187 const struct inet_connection_sock *icsk = inet_csk(sk);
188 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
185} 189}
186 190
187/* Buffer size and advertised window tuning. 191/* Buffer size and advertised window tuning.
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
224 */ 228 */
225 229
226/* Slow part of check#2. */ 230/* Slow part of check#2. */
227static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, 231static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
228 struct sk_buff *skb) 232 const struct sk_buff *skb)
229{ 233{
230 /* Optimize this! */ 234 /* Optimize this! */
231 int truesize = tcp_win_from_space(skb->truesize)/2; 235 int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
233 237
234 while (tp->rcv_ssthresh <= window) { 238 while (tp->rcv_ssthresh <= window) {
235 if (truesize <= skb->len) 239 if (truesize <= skb->len)
236 return 2*tp->ack.rcv_mss; 240 return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
237 241
238 truesize >>= 1; 242 truesize >>= 1;
239 window >>= 1; 243 window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
260 264
261 if (incr) { 265 if (incr) {
262 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); 266 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
263 tp->ack.quick |= 1; 267 inet_csk(sk)->icsk_ack.quick |= 1;
264 } 268 }
265 } 269 }
266} 270}
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
321/* 5. Recalculate window clamp after socket hit its memory bounds. */ 325/* 5. Recalculate window clamp after socket hit its memory bounds. */
322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 326static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
323{ 327{
328 struct inet_connection_sock *icsk = inet_csk(sk);
324 struct sk_buff *skb; 329 struct sk_buff *skb;
325 unsigned int app_win = tp->rcv_nxt - tp->copied_seq; 330 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
326 int ofo_win = 0; 331 int ofo_win = 0;
327 332
328 tp->ack.quick = 0; 333 icsk->icsk_ack.quick = 0;
329 334
330 skb_queue_walk(&tp->out_of_order_queue, skb) { 335 skb_queue_walk(&tp->out_of_order_queue, skb) {
331 ofo_win += skb->len; 336 ofo_win += skb->len;
@@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
346 app_win += ofo_win; 351 app_win += ofo_win;
347 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) 352 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
348 app_win >>= 1; 353 app_win >>= 1;
349 if (app_win > tp->ack.rcv_mss) 354 if (app_win > icsk->icsk_ack.rcv_mss)
350 app_win -= tp->ack.rcv_mss; 355 app_win -= icsk->icsk_ack.rcv_mss;
351 app_win = max(app_win, 2U*tp->advmss); 356 app_win = max(app_win, 2U*tp->advmss);
352 357
353 if (!ofo_win) 358 if (!ofo_win)
@@ -415,11 +420,12 @@ new_measure:
415 tp->rcv_rtt_est.time = tcp_time_stamp; 420 tp->rcv_rtt_est.time = tcp_time_stamp;
416} 421}
417 422
418static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) 423static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
419{ 424{
425 struct tcp_sock *tp = tcp_sk(sk);
420 if (tp->rx_opt.rcv_tsecr && 426 if (tp->rx_opt.rcv_tsecr &&
421 (TCP_SKB_CB(skb)->end_seq - 427 (TCP_SKB_CB(skb)->end_seq -
422 TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) 428 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
423 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); 429 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
424} 430}
425 431
@@ -492,41 +498,42 @@ new_measure:
492 */ 498 */
493static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) 499static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
494{ 500{
501 struct inet_connection_sock *icsk = inet_csk(sk);
495 u32 now; 502 u32 now;
496 503
497 tcp_schedule_ack(tp); 504 inet_csk_schedule_ack(sk);
498 505
499 tcp_measure_rcv_mss(tp, skb); 506 tcp_measure_rcv_mss(sk, skb);
500 507
501 tcp_rcv_rtt_measure(tp); 508 tcp_rcv_rtt_measure(tp);
502 509
503 now = tcp_time_stamp; 510 now = tcp_time_stamp;
504 511
505 if (!tp->ack.ato) { 512 if (!icsk->icsk_ack.ato) {
506 /* The _first_ data packet received, initialize 513 /* The _first_ data packet received, initialize
507 * delayed ACK engine. 514 * delayed ACK engine.
508 */ 515 */
509 tcp_incr_quickack(tp); 516 tcp_incr_quickack(sk);
510 tp->ack.ato = TCP_ATO_MIN; 517 icsk->icsk_ack.ato = TCP_ATO_MIN;
511 } else { 518 } else {
512 int m = now - tp->ack.lrcvtime; 519 int m = now - icsk->icsk_ack.lrcvtime;
513 520
514 if (m <= TCP_ATO_MIN/2) { 521 if (m <= TCP_ATO_MIN/2) {
515 /* The fastest case is the first. */ 522 /* The fastest case is the first. */
516 tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; 523 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
517 } else if (m < tp->ack.ato) { 524 } else if (m < icsk->icsk_ack.ato) {
518 tp->ack.ato = (tp->ack.ato>>1) + m; 525 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
519 if (tp->ack.ato > tp->rto) 526 if (icsk->icsk_ack.ato > icsk->icsk_rto)
520 tp->ack.ato = tp->rto; 527 icsk->icsk_ack.ato = icsk->icsk_rto;
521 } else if (m > tp->rto) { 528 } else if (m > icsk->icsk_rto) {
522 /* Too long gap. Apparently sender falled to 529 /* Too long gap. Apparently sender falled to
523 * restart window, so that we send ACKs quickly. 530 * restart window, so that we send ACKs quickly.
524 */ 531 */
525 tcp_incr_quickack(tp); 532 tcp_incr_quickack(sk);
526 sk_stream_mem_reclaim(sk); 533 sk_stream_mem_reclaim(sk);
527 } 534 }
528 } 535 }
529 tp->ack.lrcvtime = now; 536 icsk->icsk_ack.lrcvtime = now;
530 537
531 TCP_ECN_check_ce(tp, skb); 538 TCP_ECN_check_ce(tp, skb);
532 539
@@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
543 * To save cycles in the RFC 1323 implementation it was better to break 550 * To save cycles in the RFC 1323 implementation it was better to break
544 * it up into three procedures. -- erics 551 * it up into three procedures. -- erics
545 */ 552 */
546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) 553static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
547{ 554{
555 struct tcp_sock *tp = tcp_sk(sk);
556 const struct inet_connection_sock *icsk = inet_csk(sk);
548 long m = mrtt; /* RTT */ 557 long m = mrtt; /* RTT */
549 558
550 /* The following amusing code comes from Jacobson's 559 /* The following amusing code comes from Jacobson's
@@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
604 tp->rtt_seq = tp->snd_nxt; 613 tp->rtt_seq = tp->snd_nxt;
605 } 614 }
606 615
607 if (tp->ca_ops->rtt_sample) 616 if (icsk->icsk_ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt); 617 icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
609} 618}
610 619
611/* Calculate rto without backoff. This is the second half of Van Jacobson's 620/* Calculate rto without backoff. This is the second half of Van Jacobson's
612 * routine referred to above. 621 * routine referred to above.
613 */ 622 */
614static inline void tcp_set_rto(struct tcp_sock *tp) 623static inline void tcp_set_rto(struct sock *sk)
615{ 624{
625 const struct tcp_sock *tp = tcp_sk(sk);
616 /* Old crap is replaced with new one. 8) 626 /* Old crap is replaced with new one. 8)
617 * 627 *
618 * More seriously: 628 * More seriously:
@@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
623 * is invisible. Actually, Linux-2.4 also generates erratic 633 * is invisible. Actually, Linux-2.4 also generates erratic
624 * ACKs in some curcumstances. 634 * ACKs in some curcumstances.
625 */ 635 */
626 tp->rto = (tp->srtt >> 3) + tp->rttvar; 636 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
627 637
628 /* 2. Fixups made earlier cannot be right. 638 /* 2. Fixups made earlier cannot be right.
629 * If we do not estimate RTO correctly without them, 639 * If we do not estimate RTO correctly without them,
@@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
635/* NOTE: clamping at TCP_RTO_MIN is not required, current algo 645/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
636 * guarantees that rto is higher. 646 * guarantees that rto is higher.
637 */ 647 */
638static inline void tcp_bound_rto(struct tcp_sock *tp) 648static inline void tcp_bound_rto(struct sock *sk)
639{ 649{
640 if (tp->rto > TCP_RTO_MAX) 650 if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
641 tp->rto = TCP_RTO_MAX; 651 inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
642} 652}
643 653
644/* Save metrics learned by this TCP session. 654/* Save metrics learned by this TCP session.
@@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk)
656 dst_confirm(dst); 666 dst_confirm(dst);
657 667
658 if (dst && (dst->flags&DST_HOST)) { 668 if (dst && (dst->flags&DST_HOST)) {
669 const struct inet_connection_sock *icsk = inet_csk(sk);
659 int m; 670 int m;
660 671
661 if (tp->backoff || !tp->srtt) { 672 if (icsk->icsk_backoff || !tp->srtt) {
662 /* This session failed to estimate rtt. Why? 673 /* This session failed to estimate rtt. Why?
663 * Probably, no packets returned in time. 674 * Probably, no packets returned in time.
664 * Reset our results. 675 * Reset our results.
@@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk)
707 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 718 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
708 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; 719 dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
709 } else if (tp->snd_cwnd > tp->snd_ssthresh && 720 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
710 tp->ca_state == TCP_CA_Open) { 721 icsk->icsk_ca_state == TCP_CA_Open) {
711 /* Cong. avoidance phase, cwnd is reliable. */ 722 /* Cong. avoidance phase, cwnd is reliable. */
712 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 723 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
713 dst->metrics[RTAX_SSTHRESH-1] = 724 dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk)
801 tp->mdev = dst_metric(dst, RTAX_RTTVAR); 812 tp->mdev = dst_metric(dst, RTAX_RTTVAR);
802 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); 813 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
803 } 814 }
804 tcp_set_rto(tp); 815 tcp_set_rto(sk);
805 tcp_bound_rto(tp); 816 tcp_bound_rto(sk);
806 if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 817 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
807 goto reset; 818 goto reset;
808 tp->snd_cwnd = tcp_init_cwnd(tp, dst); 819 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
809 tp->snd_cwnd_stamp = tcp_time_stamp; 820 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +828,14 @@ reset:
817 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 828 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
818 tp->srtt = 0; 829 tp->srtt = 0;
819 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 830 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
820 tp->rto = TCP_TIMEOUT_INIT; 831 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
821 } 832 }
822} 833}
823 834
824static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) 835static void tcp_update_reordering(struct sock *sk, const int metric,
836 const int ts)
825{ 837{
838 struct tcp_sock *tp = tcp_sk(sk);
826 if (metric > tp->reordering) { 839 if (metric > tp->reordering) {
827 tp->reordering = min(TCP_MAX_REORDERING, metric); 840 tp->reordering = min(TCP_MAX_REORDERING, metric);
828 841
@@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
837 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); 850 NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
838#if FASTRETRANS_DEBUG > 1 851#if FASTRETRANS_DEBUG > 1
839 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", 852 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
840 tp->rx_opt.sack_ok, tp->ca_state, 853 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
841 tp->reordering, 854 tp->reordering,
842 tp->fackets_out, 855 tp->fackets_out,
843 tp->sacked_out, 856 tp->sacked_out,
@@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
899static int 912static int
900tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) 913tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
901{ 914{
915 const struct inet_connection_sock *icsk = inet_csk(sk);
902 struct tcp_sock *tp = tcp_sk(sk); 916 struct tcp_sock *tp = tcp_sk(sk);
903 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; 917 unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
904 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); 918 struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1064 * we have to account for reordering! Ugly, 1078 * we have to account for reordering! Ugly,
1065 * but should help. 1079 * but should help.
1066 */ 1080 */
1067 if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { 1081 if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
1068 struct sk_buff *skb; 1082 struct sk_buff *skb;
1069 1083
1070 sk_stream_for_retrans_queue(skb, sk) { 1084 sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1093 1107
1094 tp->left_out = tp->sacked_out + tp->lost_out; 1108 tp->left_out = tp->sacked_out + tp->lost_out;
1095 1109
1096 if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) 1110 if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
1097 tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); 1111 tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
1098 1112
1099#if FASTRETRANS_DEBUG > 0 1113#if FASTRETRANS_DEBUG > 0
1100 BUG_TRAP((int)tp->sacked_out >= 0); 1114 BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1111 */ 1125 */
1112void tcp_enter_frto(struct sock *sk) 1126void tcp_enter_frto(struct sock *sk)
1113{ 1127{
1128 const struct inet_connection_sock *icsk = inet_csk(sk);
1114 struct tcp_sock *tp = tcp_sk(sk); 1129 struct tcp_sock *tp = tcp_sk(sk);
1115 struct sk_buff *skb; 1130 struct sk_buff *skb;
1116 1131
1117 tp->frto_counter = 1; 1132 tp->frto_counter = 1;
1118 1133
1119 if (tp->ca_state <= TCP_CA_Disorder || 1134 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1120 tp->snd_una == tp->high_seq || 1135 tp->snd_una == tp->high_seq ||
1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1136 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1122 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1137 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1138 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1124 tcp_ca_event(tp, CA_EVENT_FRTO); 1139 tcp_ca_event(sk, CA_EVENT_FRTO);
1125 } 1140 }
1126 1141
1127 /* Have to clear retransmission markers here to keep the bookkeeping 1142 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk)
1138 } 1153 }
1139 tcp_sync_left_out(tp); 1154 tcp_sync_left_out(tp);
1140 1155
1141 tcp_set_ca_state(tp, TCP_CA_Open); 1156 tcp_set_ca_state(sk, TCP_CA_Open);
1142 tp->frto_highmark = tp->snd_nxt; 1157 tp->frto_highmark = tp->snd_nxt;
1143} 1158}
1144 1159
@@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
1184 1199
1185 tp->reordering = min_t(unsigned int, tp->reordering, 1200 tp->reordering = min_t(unsigned int, tp->reordering,
1186 sysctl_tcp_reordering); 1201 sysctl_tcp_reordering);
1187 tcp_set_ca_state(tp, TCP_CA_Loss); 1202 tcp_set_ca_state(sk, TCP_CA_Loss);
1188 tp->high_seq = tp->frto_highmark; 1203 tp->high_seq = tp->frto_highmark;
1189 TCP_ECN_queue_cwr(tp); 1204 TCP_ECN_queue_cwr(tp);
1190} 1205}
@@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
1208 */ 1223 */
1209void tcp_enter_loss(struct sock *sk, int how) 1224void tcp_enter_loss(struct sock *sk, int how)
1210{ 1225{
1226 const struct inet_connection_sock *icsk = inet_csk(sk);
1211 struct tcp_sock *tp = tcp_sk(sk); 1227 struct tcp_sock *tp = tcp_sk(sk);
1212 struct sk_buff *skb; 1228 struct sk_buff *skb;
1213 int cnt = 0; 1229 int cnt = 0;
1214 1230
1215 /* Reduce ssthresh if it has not yet been made inside this window. */ 1231 /* Reduce ssthresh if it has not yet been made inside this window. */
1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1232 if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1233 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1218 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1234 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1235 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1220 tcp_ca_event(tp, CA_EVENT_LOSS); 1236 tcp_ca_event(sk, CA_EVENT_LOSS);
1221 } 1237 }
1222 tp->snd_cwnd = 1; 1238 tp->snd_cwnd = 1;
1223 tp->snd_cwnd_cnt = 0; 1239 tp->snd_cwnd_cnt = 0;
@@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how)
1248 1264
1249 tp->reordering = min_t(unsigned int, tp->reordering, 1265 tp->reordering = min_t(unsigned int, tp->reordering,
1250 sysctl_tcp_reordering); 1266 sysctl_tcp_reordering);
1251 tcp_set_ca_state(tp, TCP_CA_Loss); 1267 tcp_set_ca_state(sk, TCP_CA_Loss);
1252 tp->high_seq = tp->snd_nxt; 1268 tp->high_seq = tp->snd_nxt;
1253 TCP_ECN_queue_cwr(tp); 1269 TCP_ECN_queue_cwr(tp);
1254} 1270}
1255 1271
1256static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) 1272static int tcp_check_sack_reneging(struct sock *sk)
1257{ 1273{
1258 struct sk_buff *skb; 1274 struct sk_buff *skb;
1259 1275
@@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
1265 */ 1281 */
1266 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && 1282 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
1267 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 1283 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1284 struct inet_connection_sock *icsk = inet_csk(sk);
1268 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); 1285 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
1269 1286
1270 tcp_enter_loss(sk, 1); 1287 tcp_enter_loss(sk, 1);
1271 tp->retransmits++; 1288 icsk->icsk_retransmits++;
1272 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); 1289 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
1273 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1290 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1291 icsk->icsk_rto, TCP_RTO_MAX);
1274 return 1; 1292 return 1;
1275 } 1293 }
1276 return 0; 1294 return 0;
@@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
1281 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; 1299 return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
1282} 1300}
1283 1301
1284static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) 1302static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
1285{ 1303{
1286 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); 1304 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
1287} 1305}
1288 1306
1289static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) 1307static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
1290{ 1308{
1291 return tp->packets_out && 1309 return tp->packets_out &&
1292 tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); 1310 tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
1293} 1311}
1294 1312
1295/* Linux NewReno/SACK/FACK/ECN state machine. 1313/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
1423 * in assumption of absent reordering, interpret this as reordering. 1441 * in assumption of absent reordering, interpret this as reordering.
1424 * The only another reason could be bug in receiver TCP. 1442 * The only another reason could be bug in receiver TCP.
1425 */ 1443 */
1426static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) 1444static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1427{ 1445{
1446 struct tcp_sock *tp = tcp_sk(sk);
1428 u32 holes; 1447 u32 holes;
1429 1448
1430 holes = max(tp->lost_out, 1U); 1449 holes = max(tp->lost_out, 1U);
@@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
1432 1451
1433 if ((tp->sacked_out + holes) > tp->packets_out) { 1452 if ((tp->sacked_out + holes) > tp->packets_out) {
1434 tp->sacked_out = tp->packets_out - holes; 1453 tp->sacked_out = tp->packets_out - holes;
1435 tcp_update_reordering(tp, tp->packets_out+addend, 0); 1454 tcp_update_reordering(sk, tp->packets_out + addend, 0);
1436 } 1455 }
1437} 1456}
1438 1457
1439/* Emulate SACKs for SACKless connection: account for a new dupack. */ 1458/* Emulate SACKs for SACKless connection: account for a new dupack. */
1440 1459
1441static void tcp_add_reno_sack(struct tcp_sock *tp) 1460static void tcp_add_reno_sack(struct sock *sk)
1442{ 1461{
1462 struct tcp_sock *tp = tcp_sk(sk);
1443 tp->sacked_out++; 1463 tp->sacked_out++;
1444 tcp_check_reno_reordering(tp, 0); 1464 tcp_check_reno_reordering(sk, 0);
1445 tcp_sync_left_out(tp); 1465 tcp_sync_left_out(tp);
1446} 1466}
1447 1467
@@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
1456 else 1476 else
1457 tp->sacked_out -= acked-1; 1477 tp->sacked_out -= acked-1;
1458 } 1478 }
1459 tcp_check_reno_reordering(tp, acked); 1479 tcp_check_reno_reordering(sk, acked);
1460 tcp_sync_left_out(tp); 1480 tcp_sync_left_out(tp);
1461} 1481}
1462 1482
@@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1509 struct sk_buff *skb; 1529 struct sk_buff *skb;
1510 1530
1511 sk_stream_for_retrans_queue(skb, sk) { 1531 sk_stream_for_retrans_queue(skb, sk) {
1512 if (tcp_skb_timedout(tp, skb) && 1532 if (tcp_skb_timedout(sk, skb) &&
1513 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1533 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1514 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1534 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1515 tp->lost_out += tcp_skb_pcount(skb); 1535 tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1530} 1550}
1531 1551
1532/* Decrease cwnd each second ack. */ 1552/* Decrease cwnd each second ack. */
1533static void tcp_cwnd_down(struct tcp_sock *tp) 1553static void tcp_cwnd_down(struct sock *sk)
1534{ 1554{
1555 const struct inet_connection_sock *icsk = inet_csk(sk);
1556 struct tcp_sock *tp = tcp_sk(sk);
1535 int decr = tp->snd_cwnd_cnt + 1; 1557 int decr = tp->snd_cwnd_cnt + 1;
1536 1558
1537 tp->snd_cwnd_cnt = decr&1; 1559 tp->snd_cwnd_cnt = decr&1;
1538 decr >>= 1; 1560 decr >>= 1;
1539 1561
1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) 1562 if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
1541 tp->snd_cwnd -= decr; 1563 tp->snd_cwnd -= decr;
1542 1564
1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1565 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1571#define DBGUNDO(x...) do { } while (0) 1593#define DBGUNDO(x...) do { } while (0)
1572#endif 1594#endif
1573 1595
1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1596static void tcp_undo_cwr(struct sock *sk, const int undo)
1575{ 1597{
1598 struct tcp_sock *tp = tcp_sk(sk);
1599
1576 if (tp->prior_ssthresh) { 1600 if (tp->prior_ssthresh) {
1577 if (tp->ca_ops->undo_cwnd) 1601 const struct inet_connection_sock *icsk = inet_csk(sk);
1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); 1602
1603 if (icsk->icsk_ca_ops->undo_cwnd)
1604 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
1579 else 1605 else
1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1606 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1581 1607
@@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1603 /* Happy end! We did not retransmit anything 1629 /* Happy end! We did not retransmit anything
1604 * or our original transmission succeeded. 1630 * or our original transmission succeeded.
1605 */ 1631 */
1606 DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); 1632 DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
1607 tcp_undo_cwr(tp, 1); 1633 tcp_undo_cwr(sk, 1);
1608 if (tp->ca_state == TCP_CA_Loss) 1634 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
1609 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); 1635 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1610 else 1636 else
1611 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); 1637 NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
1618 tcp_moderate_cwnd(tp); 1644 tcp_moderate_cwnd(tp);
1619 return 1; 1645 return 1;
1620 } 1646 }
1621 tcp_set_ca_state(tp, TCP_CA_Open); 1647 tcp_set_ca_state(sk, TCP_CA_Open);
1622 return 0; 1648 return 0;
1623} 1649}
1624 1650
@@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
1627{ 1653{
1628 if (tp->undo_marker && !tp->undo_retrans) { 1654 if (tp->undo_marker && !tp->undo_retrans) {
1629 DBGUNDO(sk, tp, "D-SACK"); 1655 DBGUNDO(sk, tp, "D-SACK");
1630 tcp_undo_cwr(tp, 1); 1656 tcp_undo_cwr(sk, 1);
1631 tp->undo_marker = 0; 1657 tp->undo_marker = 0;
1632 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); 1658 NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
1633 } 1659 }
@@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
1648 if (tp->retrans_out == 0) 1674 if (tp->retrans_out == 0)
1649 tp->retrans_stamp = 0; 1675 tp->retrans_stamp = 0;
1650 1676
1651 tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); 1677 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
1652 1678
1653 DBGUNDO(sk, tp, "Hoe"); 1679 DBGUNDO(sk, tp, "Hoe");
1654 tcp_undo_cwr(tp, 0); 1680 tcp_undo_cwr(sk, 0);
1655 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); 1681 NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
1656 1682
1657 /* So... Do not make Hoe's retransmit yet. 1683 /* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1674 DBGUNDO(sk, tp, "partial loss"); 1700 DBGUNDO(sk, tp, "partial loss");
1675 tp->lost_out = 0; 1701 tp->lost_out = 0;
1676 tp->left_out = tp->sacked_out; 1702 tp->left_out = tp->sacked_out;
1677 tcp_undo_cwr(tp, 1); 1703 tcp_undo_cwr(sk, 1);
1678 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); 1704 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
1679 tp->retransmits = 0; 1705 inet_csk(sk)->icsk_retransmits = 0;
1680 tp->undo_marker = 0; 1706 tp->undo_marker = 0;
1681 if (!IsReno(tp)) 1707 if (!IsReno(tp))
1682 tcp_set_ca_state(tp, TCP_CA_Open); 1708 tcp_set_ca_state(sk, TCP_CA_Open);
1683 return 1; 1709 return 1;
1684 } 1710 }
1685 return 0; 1711 return 0;
1686} 1712}
1687 1713
1688static inline void tcp_complete_cwr(struct tcp_sock *tp) 1714static inline void tcp_complete_cwr(struct sock *sk)
1689{ 1715{
1716 struct tcp_sock *tp = tcp_sk(sk);
1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 1717 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1691 tp->snd_cwnd_stamp = tcp_time_stamp; 1718 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); 1719 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
1693} 1720}
1694 1721
1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1722static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
1700 tp->retrans_stamp = 0; 1727 tp->retrans_stamp = 0;
1701 1728
1702 if (flag&FLAG_ECE) 1729 if (flag&FLAG_ECE)
1703 tcp_enter_cwr(tp); 1730 tcp_enter_cwr(sk);
1704 1731
1705 if (tp->ca_state != TCP_CA_CWR) { 1732 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
1706 int state = TCP_CA_Open; 1733 int state = TCP_CA_Open;
1707 1734
1708 if (tp->left_out || tp->retrans_out || tp->undo_marker) 1735 if (tp->left_out || tp->retrans_out || tp->undo_marker)
1709 state = TCP_CA_Disorder; 1736 state = TCP_CA_Disorder;
1710 1737
1711 if (tp->ca_state != state) { 1738 if (inet_csk(sk)->icsk_ca_state != state) {
1712 tcp_set_ca_state(tp, state); 1739 tcp_set_ca_state(sk, state);
1713 tp->high_seq = tp->snd_nxt; 1740 tp->high_seq = tp->snd_nxt;
1714 } 1741 }
1715 tcp_moderate_cwnd(tp); 1742 tcp_moderate_cwnd(tp);
1716 } else { 1743 } else {
1717 tcp_cwnd_down(tp); 1744 tcp_cwnd_down(sk);
1718 } 1745 }
1719} 1746}
1720 1747
@@ -1733,6 +1760,7 @@ static void
1733tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, 1760tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1734 int prior_packets, int flag) 1761 int prior_packets, int flag)
1735{ 1762{
1763 struct inet_connection_sock *icsk = inet_csk(sk);
1736 struct tcp_sock *tp = tcp_sk(sk); 1764 struct tcp_sock *tp = tcp_sk(sk);
1737 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); 1765 int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
1738 1766
@@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1750 tp->prior_ssthresh = 0; 1778 tp->prior_ssthresh = 0;
1751 1779
1752 /* B. In all the states check for reneging SACKs. */ 1780 /* B. In all the states check for reneging SACKs. */
1753 if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) 1781 if (tp->sacked_out && tcp_check_sack_reneging(sk))
1754 return; 1782 return;
1755 1783
1756 /* C. Process data loss notification, provided it is valid. */ 1784 /* C. Process data loss notification, provided it is valid. */
1757 if ((flag&FLAG_DATA_LOST) && 1785 if ((flag&FLAG_DATA_LOST) &&
1758 before(tp->snd_una, tp->high_seq) && 1786 before(tp->snd_una, tp->high_seq) &&
1759 tp->ca_state != TCP_CA_Open && 1787 icsk->icsk_ca_state != TCP_CA_Open &&
1760 tp->fackets_out > tp->reordering) { 1788 tp->fackets_out > tp->reordering) {
1761 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); 1789 tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
1762 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); 1790 NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1767 1795
1768 /* E. Check state exit conditions. State can be terminated 1796 /* E. Check state exit conditions. State can be terminated
1769 * when high_seq is ACKed. */ 1797 * when high_seq is ACKed. */
1770 if (tp->ca_state == TCP_CA_Open) { 1798 if (icsk->icsk_ca_state == TCP_CA_Open) {
1771 if (!sysctl_tcp_frto) 1799 if (!sysctl_tcp_frto)
1772 BUG_TRAP(tp->retrans_out == 0); 1800 BUG_TRAP(tp->retrans_out == 0);
1773 tp->retrans_stamp = 0; 1801 tp->retrans_stamp = 0;
1774 } else if (!before(tp->snd_una, tp->high_seq)) { 1802 } else if (!before(tp->snd_una, tp->high_seq)) {
1775 switch (tp->ca_state) { 1803 switch (icsk->icsk_ca_state) {
1776 case TCP_CA_Loss: 1804 case TCP_CA_Loss:
1777 tp->retransmits = 0; 1805 icsk->icsk_retransmits = 0;
1778 if (tcp_try_undo_recovery(sk, tp)) 1806 if (tcp_try_undo_recovery(sk, tp))
1779 return; 1807 return;
1780 break; 1808 break;
@@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1783 /* CWR is to be held something *above* high_seq 1811 /* CWR is to be held something *above* high_seq
1784 * is ACKed for CWR bit to reach receiver. */ 1812 * is ACKed for CWR bit to reach receiver. */
1785 if (tp->snd_una != tp->high_seq) { 1813 if (tp->snd_una != tp->high_seq) {
1786 tcp_complete_cwr(tp); 1814 tcp_complete_cwr(sk);
1787 tcp_set_ca_state(tp, TCP_CA_Open); 1815 tcp_set_ca_state(sk, TCP_CA_Open);
1788 } 1816 }
1789 break; 1817 break;
1790 1818
@@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1795 * catching for all duplicate ACKs. */ 1823 * catching for all duplicate ACKs. */
1796 IsReno(tp) || tp->snd_una != tp->high_seq) { 1824 IsReno(tp) || tp->snd_una != tp->high_seq) {
1797 tp->undo_marker = 0; 1825 tp->undo_marker = 0;
1798 tcp_set_ca_state(tp, TCP_CA_Open); 1826 tcp_set_ca_state(sk, TCP_CA_Open);
1799 } 1827 }
1800 break; 1828 break;
1801 1829
@@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1804 tcp_reset_reno_sack(tp); 1832 tcp_reset_reno_sack(tp);
1805 if (tcp_try_undo_recovery(sk, tp)) 1833 if (tcp_try_undo_recovery(sk, tp))
1806 return; 1834 return;
1807 tcp_complete_cwr(tp); 1835 tcp_complete_cwr(sk);
1808 break; 1836 break;
1809 } 1837 }
1810 } 1838 }
1811 1839
1812 /* F. Process state. */ 1840 /* F. Process state. */
1813 switch (tp->ca_state) { 1841 switch (icsk->icsk_ca_state) {
1814 case TCP_CA_Recovery: 1842 case TCP_CA_Recovery:
1815 if (prior_snd_una == tp->snd_una) { 1843 if (prior_snd_una == tp->snd_una) {
1816 if (IsReno(tp) && is_dupack) 1844 if (IsReno(tp) && is_dupack)
1817 tcp_add_reno_sack(tp); 1845 tcp_add_reno_sack(sk);
1818 } else { 1846 } else {
1819 int acked = prior_packets - tp->packets_out; 1847 int acked = prior_packets - tp->packets_out;
1820 if (IsReno(tp)) 1848 if (IsReno(tp))
@@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1824 break; 1852 break;
1825 case TCP_CA_Loss: 1853 case TCP_CA_Loss:
1826 if (flag&FLAG_DATA_ACKED) 1854 if (flag&FLAG_DATA_ACKED)
1827 tp->retransmits = 0; 1855 icsk->icsk_retransmits = 0;
1828 if (!tcp_try_undo_loss(sk, tp)) { 1856 if (!tcp_try_undo_loss(sk, tp)) {
1829 tcp_moderate_cwnd(tp); 1857 tcp_moderate_cwnd(tp);
1830 tcp_xmit_retransmit_queue(sk); 1858 tcp_xmit_retransmit_queue(sk);
1831 return; 1859 return;
1832 } 1860 }
1833 if (tp->ca_state != TCP_CA_Open) 1861 if (icsk->icsk_ca_state != TCP_CA_Open)
1834 return; 1862 return;
1835 /* Loss is undone; fall through to processing in Open state. */ 1863 /* Loss is undone; fall through to processing in Open state. */
1836 default: 1864 default:
@@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1838 if (tp->snd_una != prior_snd_una) 1866 if (tp->snd_una != prior_snd_una)
1839 tcp_reset_reno_sack(tp); 1867 tcp_reset_reno_sack(tp);
1840 if (is_dupack) 1868 if (is_dupack)
1841 tcp_add_reno_sack(tp); 1869 tcp_add_reno_sack(sk);
1842 } 1870 }
1843 1871
1844 if (tp->ca_state == TCP_CA_Disorder) 1872 if (icsk->icsk_ca_state == TCP_CA_Disorder)
1845 tcp_try_undo_dsack(sk, tp); 1873 tcp_try_undo_dsack(sk, tp);
1846 1874
1847 if (!tcp_time_to_recover(sk, tp)) { 1875 if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1861 tp->undo_marker = tp->snd_una; 1889 tp->undo_marker = tp->snd_una;
1862 tp->undo_retrans = tp->retrans_out; 1890 tp->undo_retrans = tp->retrans_out;
1863 1891
1864 if (tp->ca_state < TCP_CA_CWR) { 1892 if (icsk->icsk_ca_state < TCP_CA_CWR) {
1865 if (!(flag&FLAG_ECE)) 1893 if (!(flag&FLAG_ECE))
1866 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1894 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); 1895 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1868 TCP_ECN_queue_cwr(tp); 1896 TCP_ECN_queue_cwr(tp);
1869 } 1897 }
1870 1898
1871 tp->snd_cwnd_cnt = 0; 1899 tp->snd_cwnd_cnt = 0;
1872 tcp_set_ca_state(tp, TCP_CA_Recovery); 1900 tcp_set_ca_state(sk, TCP_CA_Recovery);
1873 } 1901 }
1874 1902
1875 if (is_dupack || tcp_head_timedout(sk, tp)) 1903 if (is_dupack || tcp_head_timedout(sk, tp))
1876 tcp_update_scoreboard(sk, tp); 1904 tcp_update_scoreboard(sk, tp);
1877 tcp_cwnd_down(tp); 1905 tcp_cwnd_down(sk);
1878 tcp_xmit_retransmit_queue(sk); 1906 tcp_xmit_retransmit_queue(sk);
1879} 1907}
1880 1908
1881/* Read draft-ietf-tcplw-high-performance before mucking 1909/* Read draft-ietf-tcplw-high-performance before mucking
1882 * with this code. (Superceeds RFC1323) 1910 * with this code. (Superceeds RFC1323)
1883 */ 1911 */
1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) 1912static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1885{ 1913{
1886 __u32 seq_rtt;
1887
1888 /* RTTM Rule: A TSecr value received in a segment is used to 1914 /* RTTM Rule: A TSecr value received in a segment is used to
1889 * update the averaged RTT measurement only if the segment 1915 * update the averaged RTT measurement only if the segment
1890 * acknowledges some new data, i.e., only if it advances the 1916 * acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1900 * answer arrives rto becomes 120 seconds! If at least one of segments 1926 * answer arrives rto becomes 120 seconds! If at least one of segments
1901 * in window is lost... Voila. --ANK (010210) 1927 * in window is lost... Voila. --ANK (010210)
1902 */ 1928 */
1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1929 struct tcp_sock *tp = tcp_sk(sk);
1904 tcp_rtt_estimator(tp, seq_rtt, usrtt); 1930 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1905 tcp_set_rto(tp); 1931 tcp_rtt_estimator(sk, seq_rtt, usrtt);
1906 tp->backoff = 0; 1932 tcp_set_rto(sk);
1907 tcp_bound_rto(tp); 1933 inet_csk(sk)->icsk_backoff = 0;
1934 tcp_bound_rto(sk);
1908} 1935}
1909 1936
1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) 1937static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
1911{ 1938{
1912 /* We don't have a timestamp. Can only use 1939 /* We don't have a timestamp. Can only use
1913 * packets that are not retransmitted to determine 1940 * packets that are not retransmitted to determine
@@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
1921 if (flag & FLAG_RETRANS_DATA_ACKED) 1948 if (flag & FLAG_RETRANS_DATA_ACKED)
1922 return; 1949 return;
1923 1950
1924 tcp_rtt_estimator(tp, seq_rtt, usrtt); 1951 tcp_rtt_estimator(sk, seq_rtt, usrtt);
1925 tcp_set_rto(tp); 1952 tcp_set_rto(sk);
1926 tp->backoff = 0; 1953 inet_csk(sk)->icsk_backoff = 0;
1927 tcp_bound_rto(tp); 1954 tcp_bound_rto(sk);
1928} 1955}
1929 1956
1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1957static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
1931 int flag, s32 seq_rtt, u32 *usrtt) 1958 const s32 seq_rtt, u32 *usrtt)
1932{ 1959{
1960 const struct tcp_sock *tp = tcp_sk(sk);
1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1961 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1962 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
1935 tcp_ack_saw_tstamp(tp, usrtt, flag); 1963 tcp_ack_saw_tstamp(sk, usrtt, flag);
1936 else if (seq_rtt >= 0) 1964 else if (seq_rtt >= 0)
1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); 1965 tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
1938} 1966}
1939 1967
1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 1968static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
1941 u32 in_flight, int good) 1969 u32 in_flight, int good)
1942{ 1970{
1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); 1971 const struct inet_connection_sock *icsk = inet_csk(sk);
1944 tp->snd_cwnd_stamp = tcp_time_stamp; 1972 icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
1973 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
1945} 1974}
1946 1975
1947/* Restart timer after forward progress on connection. 1976/* Restart timer after forward progress on connection.
@@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
1951static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) 1980static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1952{ 1981{
1953 if (!tp->packets_out) { 1982 if (!tp->packets_out) {
1954 tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); 1983 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
1955 } else { 1984 } else {
1956 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1985 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
1957 } 1986 }
1958} 1987}
1959 1988
@@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2068 seq_rtt = -1; 2097 seq_rtt = -1;
2069 } else if (seq_rtt < 0) 2098 } else if (seq_rtt < 0)
2070 seq_rtt = now - scb->when; 2099 seq_rtt = now - scb->when;
2071 if (seq_usrtt) 2100 if (seq_usrtt) {
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 2101 struct timeval tv;
2073 + (usnow.tv_usec - skb->stamp.tv_usec); 2102
2103 skb_get_timestamp(skb, &tv);
2104 *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
2105 + (usnow.tv_usec - tv.tv_usec);
2106 }
2074 2107
2075 if (sacked & TCPCB_SACKED_ACKED) 2108 if (sacked & TCPCB_SACKED_ACKED)
2076 tp->sacked_out -= tcp_skb_pcount(skb); 2109 tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2085 seq_rtt = now - scb->when; 2118 seq_rtt = now - scb->when;
2086 tcp_dec_pcount_approx(&tp->fackets_out, skb); 2119 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2087 tcp_packets_out_dec(tp, skb); 2120 tcp_packets_out_dec(tp, skb);
2088 __skb_unlink(skb, skb->list); 2121 __skb_unlink(skb, &sk->sk_write_queue);
2089 sk_stream_free_skb(sk, skb); 2122 sk_stream_free_skb(sk, skb);
2090 } 2123 }
2091 2124
2092 if (acked&FLAG_ACKED) { 2125 if (acked&FLAG_ACKED) {
2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); 2126 const struct inet_connection_sock *icsk = inet_csk(sk);
2127 tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
2094 tcp_ack_packets_out(sk, tp); 2128 tcp_ack_packets_out(sk, tp);
2095 2129
2096 if (tp->ca_ops->pkts_acked) 2130 if (icsk->icsk_ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked); 2131 icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
2098 } 2132 }
2099 2133
2100#if FASTRETRANS_DEBUG > 0 2134#if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2102 BUG_TRAP((int)tp->lost_out >= 0); 2136 BUG_TRAP((int)tp->lost_out >= 0);
2103 BUG_TRAP((int)tp->retrans_out >= 0); 2137 BUG_TRAP((int)tp->retrans_out >= 0);
2104 if (!tp->packets_out && tp->rx_opt.sack_ok) { 2138 if (!tp->packets_out && tp->rx_opt.sack_ok) {
2139 const struct inet_connection_sock *icsk = inet_csk(sk);
2105 if (tp->lost_out) { 2140 if (tp->lost_out) {
2106 printk(KERN_DEBUG "Leak l=%u %d\n", 2141 printk(KERN_DEBUG "Leak l=%u %d\n",
2107 tp->lost_out, tp->ca_state); 2142 tp->lost_out, icsk->icsk_ca_state);
2108 tp->lost_out = 0; 2143 tp->lost_out = 0;
2109 } 2144 }
2110 if (tp->sacked_out) { 2145 if (tp->sacked_out) {
2111 printk(KERN_DEBUG "Leak s=%u %d\n", 2146 printk(KERN_DEBUG "Leak s=%u %d\n",
2112 tp->sacked_out, tp->ca_state); 2147 tp->sacked_out, icsk->icsk_ca_state);
2113 tp->sacked_out = 0; 2148 tp->sacked_out = 0;
2114 } 2149 }
2115 if (tp->retrans_out) { 2150 if (tp->retrans_out) {
2116 printk(KERN_DEBUG "Leak r=%u %d\n", 2151 printk(KERN_DEBUG "Leak r=%u %d\n",
2117 tp->retrans_out, tp->ca_state); 2152 tp->retrans_out, icsk->icsk_ca_state);
2118 tp->retrans_out = 0; 2153 tp->retrans_out = 0;
2119 } 2154 }
2120 } 2155 }
@@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2125 2160
2126static void tcp_ack_probe(struct sock *sk) 2161static void tcp_ack_probe(struct sock *sk)
2127{ 2162{
2128 struct tcp_sock *tp = tcp_sk(sk); 2163 const struct tcp_sock *tp = tcp_sk(sk);
2164 struct inet_connection_sock *icsk = inet_csk(sk);
2129 2165
2130 /* Was it a usable window open? */ 2166 /* Was it a usable window open? */
2131 2167
2132 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, 2168 if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
2133 tp->snd_una + tp->snd_wnd)) { 2169 tp->snd_una + tp->snd_wnd)) {
2134 tp->backoff = 0; 2170 icsk->icsk_backoff = 0;
2135 tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); 2171 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
2136 /* Socket must be waked up by subsequent tcp_data_snd_check(). 2172 /* Socket must be waked up by subsequent tcp_data_snd_check().
2137 * This function is not for random using! 2173 * This function is not for random using!
2138 */ 2174 */
2139 } else { 2175 } else {
2140 tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, 2176 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2141 min(tp->rto << tp->backoff, TCP_RTO_MAX)); 2177 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2178 TCP_RTO_MAX);
2142 } 2179 }
2143} 2180}
2144 2181
2145static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) 2182static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
2146{ 2183{
2147 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 2184 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
2148 tp->ca_state != TCP_CA_Open); 2185 inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
2149} 2186}
2150 2187
2151static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) 2188static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
2152{ 2189{
2190 const struct tcp_sock *tp = tcp_sk(sk);
2153 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 2191 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
2154 !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); 2192 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
2155} 2193}
2156 2194
2157/* Check that window update is acceptable. 2195/* Check that window update is acceptable.
2158 * The function assumes that snd_una<=ack<=snd_next. 2196 * The function assumes that snd_una<=ack<=snd_next.
2159 */ 2197 */
2160static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, 2198static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2161 u32 ack_seq, u32 nwin) 2199 const u32 ack_seq, const u32 nwin)
2162{ 2200{
2163 return (after(ack, tp->snd_una) || 2201 return (after(ack, tp->snd_una) ||
2164 after(ack_seq, tp->snd_wl1) || 2202 after(ack_seq, tp->snd_wl1) ||
@@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2241/* This routine deals with incoming acks, but not outgoing ones. */ 2279/* This routine deals with incoming acks, but not outgoing ones. */
2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2280static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2243{ 2281{
2282 struct inet_connection_sock *icsk = inet_csk(sk);
2244 struct tcp_sock *tp = tcp_sk(sk); 2283 struct tcp_sock *tp = tcp_sk(sk);
2245 u32 prior_snd_una = tp->snd_una; 2284 u32 prior_snd_una = tp->snd_una;
2246 u32 ack_seq = TCP_SKB_CB(skb)->seq; 2285 u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2268 tp->snd_una = ack; 2307 tp->snd_una = ack;
2269 flag |= FLAG_WIN_UPDATE; 2308 flag |= FLAG_WIN_UPDATE;
2270 2309
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK); 2310 tcp_ca_event(sk, CA_EVENT_FAST_ACK);
2272 2311
2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2312 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2274 } else { 2313 } else {
@@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2324 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2286 flag |= FLAG_ECE; 2325 flag |= FLAG_ECE;
2287 2326
2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK); 2327 tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
2289 } 2328 }
2290 2329
2291 /* We passed data and got it acked, remove any soft error 2330 /* We passed data and got it acked, remove any soft error
@@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2301 2340
2302 /* See if we can take anything off of the retransmit queue. */ 2341 /* See if we can take anything off of the retransmit queue. */
2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt, 2342 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); 2343 icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
2305 2344
2306 if (tp->frto_counter) 2345 if (tp->frto_counter)
2307 tcp_process_frto(sk, prior_snd_una); 2346 tcp_process_frto(sk, prior_snd_una);
2308 2347
2309 if (tcp_ack_is_dubious(tp, flag)) { 2348 if (tcp_ack_is_dubious(sk, flag)) {
2310 /* Advanve CWND, if state allows this. */ 2349 /* Advanve CWND, if state allows this. */
2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) 2350 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); 2351 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2352 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2314 } else { 2353 } else {
2315 if ((flag & FLAG_DATA_ACKED)) 2354 if ((flag & FLAG_DATA_ACKED))
2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); 2355 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
2317 } 2356 }
2318 2357
2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2358 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2322 return 1; 2361 return 1;
2323 2362
2324no_queue: 2363no_queue:
2325 tp->probes_out = 0; 2364 icsk->icsk_probes_out = 0;
2326 2365
2327 /* If this ack opens up a zero window, clear backoff. It was 2366 /* If this ack opens up a zero window, clear backoff. It was
2328 * being used to time the probes, and is probably far higher than 2367 * being used to time the probes, and is probably far higher than
@@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
2500 * up to bandwidth of 18Gigabit/sec. 8) ] 2539 * up to bandwidth of 18Gigabit/sec. 8) ]
2501 */ 2540 */
2502 2541
2503static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) 2542static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
2504{ 2543{
2544 struct tcp_sock *tp = tcp_sk(sk);
2505 struct tcphdr *th = skb->h.th; 2545 struct tcphdr *th = skb->h.th;
2506 u32 seq = TCP_SKB_CB(skb)->seq; 2546 u32 seq = TCP_SKB_CB(skb)->seq;
2507 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2547 u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
2516 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && 2556 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
2517 2557
2518 /* 4. ... and sits in replay window. */ 2558 /* 4. ... and sits in replay window. */
2519 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); 2559 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
2520} 2560}
2521 2561
2522static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) 2562static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
2523{ 2563{
2564 const struct tcp_sock *tp = tcp_sk(sk);
2524 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && 2565 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
2525 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && 2566 xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
2526 !tcp_disordered_ack(tp, skb)); 2567 !tcp_disordered_ack(sk, skb));
2527} 2568}
2528 2569
2529/* Check segment sequence number for validity. 2570/* Check segment sequence number for validity.
@@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2586{ 2627{
2587 struct tcp_sock *tp = tcp_sk(sk); 2628 struct tcp_sock *tp = tcp_sk(sk);
2588 2629
2589 tcp_schedule_ack(tp); 2630 inet_csk_schedule_ack(sk);
2590 2631
2591 sk->sk_shutdown |= RCV_SHUTDOWN; 2632 sk->sk_shutdown |= RCV_SHUTDOWN;
2592 sock_set_flag(sk, SOCK_DONE); 2633 sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
2596 case TCP_ESTABLISHED: 2637 case TCP_ESTABLISHED:
2597 /* Move to CLOSE_WAIT */ 2638 /* Move to CLOSE_WAIT */
2598 tcp_set_state(sk, TCP_CLOSE_WAIT); 2639 tcp_set_state(sk, TCP_CLOSE_WAIT);
2599 tp->ack.pingpong = 1; 2640 inet_csk(sk)->icsk_ack.pingpong = 1;
2600 break; 2641 break;
2601 2642
2602 case TCP_CLOSE_WAIT: 2643 case TCP_CLOSE_WAIT:
@@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
2694 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 2735 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
2695 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 2736 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2696 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); 2737 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
2697 tcp_enter_quickack_mode(tp); 2738 tcp_enter_quickack_mode(sk);
2698 2739
2699 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { 2740 if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
2700 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 2741 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk)
2853 2894
2854 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 2895 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
2855 SOCK_DEBUG(sk, "ofo packet was already received \n"); 2896 SOCK_DEBUG(sk, "ofo packet was already received \n");
2856 __skb_unlink(skb, skb->list); 2897 __skb_unlink(skb, &tp->out_of_order_queue);
2857 __kfree_skb(skb); 2898 __kfree_skb(skb);
2858 continue; 2899 continue;
2859 } 2900 }
@@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk)
2861 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 2902 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
2862 TCP_SKB_CB(skb)->end_seq); 2903 TCP_SKB_CB(skb)->end_seq);
2863 2904
2864 __skb_unlink(skb, skb->list); 2905 __skb_unlink(skb, &tp->out_of_order_queue);
2865 __skb_queue_tail(&sk->sk_receive_queue, skb); 2906 __skb_queue_tail(&sk->sk_receive_queue, skb);
2866 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 2907 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2867 if(skb->h.th->fin) 2908 if(skb->h.th->fin)
@@ -2942,7 +2983,7 @@ queue_and_out:
2942 * gap in queue is filled. 2983 * gap in queue is filled.
2943 */ 2984 */
2944 if (skb_queue_empty(&tp->out_of_order_queue)) 2985 if (skb_queue_empty(&tp->out_of_order_queue))
2945 tp->ack.pingpong = 0; 2986 inet_csk(sk)->icsk_ack.pingpong = 0;
2946 } 2987 }
2947 2988
2948 if (tp->rx_opt.num_sacks) 2989 if (tp->rx_opt.num_sacks)
@@ -2963,8 +3004,8 @@ queue_and_out:
2963 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 3004 tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
2964 3005
2965out_of_window: 3006out_of_window:
2966 tcp_enter_quickack_mode(tp); 3007 tcp_enter_quickack_mode(sk);
2967 tcp_schedule_ack(tp); 3008 inet_csk_schedule_ack(sk);
2968drop: 3009drop:
2969 __kfree_skb(skb); 3010 __kfree_skb(skb);
2970 return; 3011 return;
@@ -2974,7 +3015,7 @@ drop:
2974 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) 3015 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
2975 goto out_of_window; 3016 goto out_of_window;
2976 3017
2977 tcp_enter_quickack_mode(tp); 3018 tcp_enter_quickack_mode(sk);
2978 3019
2979 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 3020 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
2980 /* Partial packet, seq < rcv_next < end_seq */ 3021 /* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3044,7 @@ drop:
3003 3044
3004 /* Disable header prediction. */ 3045 /* Disable header prediction. */
3005 tp->pred_flags = 0; 3046 tp->pred_flags = 0;
3006 tcp_schedule_ack(tp); 3047 inet_csk_schedule_ack(sk);
3007 3048
3008 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", 3049 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
3009 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 3050 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3068,7 @@ drop:
3027 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 3068 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3028 3069
3029 if (seq == TCP_SKB_CB(skb1)->end_seq) { 3070 if (seq == TCP_SKB_CB(skb1)->end_seq) {
3030 __skb_append(skb1, skb); 3071 __skb_append(skb1, skb, &tp->out_of_order_queue);
3031 3072
3032 if (!tp->rx_opt.num_sacks || 3073 if (!tp->rx_opt.num_sacks ||
3033 tp->selective_acks[0].end_seq != seq) 3074 tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3112,7 @@ drop:
3071 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); 3112 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
3072 break; 3113 break;
3073 } 3114 }
3074 __skb_unlink(skb1, skb1->list); 3115 __skb_unlink(skb1, &tp->out_of_order_queue);
3075 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); 3116 tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
3076 __kfree_skb(skb1); 3117 __kfree_skb(skb1);
3077 } 3118 }
@@ -3088,8 +3129,9 @@ add_sack:
3088 * simplifies code) 3129 * simplifies code)
3089 */ 3130 */
3090static void 3131static void
3091tcp_collapse(struct sock *sk, struct sk_buff *head, 3132tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3092 struct sk_buff *tail, u32 start, u32 end) 3133 struct sk_buff *head, struct sk_buff *tail,
3134 u32 start, u32 end)
3093{ 3135{
3094 struct sk_buff *skb; 3136 struct sk_buff *skb;
3095 3137
@@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
3099 /* No new bits? It is possible on ofo queue. */ 3141 /* No new bits? It is possible on ofo queue. */
3100 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 3142 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3101 struct sk_buff *next = skb->next; 3143 struct sk_buff *next = skb->next;
3102 __skb_unlink(skb, skb->list); 3144 __skb_unlink(skb, list);
3103 __kfree_skb(skb); 3145 __kfree_skb(skb);
3104 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); 3146 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3105 skb = next; 3147 skb = next;
@@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
3145 nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); 3187 nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
3146 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 3188 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
3147 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 3189 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
3148 __skb_insert(nskb, skb->prev, skb, skb->list); 3190 __skb_insert(nskb, skb->prev, skb, list);
3149 sk_stream_set_owner_r(nskb, sk); 3191 sk_stream_set_owner_r(nskb, sk);
3150 3192
3151 /* Copy data, releasing collapsed skbs. */ 3193 /* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
3164 } 3206 }
3165 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 3207 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
3166 struct sk_buff *next = skb->next; 3208 struct sk_buff *next = skb->next;
3167 __skb_unlink(skb, skb->list); 3209 __skb_unlink(skb, list);
3168 __kfree_skb(skb); 3210 __kfree_skb(skb);
3169 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); 3211 NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
3170 skb = next; 3212 skb = next;
@@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
3200 if (skb == (struct sk_buff *)&tp->out_of_order_queue || 3242 if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
3201 after(TCP_SKB_CB(skb)->seq, end) || 3243 after(TCP_SKB_CB(skb)->seq, end) ||
3202 before(TCP_SKB_CB(skb)->end_seq, start)) { 3244 before(TCP_SKB_CB(skb)->end_seq, start)) {
3203 tcp_collapse(sk, head, skb, start, end); 3245 tcp_collapse(sk, &tp->out_of_order_queue,
3246 head, skb, start, end);
3204 head = skb; 3247 head = skb;
3205 if (skb == (struct sk_buff *)&tp->out_of_order_queue) 3248 if (skb == (struct sk_buff *)&tp->out_of_order_queue)
3206 break; 3249 break;
@@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk)
3237 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 3280 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
3238 3281
3239 tcp_collapse_ofo_queue(sk); 3282 tcp_collapse_ofo_queue(sk);
3240 tcp_collapse(sk, sk->sk_receive_queue.next, 3283 tcp_collapse(sk, &sk->sk_receive_queue,
3284 sk->sk_receive_queue.next,
3241 (struct sk_buff*)&sk->sk_receive_queue, 3285 (struct sk_buff*)&sk->sk_receive_queue,
3242 tp->copied_seq, tp->rcv_nxt); 3286 tp->copied_seq, tp->rcv_nxt);
3243 sk_stream_mem_reclaim(sk); 3287 sk_stream_mem_reclaim(sk);
@@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
3286{ 3330{
3287 struct tcp_sock *tp = tcp_sk(sk); 3331 struct tcp_sock *tp = tcp_sk(sk);
3288 3332
3289 if (tp->ca_state == TCP_CA_Open && 3333 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3290 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { 3334 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3291 /* Limited by application or receiver window. */ 3335 /* Limited by application or receiver window. */
3292 u32 win_used = max(tp->snd_cwnd_used, 2U); 3336 u32 win_used = max(tp->snd_cwnd_used, 2U);
3293 if (win_used < tp->snd_cwnd) { 3337 if (win_used < tp->snd_cwnd) {
3294 tp->snd_ssthresh = tcp_current_ssthresh(tp); 3338 tp->snd_ssthresh = tcp_current_ssthresh(sk);
3295 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; 3339 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3296 } 3340 }
3297 tp->snd_cwnd_used = 0; 3341 tp->snd_cwnd_used = 0;
@@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3370 struct tcp_sock *tp = tcp_sk(sk); 3414 struct tcp_sock *tp = tcp_sk(sk);
3371 3415
3372 /* More than one full frame received... */ 3416 /* More than one full frame received... */
3373 if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss 3417 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
3374 /* ... and right edge of window advances far enough. 3418 /* ... and right edge of window advances far enough.
3375 * (tcp_recvmsg() will send ACK otherwise). Or... 3419 * (tcp_recvmsg() will send ACK otherwise). Or...
3376 */ 3420 */
3377 && __tcp_select_window(sk) >= tp->rcv_wnd) || 3421 && __tcp_select_window(sk) >= tp->rcv_wnd) ||
3378 /* We ACK each frame or... */ 3422 /* We ACK each frame or... */
3379 tcp_in_quickack_mode(tp) || 3423 tcp_in_quickack_mode(sk) ||
3380 /* We have out of order data. */ 3424 /* We have out of order data. */
3381 (ofo_possible && 3425 (ofo_possible &&
3382 skb_peek(&tp->out_of_order_queue))) { 3426 skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3390 3434
3391static __inline__ void tcp_ack_snd_check(struct sock *sk) 3435static __inline__ void tcp_ack_snd_check(struct sock *sk)
3392{ 3436{
3393 struct tcp_sock *tp = tcp_sk(sk); 3437 if (!inet_csk_ack_scheduled(sk)) {
3394 if (!tcp_ack_scheduled(tp)) {
3395 /* We sent a data segment already. */ 3438 /* We sent a data segment already. */
3396 return; 3439 return;
3397 } 3440 }
@@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
3462 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 3505 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
3463 tp->copied_seq++; 3506 tp->copied_seq++;
3464 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { 3507 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
3465 __skb_unlink(skb, skb->list); 3508 __skb_unlink(skb, &sk->sk_receive_queue);
3466 __kfree_skb(skb); 3509 __kfree_skb(skb);
3467 } 3510 }
3468 } 3511 }
@@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3645 tp->rcv_nxt == tp->rcv_wup) 3688 tp->rcv_nxt == tp->rcv_wup)
3646 tcp_store_ts_recent(tp); 3689 tcp_store_ts_recent(tp);
3647 3690
3648 tcp_rcv_rtt_measure_ts(tp, skb); 3691 tcp_rcv_rtt_measure_ts(sk, skb);
3649 3692
3650 /* We know that such packets are checksummed 3693 /* We know that such packets are checksummed
3651 * on entry. 3694 * on entry.
@@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3678 tp->rcv_nxt == tp->rcv_wup) 3721 tp->rcv_nxt == tp->rcv_wup)
3679 tcp_store_ts_recent(tp); 3722 tcp_store_ts_recent(tp);
3680 3723
3681 tcp_rcv_rtt_measure_ts(tp, skb); 3724 tcp_rcv_rtt_measure_ts(sk, skb);
3682 3725
3683 __skb_pull(skb, tcp_header_len); 3726 __skb_pull(skb, tcp_header_len);
3684 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 3727 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3699 tp->rcv_nxt == tp->rcv_wup) 3742 tp->rcv_nxt == tp->rcv_wup)
3700 tcp_store_ts_recent(tp); 3743 tcp_store_ts_recent(tp);
3701 3744
3702 tcp_rcv_rtt_measure_ts(tp, skb); 3745 tcp_rcv_rtt_measure_ts(sk, skb);
3703 3746
3704 if ((int)skb->truesize > sk->sk_forward_alloc) 3747 if ((int)skb->truesize > sk->sk_forward_alloc)
3705 goto step5; 3748 goto step5;
@@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3719 /* Well, only one small jumplet in fast path... */ 3762 /* Well, only one small jumplet in fast path... */
3720 tcp_ack(sk, skb, FLAG_DATA); 3763 tcp_ack(sk, skb, FLAG_DATA);
3721 tcp_data_snd_check(sk, tp); 3764 tcp_data_snd_check(sk, tp);
3722 if (!tcp_ack_scheduled(tp)) 3765 if (!inet_csk_ack_scheduled(sk))
3723 goto no_ack; 3766 goto no_ack;
3724 } 3767 }
3725 3768
@@ -3741,7 +3784,7 @@ slow_path:
3741 * RFC1323: H1. Apply PAWS check first. 3784 * RFC1323: H1. Apply PAWS check first.
3742 */ 3785 */
3743 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 3786 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
3744 tcp_paws_discard(tp, skb)) { 3787 tcp_paws_discard(sk, skb)) {
3745 if (!th->rst) { 3788 if (!th->rst) {
3746 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 3789 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
3747 tcp_send_dupack(sk, skb); 3790 tcp_send_dupack(sk, skb);
@@ -3788,7 +3831,7 @@ step5:
3788 if(th->ack) 3831 if(th->ack)
3789 tcp_ack(sk, skb, FLAG_SLOWPATH); 3832 tcp_ack(sk, skb, FLAG_SLOWPATH);
3790 3833
3791 tcp_rcv_rtt_measure_ts(tp, skb); 3834 tcp_rcv_rtt_measure_ts(sk, skb);
3792 3835
3793 /* Process urgent data. */ 3836 /* Process urgent data. */
3794 tcp_urg(sk, skb, th); 3837 tcp_urg(sk, skb, th);
@@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3817 tcp_parse_options(skb, &tp->rx_opt, 0); 3860 tcp_parse_options(skb, &tp->rx_opt, 0);
3818 3861
3819 if (th->ack) { 3862 if (th->ack) {
3863 struct inet_connection_sock *icsk;
3820 /* rfc793: 3864 /* rfc793:
3821 * "If the state is SYN-SENT then 3865 * "If the state is SYN-SENT then
3822 * first check the ACK bit 3866 * first check the ACK bit
@@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3920 3964
3921 tcp_init_metrics(sk); 3965 tcp_init_metrics(sk);
3922 3966
3923 tcp_init_congestion_control(tp); 3967 tcp_init_congestion_control(sk);
3924 3968
3925 /* Prevent spurious tcp_cwnd_restart() on first data 3969 /* Prevent spurious tcp_cwnd_restart() on first data
3926 * packet. 3970 * packet.
@@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3930 tcp_init_buffer_space(sk); 3974 tcp_init_buffer_space(sk);
3931 3975
3932 if (sock_flag(sk, SOCK_KEEPOPEN)) 3976 if (sock_flag(sk, SOCK_KEEPOPEN))
3933 tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); 3977 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3934 3978
3935 if (!tp->rx_opt.snd_wscale) 3979 if (!tp->rx_opt.snd_wscale)
3936 __tcp_fast_path_on(tp, tp->snd_wnd); 3980 __tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3942 sk_wake_async(sk, 0, POLL_OUT); 3986 sk_wake_async(sk, 0, POLL_OUT);
3943 } 3987 }
3944 3988
3945 if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { 3989 icsk = inet_csk(sk);
3990
3991 if (sk->sk_write_pending ||
3992 icsk->icsk_accept_queue.rskq_defer_accept ||
3993 icsk->icsk_ack.pingpong) {
3946 /* Save one ACK. Data will be ready after 3994 /* Save one ACK. Data will be ready after
3947 * several ticks, if write_pending is set. 3995 * several ticks, if write_pending is set.
3948 * 3996 *
@@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3950 * look so _wonderfully_ clever, that I was not able 3998 * look so _wonderfully_ clever, that I was not able
3951 * to stand against the temptation 8) --ANK 3999 * to stand against the temptation 8) --ANK
3952 */ 4000 */
3953 tcp_schedule_ack(tp); 4001 inet_csk_schedule_ack(sk);
3954 tp->ack.lrcvtime = tcp_time_stamp; 4002 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
3955 tp->ack.ato = TCP_ATO_MIN; 4003 icsk->icsk_ack.ato = TCP_ATO_MIN;
3956 tcp_incr_quickack(tp); 4004 tcp_incr_quickack(sk);
3957 tcp_enter_quickack_mode(tp); 4005 tcp_enter_quickack_mode(sk);
3958 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); 4006 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
4007 TCP_DELACK_MAX, TCP_RTO_MAX);
3959 4008
3960discard: 4009discard:
3961 __kfree_skb(skb); 4010 __kfree_skb(skb);
@@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4111 } 4160 }
4112 4161
4113 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 4162 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4114 tcp_paws_discard(tp, skb)) { 4163 tcp_paws_discard(sk, skb)) {
4115 if (!th->rst) { 4164 if (!th->rst) {
4116 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 4165 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
4117 tcp_send_dupack(sk, skb); 4166 tcp_send_dupack(sk, skb);
@@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4180 */ 4229 */
4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4230 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4182 !tp->srtt) 4231 !tp->srtt)
4183 tcp_ack_saw_tstamp(tp, 0, 0); 4232 tcp_ack_saw_tstamp(sk, NULL, 0);
4184 4233
4185 if (tp->rx_opt.tstamp_ok) 4234 if (tp->rx_opt.tstamp_ok)
4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4235 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4192 4241
4193 tcp_init_metrics(sk); 4242 tcp_init_metrics(sk);
4194 4243
4195 tcp_init_congestion_control(tp); 4244 tcp_init_congestion_control(sk);
4196 4245
4197 /* Prevent spurious tcp_cwnd_restart() on 4246 /* Prevent spurious tcp_cwnd_restart() on
4198 * first data packet. 4247 * first data packet.
@@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4227 return 1; 4276 return 1;
4228 } 4277 }
4229 4278
4230 tmo = tcp_fin_time(tp); 4279 tmo = tcp_fin_time(sk);
4231 if (tmo > TCP_TIMEWAIT_LEN) { 4280 if (tmo > TCP_TIMEWAIT_LEN) {
4232 tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); 4281 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4233 } else if (th->fin || sock_owned_by_user(sk)) { 4282 } else if (th->fin || sock_owned_by_user(sk)) {
4234 /* Bad case. We could lose such FIN otherwise. 4283 /* Bad case. We could lose such FIN otherwise.
4235 * It is not a big problem, but it looks confusing 4284 * It is not a big problem, but it looks confusing
@@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4237 * if it spins in bh_lock_sock(), but it is really 4286 * if it spins in bh_lock_sock(), but it is really
4238 * marginal case. 4287 * marginal case.
4239 */ 4288 */
4240 tcp_reset_keepalive_timer(sk, tmo); 4289 inet_csk_reset_keepalive_timer(sk, tmo);
4241 } else { 4290 } else {
4242 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 4291 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4243 goto discard; 4292 goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 67c670886c1f..13dfb391cdf1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
64#include <linux/times.h> 64#include <linux/times.h>
65 65
66#include <net/icmp.h> 66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
67#include <net/tcp.h> 68#include <net/tcp.h>
69#include <net/transp_v6.h>
68#include <net/ipv6.h> 70#include <net/ipv6.h>
69#include <net/inet_common.h> 71#include <net/inet_common.h>
70#include <net/xfrm.h> 72#include <net/xfrm.h>
@@ -75,7 +77,6 @@
75#include <linux/proc_fs.h> 77#include <linux/proc_fs.h>
76#include <linux/seq_file.h> 78#include <linux/seq_file.h>
77 79
78extern int sysctl_ip_dynaddr;
79int sysctl_tcp_tw_reuse; 80int sysctl_tcp_tw_reuse;
80int sysctl_tcp_low_latency; 81int sysctl_tcp_low_latency;
81 82
@@ -88,463 +89,29 @@ static struct socket *tcp_socket;
88void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, 89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89 struct sk_buff *skb); 90 struct sk_buff *skb);
90 91
91struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { 92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED, 93 .lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0), 94 .lhash_users = ATOMIC_INIT(0),
94 .__tcp_lhash_wait 95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), 96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED 97 .port_rover = 1024 - 1,
97}; 98};
98 99
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
107static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
109{
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
111 h ^= h >> 16;
112 h ^= h >> 8;
113 return h & (tcp_ehash_size - 1);
114}
115
116static __inline__ int tcp_sk_hashfn(struct sock *sk)
117{
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
123
124 return tcp_hashfn(laddr, lport, faddr, fport);
125}
126
127/* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
129 */
130struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131 unsigned short snum)
132{
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134 SLAB_ATOMIC);
135 if (tb) {
136 tb->port = snum;
137 tb->fastreuse = 0;
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
140 }
141 return tb;
142}
143
144/* Caller must hold hashbucket lock for this tb with local BH disabled */
145void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146{
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
150 }
151}
152
153/* Caller must disable local BH processing. */
154static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155{
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
159
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
165}
166
167inline void tcp_inherit_port(struct sock *sk, struct sock *child)
168{
169 local_bh_disable();
170 __tcp_inherit_port(sk, child);
171 local_bh_enable();
172}
173
174void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175 unsigned short snum)
176{
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
180}
181
182static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183{
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185 struct sock *sk2;
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
188
189 sk_for_each_bound(sk2, node, &tb->owners) {
190 if (sk != sk2 &&
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
200 break;
201 }
202 }
203 }
204 return node != NULL;
205}
206
207/* Obtain a reference to a local port for the given sock,
208 * if snum is zero it means select any available local port.
209 */
210static int tcp_v4_get_port(struct sock *sk, unsigned short snum) 100static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211{ 101{
212 struct tcp_bind_hashbucket *head; 102 return inet_csk_get_port(&tcp_hashinfo, sk, snum);
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (!snum) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225 if (tcp_port_rover < low)
226 rover = low;
227 else
228 rover = tcp_port_rover;
229 do {
230 rover++;
231 if (rover > high)
232 rover = low;
233 head = &tcp_bhash[tcp_bhashfn(rover)];
234 spin_lock(&head->lock);
235 tb_for_each(tb, node, &head->chain)
236 if (tb->port == rover)
237 goto next;
238 break;
239 next:
240 spin_unlock(&head->lock);
241 } while (--remaining > 0);
242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock);
244
245 /* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
250 */
251 ret = 1;
252 if (unlikely(remaining <= 0))
253 goto fail;
254
255 /* OK, here is the one we will use. HEAD is
256 * non-NULL and we hold it's mutex.
257 */
258 snum = rover;
259 } else {
260 head = &tcp_bhash[tcp_bhashfn(snum)];
261 spin_lock(&head->lock);
262 tb_for_each(tb, node, &head->chain)
263 if (tb->port == snum)
264 goto tb_found;
265 }
266 tb = NULL;
267 goto tb_not_found;
268tb_found:
269 if (!hlist_empty(&tb->owners)) {
270 if (sk->sk_reuse > 1)
271 goto success;
272 if (tb->fastreuse > 0 &&
273 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
274 goto success;
275 } else {
276 ret = 1;
277 if (tcp_bind_conflict(sk, tb))
278 goto fail_unlock;
279 }
280 }
281tb_not_found:
282 ret = 1;
283 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
284 goto fail_unlock;
285 if (hlist_empty(&tb->owners)) {
286 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
287 tb->fastreuse = 1;
288 else
289 tb->fastreuse = 0;
290 } else if (tb->fastreuse &&
291 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
292 tb->fastreuse = 0;
293success:
294 if (!tcp_sk(sk)->bind_hash)
295 tcp_bind_hash(sk, tb, snum);
296 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
297 ret = 0;
298
299fail_unlock:
300 spin_unlock(&head->lock);
301fail:
302 local_bh_enable();
303 return ret;
304}
305
306/* Get rid of any references to a local port held by the
307 * given sock.
308 */
309static void __tcp_put_port(struct sock *sk)
310{
311 struct inet_sock *inet = inet_sk(sk);
312 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
313 struct tcp_bind_bucket *tb;
314
315 spin_lock(&head->lock);
316 tb = tcp_sk(sk)->bind_hash;
317 __sk_del_bind_node(sk);
318 tcp_sk(sk)->bind_hash = NULL;
319 inet->num = 0;
320 tcp_bucket_destroy(tb);
321 spin_unlock(&head->lock);
322}
323
324void tcp_put_port(struct sock *sk)
325{
326 local_bh_disable();
327 __tcp_put_port(sk);
328 local_bh_enable();
329}
330
331/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
332 * Look, when several writers sleep and reader wakes them up, all but one
333 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
334 * this, _but_ remember, it adds useless work on UP machines (wake up each
335 * exclusive lock release). It should be ifdefed really.
336 */
337
338void tcp_listen_wlock(void)
339{
340 write_lock(&tcp_lhash_lock);
341
342 if (atomic_read(&tcp_lhash_users)) {
343 DEFINE_WAIT(wait);
344
345 for (;;) {
346 prepare_to_wait_exclusive(&tcp_lhash_wait,
347 &wait, TASK_UNINTERRUPTIBLE);
348 if (!atomic_read(&tcp_lhash_users))
349 break;
350 write_unlock_bh(&tcp_lhash_lock);
351 schedule();
352 write_lock_bh(&tcp_lhash_lock);
353 }
354
355 finish_wait(&tcp_lhash_wait, &wait);
356 }
357}
358
359static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
360{
361 struct hlist_head *list;
362 rwlock_t *lock;
363
364 BUG_TRAP(sk_unhashed(sk));
365 if (listen_possible && sk->sk_state == TCP_LISTEN) {
366 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
367 lock = &tcp_lhash_lock;
368 tcp_listen_wlock();
369 } else {
370 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
371 lock = &tcp_ehash[sk->sk_hashent].lock;
372 write_lock(lock);
373 }
374 __sk_add_node(sk, list);
375 sock_prot_inc_use(sk->sk_prot);
376 write_unlock(lock);
377 if (listen_possible && sk->sk_state == TCP_LISTEN)
378 wake_up(&tcp_lhash_wait);
379} 103}
380 104
381static void tcp_v4_hash(struct sock *sk) 105static void tcp_v4_hash(struct sock *sk)
382{ 106{
383 if (sk->sk_state != TCP_CLOSE) { 107 inet_hash(&tcp_hashinfo, sk);
384 local_bh_disable();
385 __tcp_v4_hash(sk, 1);
386 local_bh_enable();
387 }
388} 108}
389 109
390void tcp_unhash(struct sock *sk) 110void tcp_unhash(struct sock *sk)
391{ 111{
392 rwlock_t *lock; 112 inet_unhash(&tcp_hashinfo, sk);
393
394 if (sk_unhashed(sk))
395 goto ende;
396
397 if (sk->sk_state == TCP_LISTEN) {
398 local_bh_disable();
399 tcp_listen_wlock();
400 lock = &tcp_lhash_lock;
401 } else {
402 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
403 lock = &head->lock;
404 write_lock_bh(&head->lock);
405 }
406
407 if (__sk_del_node_init(sk))
408 sock_prot_dec_use(sk->sk_prot);
409 write_unlock_bh(lock);
410
411 ende:
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&tcp_lhash_wait);
414}
415
416/* Don't inline this cruft. Here are some nice properties to
417 * exploit here. The BSD API does not allow a listening TCP
418 * to specify the remote port nor the remote address for the
419 * connection. So always assume those are both wildcarded
420 * during the search since they can never be otherwise.
421 */
422static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
423 unsigned short hnum, int dif)
424{
425 struct sock *result = NULL, *sk;
426 struct hlist_node *node;
427 int score, hiscore;
428
429 hiscore=-1;
430 sk_for_each(sk, node, head) {
431 struct inet_sock *inet = inet_sk(sk);
432
433 if (inet->num == hnum && !ipv6_only_sock(sk)) {
434 __u32 rcv_saddr = inet->rcv_saddr;
435
436 score = (sk->sk_family == PF_INET ? 1 : 0);
437 if (rcv_saddr) {
438 if (rcv_saddr != daddr)
439 continue;
440 score+=2;
441 }
442 if (sk->sk_bound_dev_if) {
443 if (sk->sk_bound_dev_if != dif)
444 continue;
445 score+=2;
446 }
447 if (score == 5)
448 return sk;
449 if (score > hiscore) {
450 hiscore = score;
451 result = sk;
452 }
453 }
454 }
455 return result;
456}
457
458/* Optimize the common listener case. */
459static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
460 unsigned short hnum, int dif)
461{
462 struct sock *sk = NULL;
463 struct hlist_head *head;
464
465 read_lock(&tcp_lhash_lock);
466 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
467 if (!hlist_empty(head)) {
468 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
469
470 if (inet->num == hnum && !sk->sk_node.next &&
471 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
472 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
473 !sk->sk_bound_dev_if)
474 goto sherry_cache;
475 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
476 }
477 if (sk) {
478sherry_cache:
479 sock_hold(sk);
480 }
481 read_unlock(&tcp_lhash_lock);
482 return sk;
483} 113}
484 114
485/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487 *
488 * Local BH must be disabled here.
489 */
490
491static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 u32 daddr, u16 hnum,
493 int dif)
494{
495 struct tcp_ehash_bucket *head;
496 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
497 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
498 struct sock *sk;
499 struct hlist_node *node;
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyways.
502 */
503 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 sk_for_each(sk, node, &head->chain) {
507 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
509 }
510
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
513 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
514 goto hit;
515 }
516 sk = NULL;
517out:
518 read_unlock(&head->lock);
519 return sk;
520hit:
521 sock_hold(sk);
522 goto out;
523}
524
525static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
527{
528 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
529 daddr, hnum, dif);
530
531 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
532}
533
534inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
535 u16 dport, int dif)
536{
537 struct sock *sk;
538
539 local_bh_disable();
540 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
541 local_bh_enable();
542
543 return sk;
544}
545
546EXPORT_SYMBOL_GPL(tcp_v4_lookup);
547
548static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) 115static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
549{ 116{
550 return secure_tcp_sequence_number(skb->nh.iph->daddr, 117 return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
555 122
556/* called with local bh disabled */ 123/* called with local bh disabled */
557static int __tcp_v4_check_established(struct sock *sk, __u16 lport, 124static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
558 struct tcp_tw_bucket **twp) 125 struct inet_timewait_sock **twp)
559{ 126{
560 struct inet_sock *inet = inet_sk(sk); 127 struct inet_sock *inet = inet_sk(sk);
561 u32 daddr = inet->rcv_saddr; 128 u32 daddr = inet->rcv_saddr;
562 u32 saddr = inet->daddr; 129 u32 saddr = inet->daddr;
563 int dif = sk->sk_bound_dev_if; 130 int dif = sk->sk_bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) 131 INET_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); 132 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); 133 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash]; 134 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
568 struct sock *sk2; 135 struct sock *sk2;
569 struct hlist_node *node; 136 const struct hlist_node *node;
570 struct tcp_tw_bucket *tw; 137 struct inet_timewait_sock *tw;
571 138
572 write_lock(&head->lock); 139 write_lock(&head->lock);
573 140
574 /* Check TIME-WAIT sockets first. */ 141 /* Check TIME-WAIT sockets first. */
575 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { 142 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
576 tw = (struct tcp_tw_bucket *)sk2; 143 tw = inet_twsk(sk2);
577 144
578 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { 145 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
146 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
579 struct tcp_sock *tp = tcp_sk(sk); 147 struct tcp_sock *tp = tcp_sk(sk);
580 148
581 /* With PAWS, it is safe from the viewpoint 149 /* With PAWS, it is safe from the viewpoint
@@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
592 fall back to VJ's scheme and use initial 160 fall back to VJ's scheme and use initial
593 timestamp retrieved from peer table. 161 timestamp retrieved from peer table.
594 */ 162 */
595 if (tw->tw_ts_recent_stamp && 163 if (tcptw->tw_ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse && 164 (!twp || (sysctl_tcp_tw_reuse &&
597 xtime.tv_sec - 165 xtime.tv_sec -
598 tw->tw_ts_recent_stamp > 1))) { 166 tcptw->tw_ts_recent_stamp > 1))) {
599 if ((tp->write_seq = 167 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
600 tw->tw_snd_nxt + 65535 + 2) == 0) 168 if (tp->write_seq == 0)
601 tp->write_seq = 1; 169 tp->write_seq = 1;
602 tp->rx_opt.ts_recent = tw->tw_ts_recent; 170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
603 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; 171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
604 sock_hold(sk2); 172 sock_hold(sk2);
605 goto unique; 173 goto unique;
606 } else 174 } else
@@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
611 179
612 /* And established part... */ 180 /* And established part... */
613 sk_for_each(sk2, node, &head->chain) { 181 sk_for_each(sk2, node, &head->chain) {
614 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) 182 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
615 goto not_unique; 183 goto not_unique;
616 } 184 }
617 185
@@ -631,10 +199,10 @@ unique:
631 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 199 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
632 } else if (tw) { 200 } else if (tw) {
633 /* Silly. Should hash-dance instead... */ 201 /* Silly. Should hash-dance instead... */
634 tcp_tw_deschedule(tw); 202 inet_twsk_deschedule(tw, &tcp_death_row);
635 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); 203 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
636 204
637 tcp_tw_put(tw); 205 inet_twsk_put(tw);
638 } 206 }
639 207
640 return 0; 208 return 0;
@@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
657 */ 225 */
658static inline int tcp_v4_hash_connect(struct sock *sk) 226static inline int tcp_v4_hash_connect(struct sock *sk)
659{ 227{
660 unsigned short snum = inet_sk(sk)->num; 228 const unsigned short snum = inet_sk(sk)->num;
661 struct tcp_bind_hashbucket *head; 229 struct inet_bind_hashbucket *head;
662 struct tcp_bind_bucket *tb; 230 struct inet_bind_bucket *tb;
663 int ret; 231 int ret;
664 232
665 if (!snum) { 233 if (!snum) {
@@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
671 static u32 hint; 239 static u32 hint;
672 u32 offset = hint + connect_port_offset(sk); 240 u32 offset = hint + connect_port_offset(sk);
673 struct hlist_node *node; 241 struct hlist_node *node;
674 struct tcp_tw_bucket *tw = NULL; 242 struct inet_timewait_sock *tw = NULL;
675 243
676 local_bh_disable(); 244 local_bh_disable();
677 for (i = 1; i <= range; i++) { 245 for (i = 1; i <= range; i++) {
678 port = low + (i + offset) % range; 246 port = low + (i + offset) % range;
679 head = &tcp_bhash[tcp_bhashfn(port)]; 247 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
680 spin_lock(&head->lock); 248 spin_lock(&head->lock);
681 249
682 /* Does not bother with rcv_saddr checks, 250 /* Does not bother with rcv_saddr checks,
683 * because the established check is already 251 * because the established check is already
684 * unique enough. 252 * unique enough.
685 */ 253 */
686 tb_for_each(tb, node, &head->chain) { 254 inet_bind_bucket_for_each(tb, node, &head->chain) {
687 if (tb->port == port) { 255 if (tb->port == port) {
688 BUG_TRAP(!hlist_empty(&tb->owners)); 256 BUG_TRAP(!hlist_empty(&tb->owners));
689 if (tb->fastreuse >= 0) 257 if (tb->fastreuse >= 0)
@@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
696 } 264 }
697 } 265 }
698 266
699 tb = tcp_bucket_create(head, port); 267 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
700 if (!tb) { 268 if (!tb) {
701 spin_unlock(&head->lock); 269 spin_unlock(&head->lock);
702 break; 270 break;
@@ -715,27 +283,27 @@ ok:
715 hint += i; 283 hint += i;
716 284
717 /* Head lock still held and bh's disabled */ 285 /* Head lock still held and bh's disabled */
718 tcp_bind_hash(sk, tb, port); 286 inet_bind_hash(sk, tb, port);
719 if (sk_unhashed(sk)) { 287 if (sk_unhashed(sk)) {
720 inet_sk(sk)->sport = htons(port); 288 inet_sk(sk)->sport = htons(port);
721 __tcp_v4_hash(sk, 0); 289 __inet_hash(&tcp_hashinfo, sk, 0);
722 } 290 }
723 spin_unlock(&head->lock); 291 spin_unlock(&head->lock);
724 292
725 if (tw) { 293 if (tw) {
726 tcp_tw_deschedule(tw); 294 inet_twsk_deschedule(tw, &tcp_death_row);;
727 tcp_tw_put(tw); 295 inet_twsk_put(tw);
728 } 296 }
729 297
730 ret = 0; 298 ret = 0;
731 goto out; 299 goto out;
732 } 300 }
733 301
734 head = &tcp_bhash[tcp_bhashfn(snum)]; 302 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
735 tb = tcp_sk(sk)->bind_hash; 303 tb = inet_csk(sk)->icsk_bind_hash;
736 spin_lock_bh(&head->lock); 304 spin_lock_bh(&head->lock);
737 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 305 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
738 __tcp_v4_hash(sk, 0); 306 __inet_hash(&tcp_hashinfo, sk, 0);
739 spin_unlock_bh(&head->lock); 307 spin_unlock_bh(&head->lock);
740 return 0; 308 return 0;
741 } else { 309 } else {
@@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
798 tp->write_seq = 0; 366 tp->write_seq = 0;
799 } 367 }
800 368
801 if (sysctl_tcp_tw_recycle && 369 if (tcp_death_row.sysctl_tw_recycle &&
802 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { 370 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
803 struct inet_peer *peer = rt_get_peer(rt); 371 struct inet_peer *peer = rt_get_peer(rt);
804 372
@@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
837 goto failure; 405 goto failure;
838 406
839 /* OK, now commit destination to socket. */ 407 /* OK, now commit destination to socket. */
840 __sk_dst_set(sk, &rt->u.dst); 408 sk_setup_caps(sk, &rt->u.dst);
841 tcp_v4_setup_caps(sk, &rt->u.dst);
842 409
843 if (!tp->write_seq) 410 if (!tp->write_seq)
844 tp->write_seq = secure_tcp_sequence_number(inet->saddr, 411 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -864,53 +431,6 @@ failure:
864 return err; 431 return err;
865} 432}
866 433
867static __inline__ int tcp_v4_iif(struct sk_buff *skb)
868{
869 return ((struct rtable *)skb->dst)->rt_iif;
870}
871
872static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
873{
874 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
875}
876
877static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
878 struct request_sock ***prevp,
879 __u16 rport,
880 __u32 raddr, __u32 laddr)
881{
882 struct listen_sock *lopt = tp->accept_queue.listen_opt;
883 struct request_sock *req, **prev;
884
885 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
886 (req = *prev) != NULL;
887 prev = &req->dl_next) {
888 const struct inet_request_sock *ireq = inet_rsk(req);
889
890 if (ireq->rmt_port == rport &&
891 ireq->rmt_addr == raddr &&
892 ireq->loc_addr == laddr &&
893 TCP_INET_FAMILY(req->rsk_ops->family)) {
894 BUG_TRAP(!req->sk);
895 *prevp = prev;
896 break;
897 }
898 }
899
900 return req;
901}
902
903static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
904{
905 struct tcp_sock *tp = tcp_sk(sk);
906 struct listen_sock *lopt = tp->accept_queue.listen_opt;
907 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
908
909 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
910 tcp_synq_added(sk);
911}
912
913
914/* 434/*
915 * This routine does path mtu discovery as defined in RFC1191. 435 * This routine does path mtu discovery as defined in RFC1191.
916 */ 436 */
@@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
993 return; 513 return;
994 } 514 }
995 515
996 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, 516 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
997 th->source, tcp_v4_iif(skb)); 517 th->source, inet_iif(skb));
998 if (!sk) { 518 if (!sk) {
999 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); 519 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1000 return; 520 return;
1001 } 521 }
1002 if (sk->sk_state == TCP_TIME_WAIT) { 522 if (sk->sk_state == TCP_TIME_WAIT) {
1003 tcp_tw_put((struct tcp_tw_bucket *)sk); 523 inet_twsk_put((struct inet_timewait_sock *)sk);
1004 return; 524 return;
1005 } 525 }
1006 526
@@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1054 if (sock_owned_by_user(sk)) 574 if (sock_owned_by_user(sk))
1055 goto out; 575 goto out;
1056 576
1057 req = tcp_v4_search_req(tp, &prev, th->dest, 577 req = inet_csk_search_req(sk, &prev, th->dest,
1058 iph->daddr, iph->saddr); 578 iph->daddr, iph->saddr);
1059 if (!req) 579 if (!req)
1060 goto out; 580 goto out;
1061 581
@@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1075 * created socket, and POSIX does not want network 595 * created socket, and POSIX does not want network
1076 * errors returned from accept(). 596 * errors returned from accept().
1077 */ 597 */
1078 tcp_synq_drop(sk, req, prev); 598 inet_csk_reqsk_queue_drop(sk, req, prev);
1079 goto out; 599 goto out;
1080 600
1081 case TCP_SYN_SENT: 601 case TCP_SYN_SENT:
@@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1245 765
1246static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) 766static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1247{ 767{
1248 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; 768 struct inet_timewait_sock *tw = inet_twsk(sk);
769 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1249 770
1250 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, 771 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1251 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); 772 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
1252 773
1253 tcp_tw_put(tw); 774 inet_twsk_put(tw);
1254} 775}
1255 776
1256static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) 777static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1259 req->ts_recent); 780 req->ts_recent);
1260} 781}
1261 782
1262static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1263 struct request_sock *req)
1264{
1265 struct rtable *rt;
1266 const struct inet_request_sock *ireq = inet_rsk(req);
1267 struct ip_options *opt = inet_rsk(req)->opt;
1268 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269 .nl_u = { .ip4_u =
1270 { .daddr = ((opt && opt->srr) ?
1271 opt->faddr :
1272 ireq->rmt_addr),
1273 .saddr = ireq->loc_addr,
1274 .tos = RT_CONN_FLAGS(sk) } },
1275 .proto = IPPROTO_TCP,
1276 .uli_u = { .ports =
1277 { .sport = inet_sk(sk)->sport,
1278 .dport = ireq->rmt_port } } };
1279
1280 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1282 return NULL;
1283 }
1284 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1285 ip_rt_put(rt);
1286 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1287 return NULL;
1288 }
1289 return &rt->u.dst;
1290}
1291
1292/* 783/*
1293 * Send a SYN-ACK after having received an ACK. 784 * Send a SYN-ACK after having received an ACK.
1294 * This still operates on a request_sock only, not on a big 785 * This still operates on a request_sock only, not on a big
@@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1302 struct sk_buff * skb; 793 struct sk_buff * skb;
1303 794
1304 /* First, grab a route. */ 795 /* First, grab a route. */
1305 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) 796 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1306 goto out; 797 goto out;
1307 798
1308 skb = tcp_make_synack(sk, dst, req); 799 skb = tcp_make_synack(sk, dst, req);
@@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1404 * limitations, they conserve resources and peer is 895 * limitations, they conserve resources and peer is
1405 * evidently real one. 896 * evidently real one.
1406 */ 897 */
1407 if (tcp_synq_is_full(sk) && !isn) { 898 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1408#ifdef CONFIG_SYN_COOKIES 899#ifdef CONFIG_SYN_COOKIES
1409 if (sysctl_tcp_syncookies) { 900 if (sysctl_tcp_syncookies) {
1410 want_cookie = 1; 901 want_cookie = 1;
@@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1418 * clogging syn queue with openreqs with exponentially increasing 909 * clogging syn queue with openreqs with exponentially increasing
1419 * timeout. 910 * timeout.
1420 */ 911 */
1421 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 912 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1422 goto drop; 913 goto drop;
1423 914
1424 req = reqsk_alloc(&tcp_request_sock_ops); 915 req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1474 * are made in the function processing timewait state. 965 * are made in the function processing timewait state.
1475 */ 966 */
1476 if (tmp_opt.saw_tstamp && 967 if (tmp_opt.saw_tstamp &&
1477 sysctl_tcp_tw_recycle && 968 tcp_death_row.sysctl_tw_recycle &&
1478 (dst = tcp_v4_route_req(sk, req)) != NULL && 969 (dst = inet_csk_route_req(sk, req)) != NULL &&
1479 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 970 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1480 peer->v4daddr == saddr) { 971 peer->v4daddr == saddr) {
1481 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && 972 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1488 } 979 }
1489 /* Kill the following clause, if you dislike this way. */ 980 /* Kill the following clause, if you dislike this way. */
1490 else if (!sysctl_tcp_syncookies && 981 else if (!sysctl_tcp_syncookies &&
1491 (sysctl_max_syn_backlog - tcp_synq_len(sk) < 982 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1492 (sysctl_max_syn_backlog >> 2)) && 983 (sysctl_max_syn_backlog >> 2)) &&
1493 (!peer || !peer->tcp_ts_stamp) && 984 (!peer || !peer->tcp_ts_stamp) &&
1494 (!dst || !dst_metric(dst, RTAX_RTT))) { 985 (!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1499 * to destinations, already remembered 990 * to destinations, already remembered
1500 * to the moment of synflood. 991 * to the moment of synflood.
1501 */ 992 */
1502 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " 993 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1503 "request from %u.%u." 994 "request from %u.%u.%u.%u/%u\n",
1504 "%u.%u/%u\n", 995 NIPQUAD(saddr),
1505 NIPQUAD(saddr), 996 ntohs(skb->h.th->source));
1506 ntohs(skb->h.th->source)));
1507 dst_release(dst); 997 dst_release(dst);
1508 goto drop_and_free; 998 goto drop_and_free;
1509 } 999 }
@@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1518 if (want_cookie) { 1008 if (want_cookie) {
1519 reqsk_free(req); 1009 reqsk_free(req);
1520 } else { 1010 } else {
1521 tcp_v4_synq_add(sk, req); 1011 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1522 } 1012 }
1523 return 0; 1013 return 0;
1524 1014
@@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1546 if (sk_acceptq_is_full(sk)) 1036 if (sk_acceptq_is_full(sk))
1547 goto exit_overflow; 1037 goto exit_overflow;
1548 1038
1549 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) 1039 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1550 goto exit; 1040 goto exit;
1551 1041
1552 newsk = tcp_create_openreq_child(sk, req, skb); 1042 newsk = tcp_create_openreq_child(sk, req, skb);
1553 if (!newsk) 1043 if (!newsk)
1554 goto exit; 1044 goto exit;
1555 1045
1556 newsk->sk_dst_cache = dst; 1046 sk_setup_caps(newsk, dst);
1557 tcp_v4_setup_caps(newsk, dst);
1558 1047
1559 newtp = tcp_sk(newsk); 1048 newtp = tcp_sk(newsk);
1560 newinet = inet_sk(newsk); 1049 newinet = inet_sk(newsk);
@@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1564 newinet->saddr = ireq->loc_addr; 1053 newinet->saddr = ireq->loc_addr;
1565 newinet->opt = ireq->opt; 1054 newinet->opt = ireq->opt;
1566 ireq->opt = NULL; 1055 ireq->opt = NULL;
1567 newinet->mc_index = tcp_v4_iif(skb); 1056 newinet->mc_index = inet_iif(skb);
1568 newinet->mc_ttl = skb->nh.iph->ttl; 1057 newinet->mc_ttl = skb->nh.iph->ttl;
1569 newtp->ext_header_len = 0; 1058 newtp->ext_header_len = 0;
1570 if (newinet->opt) 1059 if (newinet->opt)
@@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1575 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1064 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1576 tcp_initialize_rcv_mss(newsk); 1065 tcp_initialize_rcv_mss(newsk);
1577 1066
1578 __tcp_v4_hash(newsk, 0); 1067 __inet_hash(&tcp_hashinfo, newsk, 0);
1579 __tcp_inherit_port(sk, newsk); 1068 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1580 1069
1581 return newsk; 1070 return newsk;
1582 1071
@@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1592{ 1081{
1593 struct tcphdr *th = skb->h.th; 1082 struct tcphdr *th = skb->h.th;
1594 struct iphdr *iph = skb->nh.iph; 1083 struct iphdr *iph = skb->nh.iph;
1595 struct tcp_sock *tp = tcp_sk(sk);
1596 struct sock *nsk; 1084 struct sock *nsk;
1597 struct request_sock **prev; 1085 struct request_sock **prev;
1598 /* Find possible connection requests. */ 1086 /* Find possible connection requests. */
1599 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, 1087 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1600 iph->saddr, iph->daddr); 1088 iph->saddr, iph->daddr);
1601 if (req) 1089 if (req)
1602 return tcp_check_req(sk, skb, req, prev); 1090 return tcp_check_req(sk, skb, req, prev);
1603 1091
1604 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, 1092 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1605 th->source, 1093 th->source, skb->nh.iph->daddr,
1606 skb->nh.iph->daddr, 1094 ntohs(th->dest), inet_iif(skb));
1607 ntohs(th->dest),
1608 tcp_v4_iif(skb));
1609 1095
1610 if (nsk) { 1096 if (nsk) {
1611 if (nsk->sk_state != TCP_TIME_WAIT) { 1097 if (nsk->sk_state != TCP_TIME_WAIT) {
1612 bh_lock_sock(nsk); 1098 bh_lock_sock(nsk);
1613 return nsk; 1099 return nsk;
1614 } 1100 }
1615 tcp_tw_put((struct tcp_tw_bucket *)nsk); 1101 inet_twsk_put((struct inet_timewait_sock *)nsk);
1616 return NULL; 1102 return NULL;
1617 } 1103 }
1618 1104
@@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
1631 skb->nh.iph->daddr, skb->csum)) 1117 skb->nh.iph->daddr, skb->csum))
1632 return 0; 1118 return 0;
1633 1119
1634 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); 1120 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1635 skb->ip_summed = CHECKSUM_NONE; 1121 skb->ip_summed = CHECKSUM_NONE;
1636 } 1122 }
1637 if (skb->len <= 76) { 1123 if (skb->len <= 76) {
@@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
1747 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; 1233 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1748 TCP_SKB_CB(skb)->sacked = 0; 1234 TCP_SKB_CB(skb)->sacked = 0;
1749 1235
1750 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, 1236 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1751 skb->nh.iph->daddr, ntohs(th->dest), 1237 skb->nh.iph->daddr, ntohs(th->dest),
1752 tcp_v4_iif(skb)); 1238 inet_iif(skb));
1753 1239
1754 if (!sk) 1240 if (!sk)
1755 goto no_tcp_socket; 1241 goto no_tcp_socket;
@@ -1801,24 +1287,26 @@ discard_and_relse:
1801 1287
1802do_time_wait: 1288do_time_wait:
1803 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 1289 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1804 tcp_tw_put((struct tcp_tw_bucket *) sk); 1290 inet_twsk_put((struct inet_timewait_sock *) sk);
1805 goto discard_it; 1291 goto discard_it;
1806 } 1292 }
1807 1293
1808 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { 1294 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1809 TCP_INC_STATS_BH(TCP_MIB_INERRS); 1295 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1810 tcp_tw_put((struct tcp_tw_bucket *) sk); 1296 inet_twsk_put((struct inet_timewait_sock *) sk);
1811 goto discard_it; 1297 goto discard_it;
1812 } 1298 }
1813 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, 1299 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1814 skb, th, skb->len)) { 1300 skb, th)) {
1815 case TCP_TW_SYN: { 1301 case TCP_TW_SYN: {
1816 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, 1302 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1817 ntohs(th->dest), 1303 skb->nh.iph->daddr,
1818 tcp_v4_iif(skb)); 1304 ntohs(th->dest),
1305 inet_iif(skb));
1819 if (sk2) { 1306 if (sk2) {
1820 tcp_tw_deschedule((struct tcp_tw_bucket *)sk); 1307 inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1821 tcp_tw_put((struct tcp_tw_bucket *)sk); 1308 &tcp_death_row);
1309 inet_twsk_put((struct inet_timewait_sock *)sk);
1822 sk = sk2; 1310 sk = sk2;
1823 goto process; 1311 goto process;
1824 } 1312 }
@@ -1834,112 +1322,6 @@ do_time_wait:
1834 goto discard_it; 1322 goto discard_it;
1835} 1323}
1836 1324
1837/* With per-bucket locks this operation is not-atomic, so that
1838 * this version is not worse.
1839 */
1840static void __tcp_v4_rehash(struct sock *sk)
1841{
1842 sk->sk_prot->unhash(sk);
1843 sk->sk_prot->hash(sk);
1844}
1845
1846static int tcp_v4_reselect_saddr(struct sock *sk)
1847{
1848 struct inet_sock *inet = inet_sk(sk);
1849 int err;
1850 struct rtable *rt;
1851 __u32 old_saddr = inet->saddr;
1852 __u32 new_saddr;
1853 __u32 daddr = inet->daddr;
1854
1855 if (inet->opt && inet->opt->srr)
1856 daddr = inet->opt->faddr;
1857
1858 /* Query new route. */
1859 err = ip_route_connect(&rt, daddr, 0,
1860 RT_CONN_FLAGS(sk),
1861 sk->sk_bound_dev_if,
1862 IPPROTO_TCP,
1863 inet->sport, inet->dport, sk);
1864 if (err)
1865 return err;
1866
1867 __sk_dst_set(sk, &rt->u.dst);
1868 tcp_v4_setup_caps(sk, &rt->u.dst);
1869
1870 new_saddr = rt->rt_src;
1871
1872 if (new_saddr == old_saddr)
1873 return 0;
1874
1875 if (sysctl_ip_dynaddr > 1) {
1876 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1877 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1878 NIPQUAD(old_saddr),
1879 NIPQUAD(new_saddr));
1880 }
1881
1882 inet->saddr = new_saddr;
1883 inet->rcv_saddr = new_saddr;
1884
1885 /* XXX The only one ugly spot where we need to
1886 * XXX really change the sockets identity after
1887 * XXX it has entered the hashes. -DaveM
1888 *
1889 * Besides that, it does not check for connection
1890 * uniqueness. Wait for troubles.
1891 */
1892 __tcp_v4_rehash(sk);
1893 return 0;
1894}
1895
1896int tcp_v4_rebuild_header(struct sock *sk)
1897{
1898 struct inet_sock *inet = inet_sk(sk);
1899 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1900 u32 daddr;
1901 int err;
1902
1903 /* Route is OK, nothing to do. */
1904 if (rt)
1905 return 0;
1906
1907 /* Reroute. */
1908 daddr = inet->daddr;
1909 if (inet->opt && inet->opt->srr)
1910 daddr = inet->opt->faddr;
1911
1912 {
1913 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1914 .nl_u = { .ip4_u =
1915 { .daddr = daddr,
1916 .saddr = inet->saddr,
1917 .tos = RT_CONN_FLAGS(sk) } },
1918 .proto = IPPROTO_TCP,
1919 .uli_u = { .ports =
1920 { .sport = inet->sport,
1921 .dport = inet->dport } } };
1922
1923 err = ip_route_output_flow(&rt, &fl, sk, 0);
1924 }
1925 if (!err) {
1926 __sk_dst_set(sk, &rt->u.dst);
1927 tcp_v4_setup_caps(sk, &rt->u.dst);
1928 return 0;
1929 }
1930
1931 /* Routing failed... */
1932 sk->sk_route_caps = 0;
1933
1934 if (!sysctl_ip_dynaddr ||
1935 sk->sk_state != TCP_SYN_SENT ||
1936 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1937 (err = tcp_v4_reselect_saddr(sk)) != 0)
1938 sk->sk_err_soft = -err;
1939
1940 return err;
1941}
1942
1943static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) 1325static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1944{ 1326{
1945 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; 1327 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
1988 return 0; 1370 return 0;
1989} 1371}
1990 1372
1991int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) 1373int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1992{ 1374{
1993 struct inet_peer *peer = NULL; 1375 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1994
1995 peer = inet_getpeer(tw->tw_daddr, 1);
1996 1376
1997 if (peer) { 1377 if (peer) {
1998 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || 1378 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1379
1380 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1999 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && 1381 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2000 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { 1382 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
2001 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; 1383 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
2002 peer->tcp_ts = tw->tw_ts_recent; 1384 peer->tcp_ts = tcptw->tw_ts_recent;
2003 } 1385 }
2004 inet_putpeer(peer); 1386 inet_putpeer(peer);
2005 return 1; 1387 return 1;
@@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
2011struct tcp_func ipv4_specific = { 1393struct tcp_func ipv4_specific = {
2012 .queue_xmit = ip_queue_xmit, 1394 .queue_xmit = ip_queue_xmit,
2013 .send_check = tcp_v4_send_check, 1395 .send_check = tcp_v4_send_check,
2014 .rebuild_header = tcp_v4_rebuild_header, 1396 .rebuild_header = inet_sk_rebuild_header,
2015 .conn_request = tcp_v4_conn_request, 1397 .conn_request = tcp_v4_conn_request,
2016 .syn_recv_sock = tcp_v4_syn_recv_sock, 1398 .syn_recv_sock = tcp_v4_syn_recv_sock,
2017 .remember_stamp = tcp_v4_remember_stamp, 1399 .remember_stamp = tcp_v4_remember_stamp,
@@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = {
2027 */ 1409 */
2028static int tcp_v4_init_sock(struct sock *sk) 1410static int tcp_v4_init_sock(struct sock *sk)
2029{ 1411{
1412 struct inet_connection_sock *icsk = inet_csk(sk);
2030 struct tcp_sock *tp = tcp_sk(sk); 1413 struct tcp_sock *tp = tcp_sk(sk);
2031 1414
2032 skb_queue_head_init(&tp->out_of_order_queue); 1415 skb_queue_head_init(&tp->out_of_order_queue);
2033 tcp_init_xmit_timers(sk); 1416 tcp_init_xmit_timers(sk);
2034 tcp_prequeue_init(tp); 1417 tcp_prequeue_init(tp);
2035 1418
2036 tp->rto = TCP_TIMEOUT_INIT; 1419 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2037 tp->mdev = TCP_TIMEOUT_INIT; 1420 tp->mdev = TCP_TIMEOUT_INIT;
2038 1421
2039 /* So many TCP implementations out there (incorrectly) count the 1422 /* So many TCP implementations out there (incorrectly) count the
@@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2051 tp->mss_cache = 536; 1434 tp->mss_cache = 536;
2052 1435
2053 tp->reordering = sysctl_tcp_reordering; 1436 tp->reordering = sysctl_tcp_reordering;
2054 tp->ca_ops = &tcp_init_congestion_ops; 1437 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
2055 1438
2056 sk->sk_state = TCP_CLOSE; 1439 sk->sk_state = TCP_CLOSE;
2057 1440
@@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
2074 1457
2075 tcp_clear_xmit_timers(sk); 1458 tcp_clear_xmit_timers(sk);
2076 1459
2077 tcp_cleanup_congestion_control(tp); 1460 tcp_cleanup_congestion_control(sk);
2078 1461
2079 /* Cleanup up the write buffer. */ 1462 /* Cleanup up the write buffer. */
2080 sk_stream_writequeue_purge(sk); 1463 sk_stream_writequeue_purge(sk);
@@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2086 __skb_queue_purge(&tp->ucopy.prequeue); 1469 __skb_queue_purge(&tp->ucopy.prequeue);
2087 1470
2088 /* Clean up a referenced TCP bind bucket. */ 1471 /* Clean up a referenced TCP bind bucket. */
2089 if (tp->bind_hash) 1472 if (inet_csk(sk)->icsk_bind_hash)
2090 tcp_put_port(sk); 1473 inet_put_port(&tcp_hashinfo, sk);
2091 1474
2092 /* 1475 /*
2093 * If sendmsg cached page exists, toss it. 1476 * If sendmsg cached page exists, toss it.
@@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
2107#ifdef CONFIG_PROC_FS 1490#ifdef CONFIG_PROC_FS
2108/* Proc filesystem TCP sock list dumping. */ 1491/* Proc filesystem TCP sock list dumping. */
2109 1492
2110static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) 1493static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
2111{ 1494{
2112 return hlist_empty(head) ? NULL : 1495 return hlist_empty(head) ? NULL :
2113 list_entry(head->first, struct tcp_tw_bucket, tw_node); 1496 list_entry(head->first, struct inet_timewait_sock, tw_node);
2114} 1497}
2115 1498
2116static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) 1499static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2117{ 1500{
2118 return tw->tw_node.next ? 1501 return tw->tw_node.next ?
2119 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1502 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2121 1504
2122static void *listening_get_next(struct seq_file *seq, void *cur) 1505static void *listening_get_next(struct seq_file *seq, void *cur)
2123{ 1506{
2124 struct tcp_sock *tp; 1507 struct inet_connection_sock *icsk;
2125 struct hlist_node *node; 1508 struct hlist_node *node;
2126 struct sock *sk = cur; 1509 struct sock *sk = cur;
2127 struct tcp_iter_state* st = seq->private; 1510 struct tcp_iter_state* st = seq->private;
2128 1511
2129 if (!sk) { 1512 if (!sk) {
2130 st->bucket = 0; 1513 st->bucket = 0;
2131 sk = sk_head(&tcp_listening_hash[0]); 1514 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
2132 goto get_sk; 1515 goto get_sk;
2133 } 1516 }
2134 1517
@@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2137 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1520 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2138 struct request_sock *req = cur; 1521 struct request_sock *req = cur;
2139 1522
2140 tp = tcp_sk(st->syn_wait_sk); 1523 icsk = inet_csk(st->syn_wait_sk);
2141 req = req->dl_next; 1524 req = req->dl_next;
2142 while (1) { 1525 while (1) {
2143 while (req) { 1526 while (req) {
@@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2150 if (++st->sbucket >= TCP_SYNQ_HSIZE) 1533 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2151 break; 1534 break;
2152get_req: 1535get_req:
2153 req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; 1536 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2154 } 1537 }
2155 sk = sk_next(st->syn_wait_sk); 1538 sk = sk_next(st->syn_wait_sk);
2156 st->state = TCP_SEQ_STATE_LISTENING; 1539 st->state = TCP_SEQ_STATE_LISTENING;
2157 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1540 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2158 } else { 1541 } else {
2159 tp = tcp_sk(sk); 1542 icsk = inet_csk(sk);
2160 read_lock_bh(&tp->accept_queue.syn_wait_lock); 1543 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2161 if (reqsk_queue_len(&tp->accept_queue)) 1544 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2162 goto start_req; 1545 goto start_req;
2163 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1546 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2164 sk = sk_next(sk); 1547 sk = sk_next(sk);
2165 } 1548 }
2166get_sk: 1549get_sk:
@@ -2169,9 +1552,9 @@ get_sk:
2169 cur = sk; 1552 cur = sk;
2170 goto out; 1553 goto out;
2171 } 1554 }
2172 tp = tcp_sk(sk); 1555 icsk = inet_csk(sk);
2173 read_lock_bh(&tp->accept_queue.syn_wait_lock); 1556 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2174 if (reqsk_queue_len(&tp->accept_queue)) { 1557 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2175start_req: 1558start_req:
2176 st->uid = sock_i_uid(sk); 1559 st->uid = sock_i_uid(sk);
2177 st->syn_wait_sk = sk; 1560 st->syn_wait_sk = sk;
@@ -2179,10 +1562,10 @@ start_req:
2179 st->sbucket = 0; 1562 st->sbucket = 0;
2180 goto get_req; 1563 goto get_req;
2181 } 1564 }
2182 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1565 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2183 } 1566 }
2184 if (++st->bucket < TCP_LHTABLE_SIZE) { 1567 if (++st->bucket < INET_LHTABLE_SIZE) {
2185 sk = sk_head(&tcp_listening_hash[st->bucket]); 1568 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2186 goto get_sk; 1569 goto get_sk;
2187 } 1570 }
2188 cur = NULL; 1571 cur = NULL;
@@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq)
2206 struct tcp_iter_state* st = seq->private; 1589 struct tcp_iter_state* st = seq->private;
2207 void *rc = NULL; 1590 void *rc = NULL;
2208 1591
2209 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { 1592 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2210 struct sock *sk; 1593 struct sock *sk;
2211 struct hlist_node *node; 1594 struct hlist_node *node;
2212 struct tcp_tw_bucket *tw; 1595 struct inet_timewait_sock *tw;
2213 1596
2214 /* We can reschedule _before_ having picked the target: */ 1597 /* We can reschedule _before_ having picked the target: */
2215 cond_resched_softirq(); 1598 cond_resched_softirq();
2216 1599
2217 read_lock(&tcp_ehash[st->bucket].lock); 1600 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2218 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { 1601 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2219 if (sk->sk_family != st->family) { 1602 if (sk->sk_family != st->family) {
2220 continue; 1603 continue;
2221 } 1604 }
@@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq)
2223 goto out; 1606 goto out;
2224 } 1607 }
2225 st->state = TCP_SEQ_STATE_TIME_WAIT; 1608 st->state = TCP_SEQ_STATE_TIME_WAIT;
2226 tw_for_each(tw, node, 1609 inet_twsk_for_each(tw, node,
2227 &tcp_ehash[st->bucket + tcp_ehash_size].chain) { 1610 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2228 if (tw->tw_family != st->family) { 1611 if (tw->tw_family != st->family) {
2229 continue; 1612 continue;
2230 } 1613 }
2231 rc = tw; 1614 rc = tw;
2232 goto out; 1615 goto out;
2233 } 1616 }
2234 read_unlock(&tcp_ehash[st->bucket].lock); 1617 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2235 st->state = TCP_SEQ_STATE_ESTABLISHED; 1618 st->state = TCP_SEQ_STATE_ESTABLISHED;
2236 } 1619 }
2237out: 1620out:
@@ -2241,7 +1624,7 @@ out:
2241static void *established_get_next(struct seq_file *seq, void *cur) 1624static void *established_get_next(struct seq_file *seq, void *cur)
2242{ 1625{
2243 struct sock *sk = cur; 1626 struct sock *sk = cur;
2244 struct tcp_tw_bucket *tw; 1627 struct inet_timewait_sock *tw;
2245 struct hlist_node *node; 1628 struct hlist_node *node;
2246 struct tcp_iter_state* st = seq->private; 1629 struct tcp_iter_state* st = seq->private;
2247 1630
@@ -2258,15 +1641,15 @@ get_tw:
2258 cur = tw; 1641 cur = tw;
2259 goto out; 1642 goto out;
2260 } 1643 }
2261 read_unlock(&tcp_ehash[st->bucket].lock); 1644 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2262 st->state = TCP_SEQ_STATE_ESTABLISHED; 1645 st->state = TCP_SEQ_STATE_ESTABLISHED;
2263 1646
2264 /* We can reschedule between buckets: */ 1647 /* We can reschedule between buckets: */
2265 cond_resched_softirq(); 1648 cond_resched_softirq();
2266 1649
2267 if (++st->bucket < tcp_ehash_size) { 1650 if (++st->bucket < tcp_hashinfo.ehash_size) {
2268 read_lock(&tcp_ehash[st->bucket].lock); 1651 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2269 sk = sk_head(&tcp_ehash[st->bucket].chain); 1652 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2270 } else { 1653 } else {
2271 cur = NULL; 1654 cur = NULL;
2272 goto out; 1655 goto out;
@@ -2280,7 +1663,7 @@ get_tw:
2280 } 1663 }
2281 1664
2282 st->state = TCP_SEQ_STATE_TIME_WAIT; 1665 st->state = TCP_SEQ_STATE_TIME_WAIT;
2283 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); 1666 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2284 goto get_tw; 1667 goto get_tw;
2285found: 1668found:
2286 cur = sk; 1669 cur = sk;
@@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2304 void *rc; 1687 void *rc;
2305 struct tcp_iter_state* st = seq->private; 1688 struct tcp_iter_state* st = seq->private;
2306 1689
2307 tcp_listen_lock(); 1690 inet_listen_lock(&tcp_hashinfo);
2308 st->state = TCP_SEQ_STATE_LISTENING; 1691 st->state = TCP_SEQ_STATE_LISTENING;
2309 rc = listening_get_idx(seq, &pos); 1692 rc = listening_get_idx(seq, &pos);
2310 1693
2311 if (!rc) { 1694 if (!rc) {
2312 tcp_listen_unlock(); 1695 inet_listen_unlock(&tcp_hashinfo);
2313 local_bh_disable(); 1696 local_bh_disable();
2314 st->state = TCP_SEQ_STATE_ESTABLISHED; 1697 st->state = TCP_SEQ_STATE_ESTABLISHED;
2315 rc = established_get_idx(seq, pos); 1698 rc = established_get_idx(seq, pos);
@@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2342 case TCP_SEQ_STATE_LISTENING: 1725 case TCP_SEQ_STATE_LISTENING:
2343 rc = listening_get_next(seq, v); 1726 rc = listening_get_next(seq, v);
2344 if (!rc) { 1727 if (!rc) {
2345 tcp_listen_unlock(); 1728 inet_listen_unlock(&tcp_hashinfo);
2346 local_bh_disable(); 1729 local_bh_disable();
2347 st->state = TCP_SEQ_STATE_ESTABLISHED; 1730 st->state = TCP_SEQ_STATE_ESTABLISHED;
2348 rc = established_get_first(seq); 1731 rc = established_get_first(seq);
@@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2365 switch (st->state) { 1748 switch (st->state) {
2366 case TCP_SEQ_STATE_OPENREQ: 1749 case TCP_SEQ_STATE_OPENREQ:
2367 if (v) { 1750 if (v) {
2368 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); 1751 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2369 read_unlock_bh(&tp->accept_queue.syn_wait_lock); 1752 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2370 } 1753 }
2371 case TCP_SEQ_STATE_LISTENING: 1754 case TCP_SEQ_STATE_LISTENING:
2372 if (v != SEQ_START_TOKEN) 1755 if (v != SEQ_START_TOKEN)
2373 tcp_listen_unlock(); 1756 inet_listen_unlock(&tcp_hashinfo);
2374 break; 1757 break;
2375 case TCP_SEQ_STATE_TIME_WAIT: 1758 case TCP_SEQ_STATE_TIME_WAIT:
2376 case TCP_SEQ_STATE_ESTABLISHED: 1759 case TCP_SEQ_STATE_ESTABLISHED:
2377 if (v) 1760 if (v)
2378 read_unlock(&tcp_ehash[st->bucket].lock); 1761 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2379 local_bh_enable(); 1762 local_bh_enable();
2380 break; 1763 break;
2381 } 1764 }
@@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2472 int timer_active; 1855 int timer_active;
2473 unsigned long timer_expires; 1856 unsigned long timer_expires;
2474 struct tcp_sock *tp = tcp_sk(sp); 1857 struct tcp_sock *tp = tcp_sk(sp);
1858 const struct inet_connection_sock *icsk = inet_csk(sp);
2475 struct inet_sock *inet = inet_sk(sp); 1859 struct inet_sock *inet = inet_sk(sp);
2476 unsigned int dest = inet->daddr; 1860 unsigned int dest = inet->daddr;
2477 unsigned int src = inet->rcv_saddr; 1861 unsigned int src = inet->rcv_saddr;
2478 __u16 destp = ntohs(inet->dport); 1862 __u16 destp = ntohs(inet->dport);
2479 __u16 srcp = ntohs(inet->sport); 1863 __u16 srcp = ntohs(inet->sport);
2480 1864
2481 if (tp->pending == TCP_TIME_RETRANS) { 1865 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2482 timer_active = 1; 1866 timer_active = 1;
2483 timer_expires = tp->timeout; 1867 timer_expires = icsk->icsk_timeout;
2484 } else if (tp->pending == TCP_TIME_PROBE0) { 1868 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2485 timer_active = 4; 1869 timer_active = 4;
2486 timer_expires = tp->timeout; 1870 timer_expires = icsk->icsk_timeout;
2487 } else if (timer_pending(&sp->sk_timer)) { 1871 } else if (timer_pending(&sp->sk_timer)) {
2488 timer_active = 2; 1872 timer_active = 2;
2489 timer_expires = sp->sk_timer.expires; 1873 timer_expires = sp->sk_timer.expires;
@@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2498 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, 1882 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2499 timer_active, 1883 timer_active,
2500 jiffies_to_clock_t(timer_expires - jiffies), 1884 jiffies_to_clock_t(timer_expires - jiffies),
2501 tp->retransmits, 1885 icsk->icsk_retransmits,
2502 sock_i_uid(sp), 1886 sock_i_uid(sp),
2503 tp->probes_out, 1887 icsk->icsk_probes_out,
2504 sock_i_ino(sp), 1888 sock_i_ino(sp),
2505 atomic_read(&sp->sk_refcnt), sp, 1889 atomic_read(&sp->sk_refcnt), sp,
2506 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, 1890 icsk->icsk_rto,
1891 icsk->icsk_ack.ato,
1892 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2507 tp->snd_cwnd, 1893 tp->snd_cwnd,
2508 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); 1894 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2509} 1895}
2510 1896
2511static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) 1897static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2512{ 1898{
2513 unsigned int dest, src; 1899 unsigned int dest, src;
2514 __u16 destp, srcp; 1900 __u16 destp, srcp;
@@ -2588,7 +1974,7 @@ struct proto tcp_prot = {
2588 .close = tcp_close, 1974 .close = tcp_close,
2589 .connect = tcp_v4_connect, 1975 .connect = tcp_v4_connect,
2590 .disconnect = tcp_disconnect, 1976 .disconnect = tcp_disconnect,
2591 .accept = tcp_accept, 1977 .accept = inet_csk_accept,
2592 .ioctl = tcp_ioctl, 1978 .ioctl = tcp_ioctl,
2593 .init = tcp_v4_init_sock, 1979 .init = tcp_v4_init_sock,
2594 .destroy = tcp_v4_destroy_sock, 1980 .destroy = tcp_v4_destroy_sock,
@@ -2603,6 +1989,7 @@ struct proto tcp_prot = {
2603 .get_port = tcp_v4_get_port, 1989 .get_port = tcp_v4_get_port,
2604 .enter_memory_pressure = tcp_enter_memory_pressure, 1990 .enter_memory_pressure = tcp_enter_memory_pressure,
2605 .sockets_allocated = &tcp_sockets_allocated, 1991 .sockets_allocated = &tcp_sockets_allocated,
1992 .orphan_count = &tcp_orphan_count,
2606 .memory_allocated = &tcp_memory_allocated, 1993 .memory_allocated = &tcp_memory_allocated,
2607 .memory_pressure = &tcp_memory_pressure, 1994 .memory_pressure = &tcp_memory_pressure,
2608 .sysctl_mem = sysctl_tcp_mem, 1995 .sysctl_mem = sysctl_tcp_mem,
@@ -2610,6 +1997,7 @@ struct proto tcp_prot = {
2610 .sysctl_rmem = sysctl_tcp_rmem, 1997 .sysctl_rmem = sysctl_tcp_rmem,
2611 .max_header = MAX_TCP_HEADER, 1998 .max_header = MAX_TCP_HEADER,
2612 .obj_size = sizeof(struct tcp_sock), 1999 .obj_size = sizeof(struct tcp_sock),
2000 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2613 .rsk_prot = &tcp_request_sock_ops, 2001 .rsk_prot = &tcp_request_sock_ops,
2614}; 2002};
2615 2003
@@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
2631} 2019}
2632 2020
2633EXPORT_SYMBOL(ipv4_specific); 2021EXPORT_SYMBOL(ipv4_specific);
2634EXPORT_SYMBOL(tcp_bind_hash); 2022EXPORT_SYMBOL(inet_bind_bucket_create);
2635EXPORT_SYMBOL(tcp_bucket_create);
2636EXPORT_SYMBOL(tcp_hashinfo); 2023EXPORT_SYMBOL(tcp_hashinfo);
2637EXPORT_SYMBOL(tcp_inherit_port);
2638EXPORT_SYMBOL(tcp_listen_wlock);
2639EXPORT_SYMBOL(tcp_port_rover);
2640EXPORT_SYMBOL(tcp_prot); 2024EXPORT_SYMBOL(tcp_prot);
2641EXPORT_SYMBOL(tcp_put_port);
2642EXPORT_SYMBOL(tcp_unhash); 2025EXPORT_SYMBOL(tcp_unhash);
2643EXPORT_SYMBOL(tcp_v4_conn_request); 2026EXPORT_SYMBOL(tcp_v4_conn_request);
2644EXPORT_SYMBOL(tcp_v4_connect); 2027EXPORT_SYMBOL(tcp_v4_connect);
2645EXPORT_SYMBOL(tcp_v4_do_rcv); 2028EXPORT_SYMBOL(tcp_v4_do_rcv);
2646EXPORT_SYMBOL(tcp_v4_rebuild_header);
2647EXPORT_SYMBOL(tcp_v4_remember_stamp); 2029EXPORT_SYMBOL(tcp_v4_remember_stamp);
2648EXPORT_SYMBOL(tcp_v4_send_check); 2030EXPORT_SYMBOL(tcp_v4_send_check);
2649EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 2031EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b7..a88db28b0af7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
35#define SYNC_INIT 1 35#define SYNC_INIT 1
36#endif 36#endif
37 37
38int sysctl_tcp_tw_recycle;
39int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41int sysctl_tcp_syncookies = SYNC_INIT; 38int sysctl_tcp_syncookies = SYNC_INIT;
42int sysctl_tcp_abort_on_overflow; 39int sysctl_tcp_abort_on_overflow;
43 40
44static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); 41struct inet_timewait_death_row tcp_death_row = {
42 .sysctl_max_tw_buckets = NR_FILE * 2,
43 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
44 .death_lock = SPIN_LOCK_UNLOCKED,
45 .hashinfo = &tcp_hashinfo,
46 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
47 (unsigned long)&tcp_death_row),
48 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
49 inet_twdr_twkill_work,
50 &tcp_death_row),
51/* Short-time timewait calendar */
52
53 .twcal_hand = -1,
54 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
55 (unsigned long)&tcp_death_row),
56};
57
58EXPORT_SYMBOL_GPL(tcp_death_row);
45 59
46static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 60static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47{ 61{
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
52 return (seq == e_win && seq == end_seq); 66 return (seq == e_win && seq == end_seq);
53} 67}
54 68
55/* New-style handling of TIME_WAIT sockets. */
56
57int tcp_tw_count;
58
59
60/* Must be called with locally disabled BHs. */
61static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
62{
63 struct tcp_ehash_bucket *ehead;
64 struct tcp_bind_hashbucket *bhead;
65 struct tcp_bind_bucket *tb;
66
67 /* Unlink from established hashes. */
68 ehead = &tcp_ehash[tw->tw_hashent];
69 write_lock(&ehead->lock);
70 if (hlist_unhashed(&tw->tw_node)) {
71 write_unlock(&ehead->lock);
72 return;
73 }
74 __hlist_del(&tw->tw_node);
75 sk_node_init(&tw->tw_node);
76 write_unlock(&ehead->lock);
77
78 /* Disassociate with bind bucket. */
79 bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
80 spin_lock(&bhead->lock);
81 tb = tw->tw_tb;
82 __hlist_del(&tw->tw_bind_node);
83 tw->tw_tb = NULL;
84 tcp_bucket_destroy(tb);
85 spin_unlock(&bhead->lock);
86
87#ifdef INET_REFCNT_DEBUG
88 if (atomic_read(&tw->tw_refcnt) != 1) {
89 printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
90 atomic_read(&tw->tw_refcnt));
91 }
92#endif
93 tcp_tw_put(tw);
94}
95
96/* 69/*
97 * * Main purpose of TIME-WAIT state is to close connection gracefully, 70 * * Main purpose of TIME-WAIT state is to close connection gracefully,
98 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN 71 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
122 * to avoid misread sequence numbers, states etc. --ANK 95 * to avoid misread sequence numbers, states etc. --ANK
123 */ 96 */
124enum tcp_tw_status 97enum tcp_tw_status
125tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, 98tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
126 struct tcphdr *th, unsigned len) 99 const struct tcphdr *th)
127{ 100{
101 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
128 struct tcp_options_received tmp_opt; 102 struct tcp_options_received tmp_opt;
129 int paws_reject = 0; 103 int paws_reject = 0;
130 104
131 tmp_opt.saw_tstamp = 0; 105 tmp_opt.saw_tstamp = 0;
132 if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { 106 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
133 tcp_parse_options(skb, &tmp_opt, 0); 107 tcp_parse_options(skb, &tmp_opt, 0);
134 108
135 if (tmp_opt.saw_tstamp) { 109 if (tmp_opt.saw_tstamp) {
136 tmp_opt.ts_recent = tw->tw_ts_recent; 110 tmp_opt.ts_recent = tcptw->tw_ts_recent;
137 tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; 111 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 112 paws_reject = tcp_paws_check(&tmp_opt, th->rst);
139 } 113 }
140 } 114 }
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
145 /* Out of window, send ACK */ 119 /* Out of window, send ACK */
146 if (paws_reject || 120 if (paws_reject ||
147 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 121 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
148 tw->tw_rcv_nxt, 122 tcptw->tw_rcv_nxt,
149 tw->tw_rcv_nxt + tw->tw_rcv_wnd)) 123 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
150 return TCP_TW_ACK; 124 return TCP_TW_ACK;
151 125
152 if (th->rst) 126 if (th->rst)
153 goto kill; 127 goto kill;
154 128
155 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) 129 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
156 goto kill_with_rst; 130 goto kill_with_rst;
157 131
158 /* Dup ACK? */ 132 /* Dup ACK? */
159 if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || 133 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
160 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { 134 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
161 tcp_tw_put(tw); 135 inet_twsk_put(tw);
162 return TCP_TW_SUCCESS; 136 return TCP_TW_SUCCESS;
163 } 137 }
164 138
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
166 * reset. 140 * reset.
167 */ 141 */
168 if (!th->fin || 142 if (!th->fin ||
169 TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { 143 TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
170kill_with_rst: 144kill_with_rst:
171 tcp_tw_deschedule(tw); 145 inet_twsk_deschedule(tw, &tcp_death_row);
172 tcp_tw_put(tw); 146 inet_twsk_put(tw);
173 return TCP_TW_RST; 147 return TCP_TW_RST;
174 } 148 }
175 149
176 /* FIN arrived, enter true time-wait state. */ 150 /* FIN arrived, enter true time-wait state. */
177 tw->tw_substate = TCP_TIME_WAIT; 151 tw->tw_substate = TCP_TIME_WAIT;
178 tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; 152 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
179 if (tmp_opt.saw_tstamp) { 153 if (tmp_opt.saw_tstamp) {
180 tw->tw_ts_recent_stamp = xtime.tv_sec; 154 tcptw->tw_ts_recent_stamp = xtime.tv_sec;
181 tw->tw_ts_recent = tmp_opt.rcv_tsval; 155 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
182 } 156 }
183 157
184 /* I am shamed, but failed to make it more elegant. 158 /* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
187 * do not undertsnad recycling in any case, it not 161 * do not undertsnad recycling in any case, it not
188 * a big problem in practice. --ANK */ 162 * a big problem in practice. --ANK */
189 if (tw->tw_family == AF_INET && 163 if (tw->tw_family == AF_INET &&
190 sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && 164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
191 tcp_v4_tw_remember_stamp(tw)) 165 tcp_v4_tw_remember_stamp(tw))
192 tcp_tw_schedule(tw, tw->tw_timeout); 166 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
167 TCP_TIMEWAIT_LEN);
193 else 168 else
194 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 169 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
170 TCP_TIMEWAIT_LEN);
195 return TCP_TW_ACK; 171 return TCP_TW_ACK;
196 } 172 }
197 173
@@ -213,7 +189,7 @@ kill_with_rst:
213 */ 189 */
214 190
215 if (!paws_reject && 191 if (!paws_reject &&
216 (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && 192 (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
217 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { 193 (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
218 /* In window segment, it may be only reset or bare ack. */ 194 /* In window segment, it may be only reset or bare ack. */
219 195
@@ -224,19 +200,20 @@ kill_with_rst:
224 */ 200 */
225 if (sysctl_tcp_rfc1337 == 0) { 201 if (sysctl_tcp_rfc1337 == 0) {
226kill: 202kill:
227 tcp_tw_deschedule(tw); 203 inet_twsk_deschedule(tw, &tcp_death_row);
228 tcp_tw_put(tw); 204 inet_twsk_put(tw);
229 return TCP_TW_SUCCESS; 205 return TCP_TW_SUCCESS;
230 } 206 }
231 } 207 }
232 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 208 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
209 TCP_TIMEWAIT_LEN);
233 210
234 if (tmp_opt.saw_tstamp) { 211 if (tmp_opt.saw_tstamp) {
235 tw->tw_ts_recent = tmp_opt.rcv_tsval; 212 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
236 tw->tw_ts_recent_stamp = xtime.tv_sec; 213 tcptw->tw_ts_recent_stamp = xtime.tv_sec;
237 } 214 }
238 215
239 tcp_tw_put(tw); 216 inet_twsk_put(tw);
240 return TCP_TW_SUCCESS; 217 return TCP_TW_SUCCESS;
241 } 218 }
242 219
@@ -258,9 +235,10 @@ kill:
258 */ 235 */
259 236
260 if (th->syn && !th->rst && !th->ack && !paws_reject && 237 if (th->syn && !th->rst && !th->ack && !paws_reject &&
261 (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || 238 (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
262 (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { 239 (tmp_opt.saw_tstamp &&
263 u32 isn = tw->tw_snd_nxt + 65535 + 2; 240 (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
241 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
264 if (isn == 0) 242 if (isn == 0)
265 isn++; 243 isn++;
266 TCP_SKB_CB(skb)->when = isn; 244 TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
278 * Do not reschedule in the last case. 256 * Do not reschedule in the last case.
279 */ 257 */
280 if (paws_reject || th->ack) 258 if (paws_reject || th->ack)
281 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); 259 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
260 TCP_TIMEWAIT_LEN);
282 261
283 /* Send ACK. Note, we do not put the bucket, 262 /* Send ACK. Note, we do not put the bucket,
284 * it will be released by caller. 263 * it will be released by caller.
285 */ 264 */
286 return TCP_TW_ACK; 265 return TCP_TW_ACK;
287 } 266 }
288 tcp_tw_put(tw); 267 inet_twsk_put(tw);
289 return TCP_TW_SUCCESS; 268 return TCP_TW_SUCCESS;
290} 269}
291 270
292/* Enter the time wait state. This is called with locally disabled BH.
293 * Essentially we whip up a timewait bucket, copy the
294 * relevant info into it from the SK, and mess with hash chains
295 * and list linkage.
296 */
297static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
298{
299 struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
300 struct tcp_bind_hashbucket *bhead;
301
302 /* Step 1: Put TW into bind hash. Original socket stays there too.
303 Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
304 binding cache, even if it is closed.
305 */
306 bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
307 spin_lock(&bhead->lock);
308 tw->tw_tb = tcp_sk(sk)->bind_hash;
309 BUG_TRAP(tcp_sk(sk)->bind_hash);
310 tw_add_bind_node(tw, &tw->tw_tb->owners);
311 spin_unlock(&bhead->lock);
312
313 write_lock(&ehead->lock);
314
315 /* Step 2: Remove SK from established hash. */
316 if (__sk_del_node_init(sk))
317 sock_prot_dec_use(sk->sk_prot);
318
319 /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
320 tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
321 atomic_inc(&tw->tw_refcnt);
322
323 write_unlock(&ehead->lock);
324}
325
326/* 271/*
327 * Move a socket to time-wait or dead fin-wait-2 state. 272 * Move a socket to time-wait or dead fin-wait-2 state.
328 */ 273 */
329void tcp_time_wait(struct sock *sk, int state, int timeo) 274void tcp_time_wait(struct sock *sk, int state, int timeo)
330{ 275{
331 struct tcp_tw_bucket *tw = NULL; 276 struct inet_timewait_sock *tw = NULL;
332 struct tcp_sock *tp = tcp_sk(sk); 277 const struct tcp_sock *tp = tcp_sk(sk);
333 int recycle_ok = 0; 278 int recycle_ok = 0;
334 279
335 if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) 280 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
336 recycle_ok = tp->af_specific->remember_stamp(sk); 281 recycle_ok = tp->af_specific->remember_stamp(sk);
337 282
338 if (tcp_tw_count < sysctl_tcp_max_tw_buckets) 283 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
339 tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); 284 tw = inet_twsk_alloc(sk, state);
340
341 if(tw != NULL) {
342 struct inet_sock *inet = inet_sk(sk);
343 int rto = (tp->rto<<2) - (tp->rto>>1);
344
345 /* Give us an identity. */
346 tw->tw_daddr = inet->daddr;
347 tw->tw_rcv_saddr = inet->rcv_saddr;
348 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
349 tw->tw_num = inet->num;
350 tw->tw_state = TCP_TIME_WAIT;
351 tw->tw_substate = state;
352 tw->tw_sport = inet->sport;
353 tw->tw_dport = inet->dport;
354 tw->tw_family = sk->sk_family;
355 tw->tw_reuse = sk->sk_reuse;
356 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
357 atomic_set(&tw->tw_refcnt, 1);
358 285
359 tw->tw_hashent = sk->sk_hashent; 286 if (tw != NULL) {
360 tw->tw_rcv_nxt = tp->rcv_nxt; 287 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
361 tw->tw_snd_nxt = tp->snd_nxt; 288 const struct inet_connection_sock *icsk = inet_csk(sk);
362 tw->tw_rcv_wnd = tcp_receive_window(tp); 289 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
363 tw->tw_ts_recent = tp->rx_opt.ts_recent; 290
364 tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 291 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
365 tw_dead_node_init(tw); 292 tcptw->tw_rcv_nxt = tp->rcv_nxt;
293 tcptw->tw_snd_nxt = tp->snd_nxt;
294 tcptw->tw_rcv_wnd = tcp_receive_window(tp);
295 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
296 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
366 297
367#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 298#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
368 if (tw->tw_family == PF_INET6) { 299 if (tw->tw_family == PF_INET6) {
369 struct ipv6_pinfo *np = inet6_sk(sk); 300 struct ipv6_pinfo *np = inet6_sk(sk);
301 struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
370 302
371 ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); 303 ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
372 ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); 304 ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
373 tw->tw_v6_ipv6only = np->ipv6only; 305 tw->tw_ipv6only = np->ipv6only;
374 } else {
375 memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
376 memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
377 tw->tw_v6_ipv6only = 0;
378 } 306 }
379#endif 307#endif
380 /* Linkage updates. */ 308 /* Linkage updates. */
381 __tcp_tw_hashdance(sk, tw); 309 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
382 310
383 /* Get the TIME_WAIT timeout firing. */ 311 /* Get the TIME_WAIT timeout firing. */
384 if (timeo < rto) 312 if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
392 timeo = TCP_TIMEWAIT_LEN; 320 timeo = TCP_TIMEWAIT_LEN;
393 } 321 }
394 322
395 tcp_tw_schedule(tw, timeo); 323 inet_twsk_schedule(tw, &tcp_death_row, timeo,
396 tcp_tw_put(tw); 324 TCP_TIMEWAIT_LEN);
325 inet_twsk_put(tw);
397 } else { 326 } else {
398 /* Sorry, if we're out of memory, just CLOSE this 327 /* Sorry, if we're out of memory, just CLOSE this
399 * socket up. We've got bigger problems than 328 * socket up. We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
407 tcp_done(sk); 336 tcp_done(sk);
408} 337}
409 338
410/* Kill off TIME_WAIT sockets once their lifetime has expired. */
411static int tcp_tw_death_row_slot;
412
413static void tcp_twkill(unsigned long);
414
415/* TIME_WAIT reaping mechanism. */
416#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
417#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
418
419#define TCP_TWKILL_QUOTA 100
420
421static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
422static DEFINE_SPINLOCK(tw_death_lock);
423static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
424static void twkill_work(void *);
425static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
426static u32 twkill_thread_slots;
427
428/* Returns non-zero if quota exceeded. */
429static int tcp_do_twkill_work(int slot, unsigned int quota)
430{
431 struct tcp_tw_bucket *tw;
432 struct hlist_node *node;
433 unsigned int killed;
434 int ret;
435
436 /* NOTE: compare this to previous version where lock
437 * was released after detaching chain. It was racy,
438 * because tw buckets are scheduled in not serialized context
439 * in 2.3 (with netfilter), and with softnet it is common, because
440 * soft irqs are not sequenced.
441 */
442 killed = 0;
443 ret = 0;
444rescan:
445 tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
446 __tw_del_dead_node(tw);
447 spin_unlock(&tw_death_lock);
448 tcp_timewait_kill(tw);
449 tcp_tw_put(tw);
450 killed++;
451 spin_lock(&tw_death_lock);
452 if (killed > quota) {
453 ret = 1;
454 break;
455 }
456
457 /* While we dropped tw_death_lock, another cpu may have
458 * killed off the next TW bucket in the list, therefore
459 * do a fresh re-read of the hlist head node with the
460 * lock reacquired. We still use the hlist traversal
461 * macro in order to get the prefetches.
462 */
463 goto rescan;
464 }
465
466 tcp_tw_count -= killed;
467 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
468
469 return ret;
470}
471
472static void tcp_twkill(unsigned long dummy)
473{
474 int need_timer, ret;
475
476 spin_lock(&tw_death_lock);
477
478 if (tcp_tw_count == 0)
479 goto out;
480
481 need_timer = 0;
482 ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
483 if (ret) {
484 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
485 mb();
486 schedule_work(&tcp_twkill_work);
487 need_timer = 1;
488 } else {
489 /* We purged the entire slot, anything left? */
490 if (tcp_tw_count)
491 need_timer = 1;
492 }
493 tcp_tw_death_row_slot =
494 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
495 if (need_timer)
496 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
497out:
498 spin_unlock(&tw_death_lock);
499}
500
501extern void twkill_slots_invalid(void);
502
503static void twkill_work(void *dummy)
504{
505 int i;
506
507 if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
508 twkill_slots_invalid();
509
510 while (twkill_thread_slots) {
511 spin_lock_bh(&tw_death_lock);
512 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
513 if (!(twkill_thread_slots & (1 << i)))
514 continue;
515
516 while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
517 if (need_resched()) {
518 spin_unlock_bh(&tw_death_lock);
519 schedule();
520 spin_lock_bh(&tw_death_lock);
521 }
522 }
523
524 twkill_thread_slots &= ~(1 << i);
525 }
526 spin_unlock_bh(&tw_death_lock);
527 }
528}
529
530/* These are always called from BH context. See callers in
531 * tcp_input.c to verify this.
532 */
533
534/* This is for handling early-kills of TIME_WAIT sockets. */
535void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
536{
537 spin_lock(&tw_death_lock);
538 if (tw_del_dead_node(tw)) {
539 tcp_tw_put(tw);
540 if (--tcp_tw_count == 0)
541 del_timer(&tcp_tw_timer);
542 }
543 spin_unlock(&tw_death_lock);
544 tcp_timewait_kill(tw);
545}
546
547/* Short-time timewait calendar */
548
549static int tcp_twcal_hand = -1;
550static int tcp_twcal_jiffie;
551static void tcp_twcal_tick(unsigned long);
552static struct timer_list tcp_twcal_timer =
553 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
554static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
555
556static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
557{
558 struct hlist_head *list;
559 int slot;
560
561 /* timeout := RTO * 3.5
562 *
563 * 3.5 = 1+2+0.5 to wait for two retransmits.
564 *
565 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
566 * our ACK acking that FIN can be lost. If N subsequent retransmitted
567 * FINs (or previous seqments) are lost (probability of such event
568 * is p^(N+1), where p is probability to lose single packet and
569 * time to detect the loss is about RTO*(2^N - 1) with exponential
570 * backoff). Normal timewait length is calculated so, that we
571 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
572 * [ BTW Linux. following BSD, violates this requirement waiting
573 * only for 60sec, we should wait at least for 240 secs.
574 * Well, 240 consumes too much of resources 8)
575 * ]
576 * This interval is not reduced to catch old duplicate and
577 * responces to our wandering segments living for two MSLs.
578 * However, if we use PAWS to detect
579 * old duplicates, we can reduce the interval to bounds required
580 * by RTO, rather than MSL. So, if peer understands PAWS, we
581 * kill tw bucket after 3.5*RTO (it is important that this number
582 * is greater than TS tick!) and detect old duplicates with help
583 * of PAWS.
584 */
585 slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
586
587 spin_lock(&tw_death_lock);
588
589 /* Unlink it, if it was scheduled */
590 if (tw_del_dead_node(tw))
591 tcp_tw_count--;
592 else
593 atomic_inc(&tw->tw_refcnt);
594
595 if (slot >= TCP_TW_RECYCLE_SLOTS) {
596 /* Schedule to slow timer */
597 if (timeo >= TCP_TIMEWAIT_LEN) {
598 slot = TCP_TWKILL_SLOTS-1;
599 } else {
600 slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
601 if (slot >= TCP_TWKILL_SLOTS)
602 slot = TCP_TWKILL_SLOTS-1;
603 }
604 tw->tw_ttd = jiffies + timeo;
605 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
606 list = &tcp_tw_death_row[slot];
607 } else {
608 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
609
610 if (tcp_twcal_hand < 0) {
611 tcp_twcal_hand = 0;
612 tcp_twcal_jiffie = jiffies;
613 tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
614 add_timer(&tcp_twcal_timer);
615 } else {
616 if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
617 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
618 slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
619 }
620 list = &tcp_twcal_row[slot];
621 }
622
623 hlist_add_head(&tw->tw_death_node, list);
624
625 if (tcp_tw_count++ == 0)
626 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
627 spin_unlock(&tw_death_lock);
628}
629
630void tcp_twcal_tick(unsigned long dummy)
631{
632 int n, slot;
633 unsigned long j;
634 unsigned long now = jiffies;
635 int killed = 0;
636 int adv = 0;
637
638 spin_lock(&tw_death_lock);
639 if (tcp_twcal_hand < 0)
640 goto out;
641
642 slot = tcp_twcal_hand;
643 j = tcp_twcal_jiffie;
644
645 for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
646 if (time_before_eq(j, now)) {
647 struct hlist_node *node, *safe;
648 struct tcp_tw_bucket *tw;
649
650 tw_for_each_inmate_safe(tw, node, safe,
651 &tcp_twcal_row[slot]) {
652 __tw_del_dead_node(tw);
653 tcp_timewait_kill(tw);
654 tcp_tw_put(tw);
655 killed++;
656 }
657 } else {
658 if (!adv) {
659 adv = 1;
660 tcp_twcal_jiffie = j;
661 tcp_twcal_hand = slot;
662 }
663
664 if (!hlist_empty(&tcp_twcal_row[slot])) {
665 mod_timer(&tcp_twcal_timer, j);
666 goto out;
667 }
668 }
669 j += (1<<TCP_TW_RECYCLE_TICK);
670 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
671 }
672 tcp_twcal_hand = -1;
673
674out:
675 if ((tcp_tw_count -= killed) == 0)
676 del_timer(&tcp_tw_timer);
677 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
678 spin_unlock(&tw_death_lock);
679}
680
681/* This is not only more efficient than what we used to do, it eliminates 339/* This is not only more efficient than what we used to do, it eliminates
682 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 340 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
683 * 341 *
@@ -686,75 +344,27 @@ out:
686 */ 344 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 345struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
688{ 346{
689 /* allocate the newsk from the same slab of the master sock, 347 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
690 * if not, at sk_free time we'll try to free it from the wrong
691 * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693 348
694 if(newsk != NULL) { 349 if (newsk != NULL) {
695 struct inet_request_sock *ireq = inet_rsk(req); 350 const struct inet_request_sock *ireq = inet_rsk(req);
696 struct tcp_request_sock *treq = tcp_rsk(req); 351 struct tcp_request_sock *treq = tcp_rsk(req);
352 struct inet_connection_sock *newicsk = inet_csk(sk);
697 struct tcp_sock *newtp; 353 struct tcp_sock *newtp;
698 struct sk_filter *filter;
699
700 memcpy(newsk, sk, sizeof(struct tcp_sock));
701 newsk->sk_state = TCP_SYN_RECV;
702
703 /* SANITY */
704 sk_node_init(&newsk->sk_node);
705 tcp_sk(newsk)->bind_hash = NULL;
706
707 /* Clone the TCP header template */
708 inet_sk(newsk)->dport = ireq->rmt_port;
709
710 sock_lock_init(newsk);
711 bh_lock_sock(newsk);
712
713 rwlock_init(&newsk->sk_dst_lock);
714 atomic_set(&newsk->sk_rmem_alloc, 0);
715 skb_queue_head_init(&newsk->sk_receive_queue);
716 atomic_set(&newsk->sk_wmem_alloc, 0);
717 skb_queue_head_init(&newsk->sk_write_queue);
718 atomic_set(&newsk->sk_omem_alloc, 0);
719 newsk->sk_wmem_queued = 0;
720 newsk->sk_forward_alloc = 0;
721
722 sock_reset_flag(newsk, SOCK_DONE);
723 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
724 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
725 newsk->sk_send_head = NULL;
726 rwlock_init(&newsk->sk_callback_lock);
727 skb_queue_head_init(&newsk->sk_error_queue);
728 newsk->sk_write_space = sk_stream_write_space;
729
730 if ((filter = newsk->sk_filter) != NULL)
731 sk_filter_charge(newsk, filter);
732
733 if (unlikely(xfrm_sk_clone_policy(newsk))) {
734 /* It is still raw copy of parent, so invalidate
735 * destructor and make plain sk_free() */
736 newsk->sk_destruct = NULL;
737 sk_free(newsk);
738 return NULL;
739 }
740 354
741 /* Now setup tcp_sock */ 355 /* Now setup tcp_sock */
742 newtp = tcp_sk(newsk); 356 newtp = tcp_sk(newsk);
743 newtp->pred_flags = 0; 357 newtp->pred_flags = 0;
744 newtp->rcv_nxt = treq->rcv_isn + 1; 358 newtp->rcv_nxt = treq->rcv_isn + 1;
745 newtp->snd_nxt = treq->snt_isn + 1; 359 newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
746 newtp->snd_una = treq->snt_isn + 1;
747 newtp->snd_sml = treq->snt_isn + 1;
748 360
749 tcp_prequeue_init(newtp); 361 tcp_prequeue_init(newtp);
750 362
751 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 363 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
752 364
753 newtp->retransmits = 0;
754 newtp->backoff = 0;
755 newtp->srtt = 0; 365 newtp->srtt = 0;
756 newtp->mdev = TCP_TIMEOUT_INIT; 366 newtp->mdev = TCP_TIMEOUT_INIT;
757 newtp->rto = TCP_TIMEOUT_INIT; 367 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
758 368
759 newtp->packets_out = 0; 369 newtp->packets_out = 0;
760 newtp->left_out = 0; 370 newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 384 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 385 newtp->frto_highmark = 0;
776 386
777 newtp->ca_ops = &tcp_reno; 387 newicsk->icsk_ca_ops = &tcp_reno;
778 388
779 tcp_set_ca_state(newtp, TCP_CA_Open); 389 tcp_set_ca_state(newsk, TCP_CA_Open);
780 tcp_init_xmit_timers(newsk); 390 tcp_init_xmit_timers(newsk);
781 skb_queue_head_init(&newtp->out_of_order_queue); 391 skb_queue_head_init(&newtp->out_of_order_queue);
782 newtp->rcv_wup = treq->rcv_isn + 1; 392 newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
789 newtp->rx_opt.dsack = 0; 399 newtp->rx_opt.dsack = 0;
790 newtp->rx_opt.eff_sacks = 0; 400 newtp->rx_opt.eff_sacks = 0;
791 401
792 newtp->probes_out = 0;
793 newtp->rx_opt.num_sacks = 0; 402 newtp->rx_opt.num_sacks = 0;
794 newtp->urg_data = 0; 403 newtp->urg_data = 0;
795 /* Deinitialize accept_queue to trap illegal accesses. */
796 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
797
798 /* Back to base struct sock members. */
799 newsk->sk_err = 0;
800 newsk->sk_priority = 0;
801 atomic_set(&newsk->sk_refcnt, 2);
802#ifdef INET_REFCNT_DEBUG
803 atomic_inc(&inet_sock_nr);
804#endif
805 atomic_inc(&tcp_sockets_allocated);
806 404
807 if (sock_flag(newsk, SOCK_KEEPOPEN)) 405 if (sock_flag(newsk, SOCK_KEEPOPEN))
808 tcp_reset_keepalive_timer(newsk, 406 inet_csk_reset_keepalive_timer(newsk,
809 keepalive_time_when(newtp)); 407 keepalive_time_when(newtp));
810 newsk->sk_socket = NULL;
811 newsk->sk_sleep = NULL;
812 408
813 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 409 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
814 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { 410 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
838 newtp->tcp_header_len = sizeof(struct tcphdr); 434 newtp->tcp_header_len = sizeof(struct tcphdr);
839 } 435 }
840 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 436 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
841 newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; 437 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
842 newtp->rx_opt.mss_clamp = req->mss; 438 newtp->rx_opt.mss_clamp = req->mss;
843 TCP_ECN_openreq_child(newtp, req); 439 TCP_ECN_openreq_child(newtp, req);
844 if (newtp->ecn_flags&TCP_ECN_OK) 440 if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
934 does sequence test, SYN is truncated, and thus we consider 530 does sequence test, SYN is truncated, and thus we consider
935 it a bare ACK. 531 it a bare ACK.
936 532
937 If tp->defer_accept, we silently drop this bare ACK. Otherwise, 533 If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
938 we create an established connection. Both ends (listening sockets) 534 bare ACK. Otherwise, we create an established connection. Both
939 accept the new incoming connection and try to talk to each other. 8-) 535 ends (listening sockets) accept the new incoming connection and try
536 to talk to each other. 8-)
940 537
941 Note: This case is both harmless, and rare. Possibility is about the 538 Note: This case is both harmless, and rare. Possibility is about the
942 same as us discovering intelligent life on another plant tomorrow. 539 same as us discovering intelligent life on another plant tomorrow.
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1003 return NULL; 600 return NULL;
1004 601
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 602 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 603 if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
604 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
1007 inet_rsk(req)->acked = 1; 605 inet_rsk(req)->acked = 1;
1008 return NULL; 606 return NULL;
1009 } 607 }
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1018 if (child == NULL) 616 if (child == NULL)
1019 goto listen_overflow; 617 goto listen_overflow;
1020 618
1021 tcp_synq_unlink(tp, req, prev); 619 inet_csk_reqsk_queue_unlink(sk, req, prev);
1022 tcp_synq_removed(sk, req); 620 inet_csk_reqsk_queue_removed(sk, req);
1023 621
1024 tcp_acceptq_queue(sk, req, child); 622 inet_csk_reqsk_queue_add(sk, req, child);
1025 return child; 623 return child;
1026 624
1027 listen_overflow: 625 listen_overflow:
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1035 if (!(flg & TCP_FLAG_RST)) 633 if (!(flg & TCP_FLAG_RST))
1036 req->rsk_ops->send_reset(skb); 634 req->rsk_ops->send_reset(skb);
1037 635
1038 tcp_synq_drop(sk, req, prev); 636 inet_csk_reqsk_queue_drop(sk, req, prev);
1039 return NULL; 637 return NULL;
1040} 638}
1041 639
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
1074EXPORT_SYMBOL(tcp_child_process); 672EXPORT_SYMBOL(tcp_child_process);
1075EXPORT_SYMBOL(tcp_create_openreq_child); 673EXPORT_SYMBOL(tcp_create_openreq_child);
1076EXPORT_SYMBOL(tcp_timewait_state_process); 674EXPORT_SYMBOL(tcp_timewait_state_process);
1077EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dd30dd137b74..75b68116682a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
105 105
106/* RFC2861. Reset CWND after idle period longer RTO to "restart window". 106/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
107 * This is the first part of cwnd validation mechanism. */ 107 * This is the first part of cwnd validation mechanism. */
108static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) 108static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
109{ 109{
110 struct tcp_sock *tp = tcp_sk(sk);
110 s32 delta = tcp_time_stamp - tp->lsndtime; 111 s32 delta = tcp_time_stamp - tp->lsndtime;
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 112 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 113 u32 cwnd = tp->snd_cwnd;
113 114
114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART); 115 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
115 116
116 tp->snd_ssthresh = tcp_current_ssthresh(tp); 117 tp->snd_ssthresh = tcp_current_ssthresh(sk);
117 restart_cwnd = min(restart_cwnd, cwnd); 118 restart_cwnd = min(restart_cwnd, cwnd);
118 119
119 while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) 120 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
120 cwnd >>= 1; 121 cwnd >>= 1;
121 tp->snd_cwnd = max(cwnd, restart_cwnd); 122 tp->snd_cwnd = max(cwnd, restart_cwnd);
122 tp->snd_cwnd_stamp = tcp_time_stamp; 123 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
126static inline void tcp_event_data_sent(struct tcp_sock *tp, 127static inline void tcp_event_data_sent(struct tcp_sock *tp,
127 struct sk_buff *skb, struct sock *sk) 128 struct sk_buff *skb, struct sock *sk)
128{ 129{
129 u32 now = tcp_time_stamp; 130 struct inet_connection_sock *icsk = inet_csk(sk);
131 const u32 now = tcp_time_stamp;
130 132
131 if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) 133 if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
132 tcp_cwnd_restart(tp, __sk_dst_get(sk)); 134 tcp_cwnd_restart(sk, __sk_dst_get(sk));
133 135
134 tp->lsndtime = now; 136 tp->lsndtime = now;
135 137
136 /* If it is a reply for ato after last received 138 /* If it is a reply for ato after last received
137 * packet, enter pingpong mode. 139 * packet, enter pingpong mode.
138 */ 140 */
139 if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) 141 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
140 tp->ack.pingpong = 1; 142 icsk->icsk_ack.pingpong = 1;
141} 143}
142 144
143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 145static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
144{ 146{
145 struct tcp_sock *tp = tcp_sk(sk); 147 tcp_dec_quickack_mode(sk, pkts);
146 148 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
147 tcp_dec_quickack_mode(tp, pkts);
148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
149} 149}
150 150
151/* Determine a window scaling and initial window to offer. 151/* Determine a window scaling and initial window to offer.
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
265static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) 265static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
266{ 266{
267 if (skb != NULL) { 267 if (skb != NULL) {
268 const struct inet_connection_sock *icsk = inet_csk(sk);
268 struct inet_sock *inet = inet_sk(sk); 269 struct inet_sock *inet = inet_sk(sk);
269 struct tcp_sock *tp = tcp_sk(sk); 270 struct tcp_sock *tp = tcp_sk(sk);
270 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 271 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_SACK 0x4 281#define SYSCTL_FLAG_SACK 0x4
281 282
282 /* If congestion control is doing timestamping */ 283 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample) 284 if (icsk->icsk_ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp); 285 __net_timestamp(skb);
285 286
286 sysctl_flags = 0; 287 sysctl_flags = 0;
287 if (tcb->flags & TCPCB_FLAG_SYN) { 288 if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
308 } 309 }
309 310
310 if (tcp_packets_in_flight(tp) == 0) 311 if (tcp_packets_in_flight(tp) == 0)
311 tcp_ca_event(tp, CA_EVENT_TX_START); 312 tcp_ca_event(sk, CA_EVENT_TX_START);
312 313
313 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 314 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
314 skb->h.th = th; 315 skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
366 if (err <= 0) 367 if (err <= 0)
367 return err; 368 return err;
368 369
369 tcp_enter_cwr(tp); 370 tcp_enter_cwr(sk);
370 371
371 /* NET_XMIT_CN is special. It does not guarantee, 372 /* NET_XMIT_CN is special. It does not guarantee,
372 * that this packet is lost. It tells that device 373 * that this packet is lost. It tells that device
@@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
482 * skbs, which it never sent before. --ANK 483 * skbs, which it never sent before. --ANK
483 */ 484 */
484 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 485 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
485 buff->stamp = skb->stamp; 486 buff->tstamp = skb->tstamp;
486 487
487 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 488 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
488 tp->lost_out -= tcp_skb_pcount(skb); 489 tp->lost_out -= tcp_skb_pcount(skb);
@@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
505 506
506 /* Link BUFF into the send queue. */ 507 /* Link BUFF into the send queue. */
507 skb_header_release(buff); 508 skb_header_release(buff);
508 __skb_append(skb, buff); 509 __skb_append(skb, buff, &sk->sk_write_queue);
509 510
510 return 0; 511 return 0;
511} 512}
@@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
696 if (tp->packets_out > tp->snd_cwnd_used) 697 if (tp->packets_out > tp->snd_cwnd_used)
697 tp->snd_cwnd_used = tp->packets_out; 698 tp->snd_cwnd_used = tp->packets_out;
698 699
699 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) 700 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
700 tcp_cwnd_application_limited(sk); 701 tcp_cwnd_application_limited(sk);
701 } 702 }
702} 703}
@@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
893 894
894 /* Link BUFF into the send queue. */ 895 /* Link BUFF into the send queue. */
895 skb_header_release(buff); 896 skb_header_release(buff);
896 __skb_append(skb, buff); 897 __skb_append(skb, buff, &sk->sk_write_queue);
897 898
898 return 0; 899 return 0;
899} 900}
@@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
905 */ 906 */
906static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) 907static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
907{ 908{
909 const struct inet_connection_sock *icsk = inet_csk(sk);
908 u32 send_win, cong_win, limit, in_flight; 910 u32 send_win, cong_win, limit, in_flight;
909 911
910 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) 912 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
911 return 0; 913 return 0;
912 914
913 if (tp->ca_state != TCP_CA_Open) 915 if (icsk->icsk_ca_state != TCP_CA_Open)
914 return 0; 916 return 0;
915 917
916 in_flight = tcp_packets_in_flight(tp); 918 in_flight = tcp_packets_in_flight(tp);
@@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
1147 */ 1149 */
1148u32 __tcp_select_window(struct sock *sk) 1150u32 __tcp_select_window(struct sock *sk)
1149{ 1151{
1152 struct inet_connection_sock *icsk = inet_csk(sk);
1150 struct tcp_sock *tp = tcp_sk(sk); 1153 struct tcp_sock *tp = tcp_sk(sk);
1151 /* MSS for the peer's data. Previous verions used mss_clamp 1154 /* MSS for the peer's data. Previous verions used mss_clamp
1152 * here. I don't know if the value based on our guesses 1155 * here. I don't know if the value based on our guesses
@@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk)
1154 * but may be worse for the performance because of rcv_mss 1157 * but may be worse for the performance because of rcv_mss
1155 * fluctuations. --SAW 1998/11/1 1158 * fluctuations. --SAW 1998/11/1
1156 */ 1159 */
1157 int mss = tp->ack.rcv_mss; 1160 int mss = icsk->icsk_ack.rcv_mss;
1158 int free_space = tcp_space(sk); 1161 int free_space = tcp_space(sk);
1159 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); 1162 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
1160 int window; 1163 int window;
@@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk)
1163 mss = full_space; 1166 mss = full_space;
1164 1167
1165 if (free_space < full_space/2) { 1168 if (free_space < full_space/2) {
1166 tp->ack.quick = 0; 1169 icsk->icsk_ack.quick = 0;
1167 1170
1168 if (tcp_memory_pressure) 1171 if (tcp_memory_pressure)
1169 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); 1172 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1238 tcp_skb_pcount(next_skb) != 1); 1241 tcp_skb_pcount(next_skb) != 1);
1239 1242
1240 /* Ok. We will be able to collapse the packet. */ 1243 /* Ok. We will be able to collapse the packet. */
1241 __skb_unlink(next_skb, next_skb->list); 1244 __skb_unlink(next_skb, &sk->sk_write_queue);
1242 1245
1243 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); 1246 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
1244 1247
@@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1286 */ 1289 */
1287void tcp_simple_retransmit(struct sock *sk) 1290void tcp_simple_retransmit(struct sock *sk)
1288{ 1291{
1292 const struct inet_connection_sock *icsk = inet_csk(sk);
1289 struct tcp_sock *tp = tcp_sk(sk); 1293 struct tcp_sock *tp = tcp_sk(sk);
1290 struct sk_buff *skb; 1294 struct sk_buff *skb;
1291 unsigned int mss = tcp_current_mss(sk, 0); 1295 unsigned int mss = tcp_current_mss(sk, 0);
@@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk)
1316 * in network, but units changed and effective 1320 * in network, but units changed and effective
1317 * cwnd/ssthresh really reduced now. 1321 * cwnd/ssthresh really reduced now.
1318 */ 1322 */
1319 if (tp->ca_state != TCP_CA_Loss) { 1323 if (icsk->icsk_ca_state != TCP_CA_Loss) {
1320 tp->high_seq = tp->snd_nxt; 1324 tp->high_seq = tp->snd_nxt;
1321 tp->snd_ssthresh = tcp_current_ssthresh(tp); 1325 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1322 tp->prior_ssthresh = 0; 1326 tp->prior_ssthresh = 0;
1323 tp->undo_marker = 0; 1327 tp->undo_marker = 0;
1324 tcp_set_ca_state(tp, TCP_CA_Loss); 1328 tcp_set_ca_state(sk, TCP_CA_Loss);
1325 } 1329 }
1326 tcp_xmit_retransmit_queue(sk); 1330 tcp_xmit_retransmit_queue(sk);
1327} 1331}
@@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1461 */ 1465 */
1462void tcp_xmit_retransmit_queue(struct sock *sk) 1466void tcp_xmit_retransmit_queue(struct sock *sk)
1463{ 1467{
1468 const struct inet_connection_sock *icsk = inet_csk(sk);
1464 struct tcp_sock *tp = tcp_sk(sk); 1469 struct tcp_sock *tp = tcp_sk(sk);
1465 struct sk_buff *skb; 1470 struct sk_buff *skb;
1466 int packet_cnt = tp->lost_out; 1471 int packet_cnt = tp->lost_out;
@@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1484 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { 1489 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1485 if (tcp_retransmit_skb(sk, skb)) 1490 if (tcp_retransmit_skb(sk, skb))
1486 return; 1491 return;
1487 if (tp->ca_state != TCP_CA_Loss) 1492 if (icsk->icsk_ca_state != TCP_CA_Loss)
1488 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); 1493 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1489 else 1494 else
1490 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); 1495 NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
1491 1496
1492 if (skb == 1497 if (skb ==
1493 skb_peek(&sk->sk_write_queue)) 1498 skb_peek(&sk->sk_write_queue))
1494 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1499 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1500 inet_csk(sk)->icsk_rto,
1501 TCP_RTO_MAX);
1495 } 1502 }
1496 1503
1497 packet_cnt -= tcp_skb_pcount(skb); 1504 packet_cnt -= tcp_skb_pcount(skb);
@@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1504 /* OK, demanded retransmission is finished. */ 1511 /* OK, demanded retransmission is finished. */
1505 1512
1506 /* Forward retransmissions are possible only during Recovery. */ 1513 /* Forward retransmissions are possible only during Recovery. */
1507 if (tp->ca_state != TCP_CA_Recovery) 1514 if (icsk->icsk_ca_state != TCP_CA_Recovery)
1508 return; 1515 return;
1509 1516
1510 /* No forward retransmissions in Reno are possible. */ 1517 /* No forward retransmissions in Reno are possible. */
@@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1544 break; 1551 break;
1545 1552
1546 if (skb == skb_peek(&sk->sk_write_queue)) 1553 if (skb == skb_peek(&sk->sk_write_queue))
1547 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1554 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1555 inet_csk(sk)->icsk_rto,
1556 TCP_RTO_MAX);
1548 1557
1549 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); 1558 NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
1550 } 1559 }
@@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk)
1573 } else { 1582 } else {
1574 /* Socket is locked, keep trying until memory is available. */ 1583 /* Socket is locked, keep trying until memory is available. */
1575 for (;;) { 1584 for (;;) {
1576 skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); 1585 skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
1577 if (skb) 1586 if (skb)
1578 break; 1587 break;
1579 yield(); 1588 yield();
@@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk)
1780 tp->rcv_wup = 0; 1789 tp->rcv_wup = 0;
1781 tp->copied_seq = 0; 1790 tp->copied_seq = 0;
1782 1791
1783 tp->rto = TCP_TIMEOUT_INIT; 1792 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
1784 tp->retransmits = 0; 1793 inet_csk(sk)->icsk_retransmits = 0;
1785 tcp_clear_retrans(tp); 1794 tcp_clear_retrans(tp);
1786} 1795}
1787 1796
@@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk)
1795 1804
1796 tcp_connect_init(sk); 1805 tcp_connect_init(sk);
1797 1806
1798 buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); 1807 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
1799 if (unlikely(buff == NULL)) 1808 if (unlikely(buff == NULL))
1800 return -ENOBUFS; 1809 return -ENOBUFS;
1801 1810
@@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk)
1824 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); 1833 TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
1825 1834
1826 /* Timer for repeating the SYN until an answer. */ 1835 /* Timer for repeating the SYN until an answer. */
1827 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 1836 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1837 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
1828 return 0; 1838 return 0;
1829} 1839}
1830 1840
@@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk)
1834 */ 1844 */
1835void tcp_send_delayed_ack(struct sock *sk) 1845void tcp_send_delayed_ack(struct sock *sk)
1836{ 1846{
1837 struct tcp_sock *tp = tcp_sk(sk); 1847 struct inet_connection_sock *icsk = inet_csk(sk);
1838 int ato = tp->ack.ato; 1848 int ato = icsk->icsk_ack.ato;
1839 unsigned long timeout; 1849 unsigned long timeout;
1840 1850
1841 if (ato > TCP_DELACK_MIN) { 1851 if (ato > TCP_DELACK_MIN) {
1852 const struct tcp_sock *tp = tcp_sk(sk);
1842 int max_ato = HZ/2; 1853 int max_ato = HZ/2;
1843 1854
1844 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) 1855 if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
1845 max_ato = TCP_DELACK_MAX; 1856 max_ato = TCP_DELACK_MAX;
1846 1857
1847 /* Slow path, intersegment interval is "high". */ 1858 /* Slow path, intersegment interval is "high". */
1848 1859
1849 /* If some rtt estimate is known, use it to bound delayed ack. 1860 /* If some rtt estimate is known, use it to bound delayed ack.
1850 * Do not use tp->rto here, use results of rtt measurements 1861 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
1851 * directly. 1862 * directly.
1852 */ 1863 */
1853 if (tp->srtt) { 1864 if (tp->srtt) {
@@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk)
1864 timeout = jiffies + ato; 1875 timeout = jiffies + ato;
1865 1876
1866 /* Use new timeout only if there wasn't a older one earlier. */ 1877 /* Use new timeout only if there wasn't a older one earlier. */
1867 if (tp->ack.pending&TCP_ACK_TIMER) { 1878 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
1868 /* If delack timer was blocked or is about to expire, 1879 /* If delack timer was blocked or is about to expire,
1869 * send ACK now. 1880 * send ACK now.
1870 */ 1881 */
1871 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { 1882 if (icsk->icsk_ack.blocked ||
1883 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
1872 tcp_send_ack(sk); 1884 tcp_send_ack(sk);
1873 return; 1885 return;
1874 } 1886 }
1875 1887
1876 if (!time_before(timeout, tp->ack.timeout)) 1888 if (!time_before(timeout, icsk->icsk_ack.timeout))
1877 timeout = tp->ack.timeout; 1889 timeout = icsk->icsk_ack.timeout;
1878 } 1890 }
1879 tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; 1891 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
1880 tp->ack.timeout = timeout; 1892 icsk->icsk_ack.timeout = timeout;
1881 sk_reset_timer(sk, &tp->delack_timer, timeout); 1893 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
1882} 1894}
1883 1895
1884/* This routine sends an ack and also updates the window. */ 1896/* This routine sends an ack and also updates the window. */
@@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk)
1895 */ 1907 */
1896 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 1908 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1897 if (buff == NULL) { 1909 if (buff == NULL) {
1898 tcp_schedule_ack(tp); 1910 inet_csk_schedule_ack(sk);
1899 tp->ack.ato = TCP_ATO_MIN; 1911 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
1900 tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); 1912 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1913 TCP_DELACK_MAX, TCP_RTO_MAX);
1901 return; 1914 return;
1902 } 1915 }
1903 1916
@@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk)
2011 */ 2024 */
2012void tcp_send_probe0(struct sock *sk) 2025void tcp_send_probe0(struct sock *sk)
2013{ 2026{
2027 struct inet_connection_sock *icsk = inet_csk(sk);
2014 struct tcp_sock *tp = tcp_sk(sk); 2028 struct tcp_sock *tp = tcp_sk(sk);
2015 int err; 2029 int err;
2016 2030
@@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk)
2018 2032
2019 if (tp->packets_out || !sk->sk_send_head) { 2033 if (tp->packets_out || !sk->sk_send_head) {
2020 /* Cancel probe timer, if it is not required. */ 2034 /* Cancel probe timer, if it is not required. */
2021 tp->probes_out = 0; 2035 icsk->icsk_probes_out = 0;
2022 tp->backoff = 0; 2036 icsk->icsk_backoff = 0;
2023 return; 2037 return;
2024 } 2038 }
2025 2039
2026 if (err <= 0) { 2040 if (err <= 0) {
2027 if (tp->backoff < sysctl_tcp_retries2) 2041 if (icsk->icsk_backoff < sysctl_tcp_retries2)
2028 tp->backoff++; 2042 icsk->icsk_backoff++;
2029 tp->probes_out++; 2043 icsk->icsk_probes_out++;
2030 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 2044 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2031 min(tp->rto << tp->backoff, TCP_RTO_MAX)); 2045 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2046 TCP_RTO_MAX);
2032 } else { 2047 } else {
2033 /* If packet was not sent due to local congestion, 2048 /* If packet was not sent due to local congestion,
2034 * do not backoff and do not remember probes_out. 2049 * do not backoff and do not remember icsk_probes_out.
2035 * Let local senders to fight for local resources. 2050 * Let local senders to fight for local resources.
2036 * 2051 *
2037 * Use accumulated backoff yet. 2052 * Use accumulated backoff yet.
2038 */ 2053 */
2039 if (!tp->probes_out) 2054 if (!icsk->icsk_probes_out)
2040 tp->probes_out=1; 2055 icsk->icsk_probes_out = 1;
2041 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 2056 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2042 min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); 2057 min(icsk->icsk_rto << icsk->icsk_backoff,
2058 TCP_RESOURCE_PROBE_INTERVAL),
2059 TCP_RTO_MAX);
2043 } 2060 }
2044} 2061}
2045 2062
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c71..327770bf5522 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
16#define TCP_SCALABLE_AI_CNT 50U 16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3 17#define TCP_SCALABLE_MD_SCALE 3
18 18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, 19static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
20 u32 in_flight, int flag) 20 u32 in_flight, int flag)
21{ 21{
22 struct tcp_sock *tp = tcp_sk(sk);
22 if (in_flight < tp->snd_cwnd) 23 if (in_flight < tp->snd_cwnd)
23 return; 24 return;
24 25
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
35 tp->snd_cwnd_stamp = tcp_time_stamp; 36 tp->snd_cwnd_stamp = tcp_time_stamp;
36} 37}
37 38
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) 39static u32 tcp_scalable_ssthresh(struct sock *sk)
39{ 40{
41 const struct tcp_sock *tp = tcp_sk(sk);
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); 42 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41} 43}
42 44
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c2..415ee47ac1c5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long); 36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data); 37static void tcp_keepalive_timer (unsigned long data);
38 38
39#ifdef TCP_DEBUG
40const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
41EXPORT_SYMBOL(tcp_timer_bug_msg);
42#endif
43
44/*
45 * Using different timers for retransmit, delayed acks and probes
46 * We may wish use just one timer maintaining a list of expire jiffies
47 * to optimize.
48 */
49
50void tcp_init_xmit_timers(struct sock *sk) 39void tcp_init_xmit_timers(struct sock *sk)
51{ 40{
52 struct tcp_sock *tp = tcp_sk(sk); 41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
53 42 &tcp_keepalive_timer);
54 init_timer(&tp->retransmit_timer);
55 tp->retransmit_timer.function=&tcp_write_timer;
56 tp->retransmit_timer.data = (unsigned long) sk;
57 tp->pending = 0;
58
59 init_timer(&tp->delack_timer);
60 tp->delack_timer.function=&tcp_delack_timer;
61 tp->delack_timer.data = (unsigned long) sk;
62 tp->ack.pending = 0;
63
64 init_timer(&sk->sk_timer);
65 sk->sk_timer.function = &tcp_keepalive_timer;
66 sk->sk_timer.data = (unsigned long)sk;
67} 43}
68 44
69void tcp_clear_xmit_timers(struct sock *sk) 45EXPORT_SYMBOL(tcp_init_xmit_timers);
70{
71 struct tcp_sock *tp = tcp_sk(sk);
72
73 tp->pending = 0;
74 sk_stop_timer(sk, &tp->retransmit_timer);
75
76 tp->ack.pending = 0;
77 tp->ack.blocked = 0;
78 sk_stop_timer(sk, &tp->delack_timer);
79
80 sk_stop_timer(sk, &sk->sk_timer);
81}
82 46
83static void tcp_write_err(struct sock *sk) 47static void tcp_write_err(struct sock *sk)
84{ 48{
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
155/* A write timeout has occurred. Process the after effects. */ 119/* A write timeout has occurred. Process the after effects. */
156static int tcp_write_timeout(struct sock *sk) 120static int tcp_write_timeout(struct sock *sk)
157{ 121{
158 struct tcp_sock *tp = tcp_sk(sk); 122 const struct inet_connection_sock *icsk = inet_csk(sk);
159 int retry_until; 123 int retry_until;
160 124
161 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 125 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
162 if (tp->retransmits) 126 if (icsk->icsk_retransmits)
163 dst_negative_advice(&sk->sk_dst_cache); 127 dst_negative_advice(&sk->sk_dst_cache);
164 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; 128 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
165 } else { 129 } else {
166 if (tp->retransmits >= sysctl_tcp_retries1) { 130 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
167 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black 131 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
168 hole detection. :-( 132 hole detection. :-(
169 133
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
189 153
190 retry_until = sysctl_tcp_retries2; 154 retry_until = sysctl_tcp_retries2;
191 if (sock_flag(sk, SOCK_DEAD)) { 155 if (sock_flag(sk, SOCK_DEAD)) {
192 int alive = (tp->rto < TCP_RTO_MAX); 156 const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
193 157
194 retry_until = tcp_orphan_retries(sk, alive); 158 retry_until = tcp_orphan_retries(sk, alive);
195 159
196 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) 160 if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
197 return 1; 161 return 1;
198 } 162 }
199 } 163 }
200 164
201 if (tp->retransmits >= retry_until) { 165 if (icsk->icsk_retransmits >= retry_until) {
202 /* Has it gone just too far? */ 166 /* Has it gone just too far? */
203 tcp_write_err(sk); 167 tcp_write_err(sk);
204 return 1; 168 return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
210{ 174{
211 struct sock *sk = (struct sock*)data; 175 struct sock *sk = (struct sock*)data;
212 struct tcp_sock *tp = tcp_sk(sk); 176 struct tcp_sock *tp = tcp_sk(sk);
177 struct inet_connection_sock *icsk = inet_csk(sk);
213 178
214 bh_lock_sock(sk); 179 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) { 180 if (sock_owned_by_user(sk)) {
216 /* Try again later. */ 181 /* Try again later. */
217 tp->ack.blocked = 1; 182 icsk->icsk_ack.blocked = 1;
218 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); 183 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); 184 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock; 185 goto out_unlock;
221 } 186 }
222 187
223 sk_stream_mem_reclaim(sk); 188 sk_stream_mem_reclaim(sk);
224 189
225 if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) 190 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
226 goto out; 191 goto out;
227 192
228 if (time_after(tp->ack.timeout, jiffies)) { 193 if (time_after(icsk->icsk_ack.timeout, jiffies)) {
229 sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); 194 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
230 goto out; 195 goto out;
231 } 196 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 197 icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
233 198
234 if (!skb_queue_empty(&tp->ucopy.prequeue)) { 199 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 200 struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
242 tp->ucopy.memory = 0; 207 tp->ucopy.memory = 0;
243 } 208 }
244 209
245 if (tcp_ack_scheduled(tp)) { 210 if (inet_csk_ack_scheduled(sk)) {
246 if (!tp->ack.pingpong) { 211 if (!icsk->icsk_ack.pingpong) {
247 /* Delayed ACK missed: inflate ATO. */ 212 /* Delayed ACK missed: inflate ATO. */
248 tp->ack.ato = min(tp->ack.ato << 1, tp->rto); 213 icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
249 } else { 214 } else {
250 /* Delayed ACK missed: leave pingpong mode and 215 /* Delayed ACK missed: leave pingpong mode and
251 * deflate ATO. 216 * deflate ATO.
252 */ 217 */
253 tp->ack.pingpong = 0; 218 icsk->icsk_ack.pingpong = 0;
254 tp->ack.ato = TCP_ATO_MIN; 219 icsk->icsk_ack.ato = TCP_ATO_MIN;
255 } 220 }
256 tcp_send_ack(sk); 221 tcp_send_ack(sk);
257 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); 222 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
268 233
269static void tcp_probe_timer(struct sock *sk) 234static void tcp_probe_timer(struct sock *sk)
270{ 235{
236 struct inet_connection_sock *icsk = inet_csk(sk);
271 struct tcp_sock *tp = tcp_sk(sk); 237 struct tcp_sock *tp = tcp_sk(sk);
272 int max_probes; 238 int max_probes;
273 239
274 if (tp->packets_out || !sk->sk_send_head) { 240 if (tp->packets_out || !sk->sk_send_head) {
275 tp->probes_out = 0; 241 icsk->icsk_probes_out = 0;
276 return; 242 return;
277 } 243 }
278 244
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
283 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing 249 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
284 * this behaviour in Solaris down as a bug fix. [AC] 250 * this behaviour in Solaris down as a bug fix. [AC]
285 * 251 *
286 * Let me to explain. probes_out is zeroed by incoming ACKs 252 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
287 * even if they advertise zero window. Hence, connection is killed only 253 * even if they advertise zero window. Hence, connection is killed only
288 * if we received no ACKs for normal connection timeout. It is not killed 254 * if we received no ACKs for normal connection timeout. It is not killed
289 * only because window stays zero for some time, window may be zero 255 * only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
294 max_probes = sysctl_tcp_retries2; 260 max_probes = sysctl_tcp_retries2;
295 261
296 if (sock_flag(sk, SOCK_DEAD)) { 262 if (sock_flag(sk, SOCK_DEAD)) {
297 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); 263 const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
298 264
299 max_probes = tcp_orphan_retries(sk, alive); 265 max_probes = tcp_orphan_retries(sk, alive);
300 266
301 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) 267 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
302 return; 268 return;
303 } 269 }
304 270
305 if (tp->probes_out > max_probes) { 271 if (icsk->icsk_probes_out > max_probes) {
306 tcp_write_err(sk); 272 tcp_write_err(sk);
307 } else { 273 } else {
308 /* Only send another probe if we didn't close things up. */ 274 /* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
317static void tcp_retransmit_timer(struct sock *sk) 283static void tcp_retransmit_timer(struct sock *sk)
318{ 284{
319 struct tcp_sock *tp = tcp_sk(sk); 285 struct tcp_sock *tp = tcp_sk(sk);
286 struct inet_connection_sock *icsk = inet_csk(sk);
320 287
321 if (!tp->packets_out) 288 if (!tp->packets_out)
322 goto out; 289 goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
351 if (tcp_write_timeout(sk)) 318 if (tcp_write_timeout(sk))
352 goto out; 319 goto out;
353 320
354 if (tp->retransmits == 0) { 321 if (icsk->icsk_retransmits == 0) {
355 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { 322 if (icsk->icsk_ca_state == TCP_CA_Disorder ||
323 icsk->icsk_ca_state == TCP_CA_Recovery) {
356 if (tp->rx_opt.sack_ok) { 324 if (tp->rx_opt.sack_ok) {
357 if (tp->ca_state == TCP_CA_Recovery) 325 if (icsk->icsk_ca_state == TCP_CA_Recovery)
358 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); 326 NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
359 else 327 else
360 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); 328 NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
361 } else { 329 } else {
362 if (tp->ca_state == TCP_CA_Recovery) 330 if (icsk->icsk_ca_state == TCP_CA_Recovery)
363 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); 331 NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
364 else 332 else
365 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); 333 NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
366 } 334 }
367 } else if (tp->ca_state == TCP_CA_Loss) { 335 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
368 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); 336 NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
369 } else { 337 } else {
370 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); 338 NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
381 /* Retransmission failed because of local congestion, 349 /* Retransmission failed because of local congestion,
382 * do not backoff. 350 * do not backoff.
383 */ 351 */
384 if (!tp->retransmits) 352 if (!icsk->icsk_retransmits)
385 tp->retransmits=1; 353 icsk->icsk_retransmits = 1;
386 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, 354 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
387 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); 355 min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
356 TCP_RTO_MAX);
388 goto out; 357 goto out;
389 } 358 }
390 359
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
403 * implemented ftp to mars will work nicely. We will have to fix 372 * implemented ftp to mars will work nicely. We will have to fix
404 * the 120 second clamps though! 373 * the 120 second clamps though!
405 */ 374 */
406 tp->backoff++; 375 icsk->icsk_backoff++;
407 tp->retransmits++; 376 icsk->icsk_retransmits++;
408 377
409out_reset_timer: 378out_reset_timer:
410 tp->rto = min(tp->rto << 1, TCP_RTO_MAX); 379 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
411 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 380 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
412 if (tp->retransmits > sysctl_tcp_retries1) 381 if (icsk->icsk_retransmits > sysctl_tcp_retries1)
413 __sk_dst_reset(sk); 382 __sk_dst_reset(sk);
414 383
415out:; 384out:;
@@ -418,32 +387,32 @@ out:;
418static void tcp_write_timer(unsigned long data) 387static void tcp_write_timer(unsigned long data)
419{ 388{
420 struct sock *sk = (struct sock*)data; 389 struct sock *sk = (struct sock*)data;
421 struct tcp_sock *tp = tcp_sk(sk); 390 struct inet_connection_sock *icsk = inet_csk(sk);
422 int event; 391 int event;
423 392
424 bh_lock_sock(sk); 393 bh_lock_sock(sk);
425 if (sock_owned_by_user(sk)) { 394 if (sock_owned_by_user(sk)) {
426 /* Try again later */ 395 /* Try again later */
427 sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); 396 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
428 goto out_unlock; 397 goto out_unlock;
429 } 398 }
430 399
431 if (sk->sk_state == TCP_CLOSE || !tp->pending) 400 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
432 goto out; 401 goto out;
433 402
434 if (time_after(tp->timeout, jiffies)) { 403 if (time_after(icsk->icsk_timeout, jiffies)) {
435 sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); 404 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
436 goto out; 405 goto out;
437 } 406 }
438 407
439 event = tp->pending; 408 event = icsk->icsk_pending;
440 tp->pending = 0; 409 icsk->icsk_pending = 0;
441 410
442 switch (event) { 411 switch (event) {
443 case TCP_TIME_RETRANS: 412 case ICSK_TIME_RETRANS:
444 tcp_retransmit_timer(sk); 413 tcp_retransmit_timer(sk);
445 break; 414 break;
446 case TCP_TIME_PROBE0: 415 case ICSK_TIME_PROBE0:
447 tcp_probe_timer(sk); 416 tcp_probe_timer(sk);
448 break; 417 break;
449 } 418 }
@@ -462,96 +431,8 @@ out_unlock:
462 431
463static void tcp_synack_timer(struct sock *sk) 432static void tcp_synack_timer(struct sock *sk)
464{ 433{
465 struct tcp_sock *tp = tcp_sk(sk); 434 inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
466 struct listen_sock *lopt = tp->accept_queue.listen_opt; 435 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
467 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
468 int thresh = max_retries;
469 unsigned long now = jiffies;
470 struct request_sock **reqp, *req;
471 int i, budget;
472
473 if (lopt == NULL || lopt->qlen == 0)
474 return;
475
476 /* Normally all the openreqs are young and become mature
477 * (i.e. converted to established socket) for first timeout.
478 * If synack was not acknowledged for 3 seconds, it means
479 * one of the following things: synack was lost, ack was lost,
480 * rtt is high or nobody planned to ack (i.e. synflood).
481 * When server is a bit loaded, queue is populated with old
482 * open requests, reducing effective size of queue.
483 * When server is well loaded, queue size reduces to zero
484 * after several minutes of work. It is not synflood,
485 * it is normal operation. The solution is pruning
486 * too old entries overriding normal timeout, when
487 * situation becomes dangerous.
488 *
489 * Essentially, we reserve half of room for young
490 * embrions; and abort old ones without pity, if old
491 * ones are about to clog our table.
492 */
493 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
494 int young = (lopt->qlen_young<<1);
495
496 while (thresh > 2) {
497 if (lopt->qlen < young)
498 break;
499 thresh--;
500 young <<= 1;
501 }
502 }
503
504 if (tp->defer_accept)
505 max_retries = tp->defer_accept;
506
507 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
508 i = lopt->clock_hand;
509
510 do {
511 reqp=&lopt->syn_table[i];
512 while ((req = *reqp) != NULL) {
513 if (time_after_eq(now, req->expires)) {
514 if ((req->retrans < thresh ||
515 (inet_rsk(req)->acked && req->retrans < max_retries))
516 && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
517 unsigned long timeo;
518
519 if (req->retrans++ == 0)
520 lopt->qlen_young--;
521 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
522 TCP_RTO_MAX);
523 req->expires = now + timeo;
524 reqp = &req->dl_next;
525 continue;
526 }
527
528 /* Drop this request */
529 tcp_synq_unlink(tp, req, reqp);
530 reqsk_queue_removed(&tp->accept_queue, req);
531 reqsk_free(req);
532 continue;
533 }
534 reqp = &req->dl_next;
535 }
536
537 i = (i+1)&(TCP_SYNQ_HSIZE-1);
538
539 } while (--budget > 0);
540
541 lopt->clock_hand = i;
542
543 if (lopt->qlen)
544 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
545}
546
547void tcp_delete_keepalive_timer (struct sock *sk)
548{
549 sk_stop_timer(sk, &sk->sk_timer);
550}
551
552void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
553{
554 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
555} 436}
556 437
557void tcp_set_keepalive(struct sock *sk, int val) 438void tcp_set_keepalive(struct sock *sk, int val)
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
560 return; 441 return;
561 442
562 if (val && !sock_flag(sk, SOCK_KEEPOPEN)) 443 if (val && !sock_flag(sk, SOCK_KEEPOPEN))
563 tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); 444 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
564 else if (!val) 445 else if (!val)
565 tcp_delete_keepalive_timer(sk); 446 inet_csk_delete_keepalive_timer(sk);
566} 447}
567 448
568 449
569static void tcp_keepalive_timer (unsigned long data) 450static void tcp_keepalive_timer (unsigned long data)
570{ 451{
571 struct sock *sk = (struct sock *) data; 452 struct sock *sk = (struct sock *) data;
453 struct inet_connection_sock *icsk = inet_csk(sk);
572 struct tcp_sock *tp = tcp_sk(sk); 454 struct tcp_sock *tp = tcp_sk(sk);
573 __u32 elapsed; 455 __u32 elapsed;
574 456
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
576 bh_lock_sock(sk); 458 bh_lock_sock(sk);
577 if (sock_owned_by_user(sk)) { 459 if (sock_owned_by_user(sk)) {
578 /* Try again later. */ 460 /* Try again later. */
579 tcp_reset_keepalive_timer (sk, HZ/20); 461 inet_csk_reset_keepalive_timer (sk, HZ/20);
580 goto out; 462 goto out;
581 } 463 }
582 464
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
587 469
588 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { 470 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
589 if (tp->linger2 >= 0) { 471 if (tp->linger2 >= 0) {
590 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; 472 const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
591 473
592 if (tmo > 0) { 474 if (tmo > 0) {
593 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 475 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
610 elapsed = tcp_time_stamp - tp->rcv_tstamp; 492 elapsed = tcp_time_stamp - tp->rcv_tstamp;
611 493
612 if (elapsed >= keepalive_time_when(tp)) { 494 if (elapsed >= keepalive_time_when(tp)) {
613 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || 495 if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
614 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { 496 (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
615 tcp_send_active_reset(sk, GFP_ATOMIC); 497 tcp_send_active_reset(sk, GFP_ATOMIC);
616 tcp_write_err(sk); 498 tcp_write_err(sk);
617 goto out; 499 goto out;
618 } 500 }
619 if (tcp_write_wakeup(sk) <= 0) { 501 if (tcp_write_wakeup(sk) <= 0) {
620 tp->probes_out++; 502 icsk->icsk_probes_out++;
621 elapsed = keepalive_intvl_when(tp); 503 elapsed = keepalive_intvl_when(tp);
622 } else { 504 } else {
623 /* If keepalive was lost due to local congestion, 505 /* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
634 sk_stream_mem_reclaim(sk); 516 sk_stream_mem_reclaim(sk);
635 517
636resched: 518resched:
637 tcp_reset_keepalive_timer (sk, elapsed); 519 inet_csk_reset_keepalive_timer (sk, elapsed);
638 goto out; 520 goto out;
639 521
640death: 522death:
@@ -644,8 +526,3 @@ out:
644 bh_unlock_sock(sk); 526 bh_unlock_sock(sk);
645 sock_put(sk); 527 sock_put(sk);
646} 528}
647
648EXPORT_SYMBOL(tcp_clear_xmit_timers);
649EXPORT_SYMBOL(tcp_delete_keepalive_timer);
650EXPORT_SYMBOL(tcp_init_xmit_timers);
651EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db5193..93c5f92070f9 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/skbuff.h> 37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h> 38#include <linux/inet_diag.h>
39 39
40#include <net/tcp.h> 40#include <net/tcp.h>
41 41
@@ -82,9 +82,10 @@ struct vegas {
82 * Instead we must wait until the completion of an RTT during 82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs. 83 * which we actually receive ACKs.
84 */ 84 */
85static inline void vegas_enable(struct tcp_sock *tp) 85static inline void vegas_enable(struct sock *sk)
86{ 86{
87 struct vegas *vegas = tcp_ca(tp); 87 const struct tcp_sock *tp = tcp_sk(sk);
88 struct vegas *vegas = inet_csk_ca(sk);
88 89
89 /* Begin taking Vegas samples next time we send something. */ 90 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1; 91 vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
97} 98}
98 99
99/* Stop taking Vegas samples for now. */ 100/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp) 101static inline void vegas_disable(struct sock *sk)
101{ 102{
102 struct vegas *vegas = tcp_ca(tp); 103 struct vegas *vegas = inet_csk_ca(sk);
103 104
104 vegas->doing_vegas_now = 0; 105 vegas->doing_vegas_now = 0;
105} 106}
106 107
107static void tcp_vegas_init(struct tcp_sock *tp) 108static void tcp_vegas_init(struct sock *sk)
108{ 109{
109 struct vegas *vegas = tcp_ca(tp); 110 struct vegas *vegas = inet_csk_ca(sk);
110 111
111 vegas->baseRTT = 0x7fffffff; 112 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp); 113 vegas_enable(sk);
113} 114}
114 115
115/* Do RTT sampling needed for Vegas. 116/* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
120 * o min-filter RTT samples from a much longer window (forever for now) 121 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT) 122 * to find the propagation delay (baseRTT)
122 */ 123 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) 124static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
124{ 125{
125 struct vegas *vegas = tcp_ca(tp); 126 struct vegas *vegas = inet_csk_ca(sk);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ 127 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127 128
128 /* Filter to find propagation delay: */ 129 /* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
136 vegas->cntRTT++; 137 vegas->cntRTT++;
137} 138}
138 139
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) 140static void tcp_vegas_state(struct sock *sk, u8 ca_state)
140{ 141{
141 142
142 if (ca_state == TCP_CA_Open) 143 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp); 144 vegas_enable(sk);
144 else 145 else
145 vegas_disable(tp); 146 vegas_disable(sk);
146} 147}
147 148
148/* 149/*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
154 * packets, _then_ we can make Vegas calculations 155 * packets, _then_ we can make Vegas calculations
155 * again. 156 * again.
156 */ 157 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) 158static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
158{ 159{
159 if (event == CA_EVENT_CWND_RESTART || 160 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START) 161 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp); 162 tcp_vegas_init(sk);
162} 163}
163 164
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, 165static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag) 166 u32 seq_rtt, u32 in_flight, int flag)
166{ 167{
167 struct vegas *vegas = tcp_ca(tp); 168 struct tcp_sock *tp = tcp_sk(sk);
169 struct vegas *vegas = inet_csk_ca(sk);
168 170
169 if (!vegas->doing_vegas_now) 171 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); 172 return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
171 173
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt. 174 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 * 175 *
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
219 * but that's not too awful, since we're taking the min, 221 * but that's not too awful, since we're taking the min,
220 * rather than averaging. 222 * rather than averaging.
221 */ 223 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000); 224 tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
223 225
224 /* We do the Vegas calculations only if we got enough RTT 226 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got 227 * samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
359} 361}
360 362
361/* Extract info for Tcp socket info provided via netlink. */ 363/* Extract info for Tcp socket info provided via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, 364static void tcp_vegas_get_info(struct sock *sk, u32 ext,
363 struct sk_buff *skb) 365 struct sk_buff *skb)
364{ 366{
365 const struct vegas *ca = tcp_ca(tp); 367 const struct vegas *ca = inet_csk_ca(sk);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { 368 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
367 struct tcpvegas_info *info; 369 struct tcpvegas_info *info;
368 370
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, 371 info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
370 sizeof(*info))); 372 sizeof(*info)));
371 373
372 info->tcpv_enabled = ca->doing_vegas_now; 374 info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
393 395
394static int __init tcp_vegas_register(void) 396static int __init tcp_vegas_register(void)
395{ 397{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); 398 BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas); 399 tcp_register_congestion_control(&tcp_vegas);
398 return 0; 400 return 0;
399} 401}
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c940..0c340c3756c2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/skbuff.h> 10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h> 11#include <linux/inet_diag.h>
12#include <net/tcp.h> 12#include <net/tcp.h>
13 13
14/* TCP Westwood structure */ 14/* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
40 * way as soon as possible. It will reasonably happen within the first 40 * way as soon as possible. It will reasonably happen within the first
41 * RTT period of the connection lifetime. 41 * RTT period of the connection lifetime.
42 */ 42 */
43static void tcp_westwood_init(struct tcp_sock *tp) 43static void tcp_westwood_init(struct sock *sk)
44{ 44{
45 struct westwood *w = tcp_ca(tp); 45 struct westwood *w = inet_csk_ca(sk);
46 46
47 w->bk = 0; 47 w->bk = 0;
48 w->bw_ns_est = 0; 48 w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
51 w->cumul_ack = 0; 51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; 52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp; 53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una; 54 w->snd_una = tcp_sk(sk)->snd_una;
55} 55}
56 56
57/* 57/*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
74 * Called after processing group of packets. 74 * Called after processing group of packets.
75 * but all westwood needs is the last sample of srtt. 75 * but all westwood needs is the last sample of srtt.
76 */ 76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) 77static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
78{ 78{
79 struct westwood *w = tcp_ca(tp); 79 struct westwood *w = inet_csk_ca(sk);
80 if (cnt > 0) 80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3; 81 w->rtt = tcp_sk(sk)->srtt >> 3;
82} 82}
83 83
84/* 84/*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
86 * It updates RTT evaluation window if it is the right moment to do 86 * It updates RTT evaluation window if it is the right moment to do
87 * it. If so it calls filter for evaluating bandwidth. 87 * it. If so it calls filter for evaluating bandwidth.
88 */ 88 */
89static void westwood_update_window(struct tcp_sock *tp) 89static void westwood_update_window(struct sock *sk)
90{ 90{
91 struct westwood *w = tcp_ca(tp); 91 struct westwood *w = inet_csk_ca(sk);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx; 92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93 93
94 /* 94 /*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
114 * header prediction is successful. In such case in fact update is 114 * header prediction is successful. In such case in fact update is
115 * straight forward and doesn't need any particular care. 115 * straight forward and doesn't need any particular care.
116 */ 116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp) 117static inline void westwood_fast_bw(struct sock *sk)
118{ 118{
119 struct westwood *w = tcp_ca(tp); 119 const struct tcp_sock *tp = tcp_sk(sk);
120 struct westwood *w = inet_csk_ca(sk);
120 121
121 westwood_update_window(tp); 122 westwood_update_window(sk);
122 123
123 w->bk += tp->snd_una - w->snd_una; 124 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una; 125 w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
130 * This function evaluates cumul_ack for evaluating bk in case of 131 * This function evaluates cumul_ack for evaluating bk in case of
131 * delayed or partial acks. 132 * delayed or partial acks.
132 */ 133 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp) 134static inline u32 westwood_acked_count(struct sock *sk)
134{ 135{
135 struct westwood *w = tcp_ca(tp); 136 const struct tcp_sock *tp = tcp_sk(sk);
137 struct westwood *w = inet_csk_ca(sk);
136 138
137 w->cumul_ack = tp->snd_una - w->snd_una; 139 w->cumul_ack = tp->snd_una - w->snd_una;
138 140
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
160 return w->cumul_ack; 162 return w->cumul_ack;
161} 163}
162 164
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) 165static inline u32 westwood_bw_rttmin(const struct sock *sk)
164{ 166{
165 struct westwood *w = tcp_ca(tp); 167 const struct tcp_sock *tp = tcp_sk(sk);
168 const struct westwood *w = inet_csk_ca(sk);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); 169 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167} 170}
168 171
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
172 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 175 * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
173 * so avoids ever returning 0. 176 * so avoids ever returning 0.
174 */ 177 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) 178static u32 tcp_westwood_cwnd_min(struct sock *sk)
176{ 179{
177 return westwood_bw_rttmin(tp); 180 return westwood_bw_rttmin(sk);
178} 181}
179 182
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) 183static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
181{ 184{
182 struct westwood *w = tcp_ca(tp); 185 struct tcp_sock *tp = tcp_sk(sk);
186 struct westwood *w = inet_csk_ca(sk);
183 187
184 switch(event) { 188 switch(event) {
185 case CA_EVENT_FAST_ACK: 189 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp); 190 westwood_fast_bw(sk);
187 break; 191 break;
188 192
189 case CA_EVENT_COMPLETE_CWR: 193 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); 194 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
191 break; 195 break;
192 196
193 case CA_EVENT_FRTO: 197 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp); 198 tp->snd_ssthresh = westwood_bw_rttmin(sk);
195 break; 199 break;
196 200
197 case CA_EVENT_SLOW_ACK: 201 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp); 202 westwood_update_window(sk);
199 w->bk += westwood_acked_count(tp); 203 w->bk += westwood_acked_count(sk);
200 w->rtt_min = min(w->rtt, w->rtt_min); 204 w->rtt_min = min(w->rtt, w->rtt_min);
201 break; 205 break;
202 206
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
208 212
209 213
210/* Extract info for Tcp socket info provided via netlink. */ 214/* Extract info for Tcp socket info provided via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, 215static void tcp_westwood_info(struct sock *sk, u32 ext,
212 struct sk_buff *skb) 216 struct sk_buff *skb)
213{ 217{
214 const struct westwood *ca = tcp_ca(tp); 218 const struct westwood *ca = inet_csk_ca(sk);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { 219 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
216 struct rtattr *rta; 220 struct rtattr *rta;
217 struct tcpvegas_info *info; 221 struct tcpvegas_info *info;
218 222
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); 223 rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta); 224 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1; 225 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0; 226 info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
242 246
243static int __init tcp_westwood_register(void) 247static int __init tcp_westwood_register(void)
244{ 248{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); 249 BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood); 250 return tcp_register_congestion_control(&tcp_westwood);
247} 251}
248 252
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc4d07357e3a..e5beca7de86c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
95#include <linux/ipv6.h> 95#include <linux/ipv6.h>
96#include <linux/netdevice.h> 96#include <linux/netdevice.h>
97#include <net/snmp.h> 97#include <net/snmp.h>
98#include <net/tcp.h> 98#include <net/ip.h>
99#include <net/tcp_states.h>
99#include <net/protocol.h> 100#include <net/protocol.h>
100#include <linux/skbuff.h> 101#include <linux/skbuff.h>
101#include <linux/proc_fs.h> 102#include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
112 * Snmp MIB for the UDP layer 113 * Snmp MIB for the UDP layer
113 */ 114 */
114 115
115DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); 116DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
116 117
117struct hlist_head udp_hash[UDP_HTABLE_SIZE]; 118struct hlist_head udp_hash[UDP_HTABLE_SIZE];
118DEFINE_RWLOCK(udp_hash_lock); 119DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
628 /* ... which is an evident application bug. --ANK */ 629 /* ... which is an evident application bug. --ANK */
629 release_sock(sk); 630 release_sock(sk);
630 631
631 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); 632 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
632 err = -EINVAL; 633 err = -EINVAL;
633 goto out; 634 goto out;
634 } 635 }
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
693 if (unlikely(!up->pending)) { 694 if (unlikely(!up->pending)) {
694 release_sock(sk); 695 release_sock(sk);
695 696
696 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); 697 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
697 return -EINVAL; 698 return -EINVAL;
698 } 699 }
699 700
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1102 skb->ip_summed = CHECKSUM_UNNECESSARY; 1103 skb->ip_summed = CHECKSUM_UNNECESSARY;
1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1104 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1104 return 0; 1105 return 0;
1105 LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); 1106 LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
1106 skb->ip_summed = CHECKSUM_NONE; 1107 skb->ip_summed = CHECKSUM_NONE;
1107 } 1108 }
1108 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1109 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
1181 return(0); 1182 return(0);
1182 1183
1183short_packet: 1184short_packet:
1184 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", 1185 LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
1185 NIPQUAD(saddr), 1186 NIPQUAD(saddr),
1186 ntohs(uh->source), 1187 ntohs(uh->source),
1187 ulen, 1188 ulen,
1188 len, 1189 len,
1189 NIPQUAD(daddr), 1190 NIPQUAD(daddr),
1190 ntohs(uh->dest))); 1191 ntohs(uh->dest));
1191no_header: 1192no_header:
1192 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1193 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1193 kfree_skb(skb); 1194 kfree_skb(skb);
@@ -1198,12 +1199,12 @@ csum_error:
1198 * RFC1122: OK. Discards the bad packet silently (as far as 1199 * RFC1122: OK. Discards the bad packet silently (as far as
1199 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1200 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1200 */ 1201 */
1201 LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", 1202 LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
1202 NIPQUAD(saddr), 1203 NIPQUAD(saddr),
1203 ntohs(uh->source), 1204 ntohs(uh->source),
1204 NIPQUAD(daddr), 1205 NIPQUAD(daddr),
1205 ntohs(uh->dest), 1206 ntohs(uh->dest),
1206 ulen)); 1207 ulen);
1207drop: 1208drop:
1208 UDP_INC_STATS_BH(UDP_MIB_INERRORS); 1209 UDP_INC_STATS_BH(UDP_MIB_INERRORS);
1209 kfree_skb(skb); 1210 kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a967..d23e07fc81fa 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
128 xfrm_state_register_afinfo(&xfrm4_state_afinfo); 128 xfrm_state_register_afinfo(&xfrm4_state_afinfo);
129} 129}
130 130
131#if 0
131void __exit xfrm4_state_fini(void) 132void __exit xfrm4_state_fini(void)
132{ 133{
133 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); 134 xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
134} 135}
136#endif /* 0 */
135 137