aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig26
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c13
-rw-r--r--net/ipv4/ah4.c2
-rw-r--r--net/ipv4/devinet.c9
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/fib_frontend.c55
-rw-r--r--net/ipv4/fib_hash.c3
-rw-r--r--net/ipv4/fib_lookup.h3
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/fib_semantics.c10
-rw-r--r--net/ipv4/fib_trie.c2454
-rw-r--r--net/ipv4/icmp.c9
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c11
-rw-r--r--net/ipv4/ip_sockglue.c6
-rw-r--r--net/ipv4/ipcomp.c11
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c1
-rw-r--r--net/ipv4/multipath_drr.c2
-rw-r--r--net/ipv4/multipath_random.c2
-rw-r--r--net/ipv4/multipath_rr.c2
-rw-r--r--net/ipv4/multipath_wrandom.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c107
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c23
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c27
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c23
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c32
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c51
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c10
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c13
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c15
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c17
-rw-r--r--net/ipv4/netfilter/ipt_helper.c4
-rw-r--r--net/ipv4/netfilter/ipt_recent.c10
-rw-r--r--net/ipv4/raw.c22
-rw-r--r--net/ipv4/route.c19
-rw-r--r--net/ipv4/syncookies.c49
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c86
-rw-r--r--net/ipv4/tcp_diag.c37
-rw-r--r--net/ipv4/tcp_ipv4.c172
-rw-r--r--net/ipv4/tcp_minisocks.c68
-rw-r--r--net/ipv4/tcp_output.c27
-rw-r--r--net/ipv4/tcp_timer.c18
-rw-r--r--net/ipv4/xfrm4_output.c8
-rw-r--r--net/ipv4/xfrm4_state.c9
-rw-r--r--net/ipv4/xfrm4_tunnel.c2
57 files changed, 3029 insertions, 515 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f2..567b03b1c349 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,6 +1,32 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algoritm.
18 This improves lookup performance
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
4config IP_MULTICAST 30config IP_MULTICAST
5 bool "IP: multicasting" 31 bool "IP: multicasting"
6 depends on INET 32 depends on INET
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb6..65d57d8e1add 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,8 +7,10 @@ obj-y := utils.o route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o 10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 11
12obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
13obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
12obj-$(CONFIG_PROC_FS) += proc.o 14obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 15obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o 16obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b3cb49ce5fad..658e7977924d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1119,6 +1119,10 @@ module_init(inet_init);
1119#ifdef CONFIG_PROC_FS 1119#ifdef CONFIG_PROC_FS
1120extern int fib_proc_init(void); 1120extern int fib_proc_init(void);
1121extern void fib_proc_exit(void); 1121extern void fib_proc_exit(void);
1122#ifdef CONFIG_IP_FIB_TRIE
1123extern int fib_stat_proc_init(void);
1124extern void fib_stat_proc_exit(void);
1125#endif
1122extern int ip_misc_proc_init(void); 1126extern int ip_misc_proc_init(void);
1123extern int raw_proc_init(void); 1127extern int raw_proc_init(void);
1124extern void raw_proc_exit(void); 1128extern void raw_proc_exit(void);
@@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void)
1139 goto out_udp; 1143 goto out_udp;
1140 if (fib_proc_init()) 1144 if (fib_proc_init())
1141 goto out_fib; 1145 goto out_fib;
1146#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init())
1148 goto out_fib_stat;
1149 #endif
1142 if (ip_misc_proc_init()) 1150 if (ip_misc_proc_init())
1143 goto out_misc; 1151 goto out_misc;
1144out: 1152out:
1145 return rc; 1153 return rc;
1146out_misc: 1154out_misc:
1155#ifdef CONFIG_IP_FIB_TRIE
1156 fib_stat_proc_exit();
1157out_fib_stat:
1158#endif
1147 fib_proc_exit(); 1159 fib_proc_exit();
1148out_fib: 1160out_fib:
1149 udp4_proc_exit(); 1161 udp4_proc_exit();
@@ -1181,6 +1193,7 @@ EXPORT_SYMBOL(inet_stream_connect);
1181EXPORT_SYMBOL(inet_stream_ops); 1193EXPORT_SYMBOL(inet_stream_ops);
1182EXPORT_SYMBOL(inet_unregister_protosw); 1194EXPORT_SYMBOL(inet_unregister_protosw);
1183EXPORT_SYMBOL(net_statistics); 1195EXPORT_SYMBOL(net_statistics);
1196EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
1184 1197
1185#ifdef INET_REFCNT_DEBUG 1198#ifdef INET_REFCNT_DEBUG
1186EXPORT_SYMBOL(inet_sock_nr); 1199EXPORT_SYMBOL(inet_sock_nr);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f2235b6e..514c85b2631a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
200 xfrm_state_put(x); 200 xfrm_state_put(x);
201} 201}
202 202
203static int ah_init_state(struct xfrm_state *x, void *args) 203static int ah_init_state(struct xfrm_state *x)
204{ 204{
205 struct ah_data *ahp = NULL; 205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc; 206 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 478a30179a52..650dcb12d9a1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1030,14 +1030,13 @@ static struct notifier_block ip_netdev_notifier = {
1030}; 1030};
1031 1031
1032static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1032static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1033 u32 pid, u32 seq, int event) 1033 u32 pid, u32 seq, int event, unsigned int flags)
1034{ 1034{
1035 struct ifaddrmsg *ifm; 1035 struct ifaddrmsg *ifm;
1036 struct nlmsghdr *nlh; 1036 struct nlmsghdr *nlh;
1037 unsigned char *b = skb->tail; 1037 unsigned char *b = skb->tail;
1038 1038
1039 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); 1039 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
1040 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
1041 ifm = NLMSG_DATA(nlh); 1040 ifm = NLMSG_DATA(nlh);
1042 ifm->ifa_family = AF_INET; 1041 ifm->ifa_family = AF_INET;
1043 ifm->ifa_prefixlen = ifa->ifa_prefixlen; 1042 ifm->ifa_prefixlen = ifa->ifa_prefixlen;
@@ -1090,7 +1089,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1090 continue; 1089 continue;
1091 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, 1090 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
1092 cb->nlh->nlmsg_seq, 1091 cb->nlh->nlmsg_seq,
1093 RTM_NEWADDR) <= 0) { 1092 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1094 rcu_read_unlock(); 1093 rcu_read_unlock();
1095 goto done; 1094 goto done;
1096 } 1095 }
@@ -1113,7 +1112,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
1113 1112
1114 if (!skb) 1113 if (!skb)
1115 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); 1114 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
1116 else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { 1115 else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
1117 kfree_skb(skb); 1116 kfree_skb(skb);
1118 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); 1117 netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
1119 } else { 1118 } else {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index eae84cc39d3f..ba57446d5d1f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -362,7 +362,7 @@ static void esp_destroy(struct xfrm_state *x)
362 kfree(esp); 362 kfree(esp);
363} 363}
364 364
365static int esp_init_state(struct xfrm_state *x, void *args) 365static int esp_init_state(struct xfrm_state *x)
366{ 366{
367 struct esp_data *esp = NULL; 367 struct esp_data *esp = NULL;
368 368
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d612706..cd8e45ab9580 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,60 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
516#undef BRD1_OK 516#undef BRD1_OK
517} 517}
518 518
519static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
520{
521
522 struct fib_result res;
523 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
524 .fwmark = frn->fl_fwmark,
525 .tos = frn->fl_tos,
526 .scope = frn->fl_scope } } };
527 if (tb) {
528 local_bh_disable();
529
530 frn->tb_id = tb->tb_id;
531 frn->err = tb->tb_lookup(tb, &fl, &res);
532
533 if (!frn->err) {
534 frn->prefixlen = res.prefixlen;
535 frn->nh_sel = res.nh_sel;
536 frn->type = res.type;
537 frn->scope = res.scope;
538 }
539 local_bh_enable();
540 }
541}
542
543static void nl_fib_input(struct sock *sk, int len)
544{
545 struct sk_buff *skb = NULL;
546 struct nlmsghdr *nlh = NULL;
547 struct fib_result_nl *frn;
548 int err;
549 u32 pid;
550 struct fib_table *tb;
551
552 skb = skb_recv_datagram(sk, 0, 0, &err);
553 nlh = (struct nlmsghdr *)skb->data;
554
555 frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
556 tb = fib_get_table(frn->tb_id_in);
557
558 nl_fib_lookup(frn, tb);
559
560 pid = nlh->nlmsg_pid; /*pid of sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566}
567
568static void nl_fib_lookup_init(void)
569{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
571}
572
519static void fib_disable_ip(struct net_device *dev, int force) 573static void fib_disable_ip(struct net_device *dev, int force)
520{ 574{
521 if (fib_sync_down(0, dev, force)) 575 if (fib_sync_down(0, dev, force))
@@ -604,6 +658,7 @@ void __init ip_fib_init(void)
604 658
605 register_netdevice_notifier(&fib_netdev_notifier); 659 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier); 660 register_inetaddr_notifier(&fib_inetaddr_notifier);
661 nl_fib_lookup_init();
607} 662}
608 663
609EXPORT_SYMBOL(inet_addr_type); 664EXPORT_SYMBOL(inet_addr_type);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 6506dcc01b46..b10d6bb5ef3d 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -703,7 +703,8 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
703 &f->fn_key, 703 &f->fn_key,
704 fz->fz_order, 704 fz->fz_order,
705 fa->fa_tos, 705 fa->fa_tos,
706 fa->fa_info) < 0) { 706 fa->fa_info,
707 NLM_F_MULTI) < 0) {
707 cb->args[3] = i; 708 cb->args[3] = i;
708 return -1; 709 return -1;
709 } 710 }
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ac4485f75e97..b729d97cfa93 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,7 +30,8 @@ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
30 struct kern_rta *rta, struct fib_info *fi); 30 struct kern_rta *rta, struct fib_info *fi);
31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 31extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
32 u8 tb_id, u8 type, u8 scope, void *dst, 32 u8 tb_id, u8 type, u8 scope, void *dst,
33 int dst_len, u8 tos, struct fib_info *fi); 33 int dst_len, u8 tos, struct fib_info *fi,
34 unsigned int);
34extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, 35extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
35 int z, int tb_id, 36 int z, int tb_id,
36 struct nlmsghdr *n, struct netlink_skb_parms *req); 37 struct nlmsghdr *n, struct netlink_skb_parms *req);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 39d0aadb9a2a..0b298bbc1518 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -367,13 +367,14 @@ static struct notifier_block fib_rules_notifier = {
367 367
368static __inline__ int inet_fill_rule(struct sk_buff *skb, 368static __inline__ int inet_fill_rule(struct sk_buff *skb,
369 struct fib_rule *r, 369 struct fib_rule *r,
370 struct netlink_callback *cb) 370 struct netlink_callback *cb,
371 unsigned int flags)
371{ 372{
372 struct rtmsg *rtm; 373 struct rtmsg *rtm;
373 struct nlmsghdr *nlh; 374 struct nlmsghdr *nlh;
374 unsigned char *b = skb->tail; 375 unsigned char *b = skb->tail;
375 376
376 nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); 377 nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
377 rtm = NLMSG_DATA(nlh); 378 rtm = NLMSG_DATA(nlh);
378 rtm->rtm_family = AF_INET; 379 rtm->rtm_family = AF_INET;
379 rtm->rtm_dst_len = r->r_dst_len; 380 rtm->rtm_dst_len = r->r_dst_len;
@@ -422,7 +423,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
422 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { 423 for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
423 if (idx < s_idx) 424 if (idx < s_idx)
424 continue; 425 continue;
425 if (inet_fill_rule(skb, r, cb) < 0) 426 if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
426 break; 427 break;
427 } 428 }
428 read_unlock(&fib_rules_lock); 429 read_unlock(&fib_rules_lock);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 029362d66135..c886b28ba9f5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
276 struct nlmsghdr *n, struct netlink_skb_parms *req) 276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{ 277{
278 struct sk_buff *skb; 278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0; 279 u32 pid = req ? req->pid : n->nlmsg_pid;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281 281
282 skb = alloc_skb(size, GFP_KERNEL); 282 skb = alloc_skb(size, GFP_KERNEL);
@@ -286,7 +286,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, 286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z, 287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos, 288 fa->fa_tos,
289 fa->fa_info) < 0) { 289 fa->fa_info, 0) < 0) {
290 kfree_skb(skb); 290 kfree_skb(skb);
291 return; 291 return;
292 } 292 }
@@ -932,13 +932,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
932int 932int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, 934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi) 935 struct fib_info *fi, unsigned int flags)
936{ 936{
937 struct rtmsg *rtm; 937 struct rtmsg *rtm;
938 struct nlmsghdr *nlh; 938 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail; 939 unsigned char *b = skb->tail;
940 940
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); 941 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
942 rtm = NLMSG_DATA(nlh); 942 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET; 943 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len; 944 rtm->rtm_dst_len = dst_len;
@@ -1035,7 +1035,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1035 } 1035 }
1036 1036
1037 nl->nlmsg_flags = NLM_F_REQUEST; 1037 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0; 1038 nl->nlmsg_pid = current->pid;
1039 nl->nlmsg_seq = 0; 1039 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); 1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) { 1041 if (cmd == SIOCDELRT) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000000..0671569ee6f0
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2454 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.323"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
92#define NODE_PARENT(_node) \
93((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(_node, _ptr) \
95((_node)->_parent = (((unsigned long)(_ptr)) | \
96 ((_node)->_parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(_node, _type) \
98((_node)->_parent = (_type))
99#define NODE_TYPE(_node) \
100((_node)->_parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->_parent & T_LEAF))
103#define IS_LEAF(n) (n->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
149};
150
151struct trie {
152 struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
156 int size;
157 unsigned int revision;
158};
159
160static int trie_debug = 0;
161
162static int tnode_full(struct tnode *tn, struct node *n);
163static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173 struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176 struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
181static void trie_bug(char *err)
182{
183 printk("Trie Bug: %s\n", err);
184 BUG();
185}
186
187static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{
189 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child");
191
192 return tn->child[i];
193}
194
195static inline int tnode_child_length(struct tnode *tn)
196{
197 return 1<<tn->bits;
198}
199
200/*
201 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205
206 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208 -----------------------------------------------------------------
209 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
210
211 tp->pos = 7
212 tp->bits = 3
213 n->pos = 15
214 n->bits=4
215 KEYLENGTH=32
216*/
217
218static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219{
220 if (offset < KEYLENGTH)
221 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222 else
223 return 0;
224}
225
226static inline int tkey_equals(t_key a, t_key b)
227{
228 return a == b;
229}
230
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{
233 if (bits == 0 || offset >= KEYLENGTH)
234 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237}
238
239static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{
241 t_key diff = a ^ b;
242 int i = offset;
243
244 if(!diff)
245 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0)
247 i++;
248 return i;
249}
250
251/* Candiate for fib_semantics */
252
253static void fn_free_alias(struct fib_alias *fa)
254{
255 fib_release_info(fa->fa_info);
256 kmem_cache_free(fn_alias_kmem, fa);
257}
258
259/*
260 To understand this stuff, an understanding of keys and all their bits is
261 necessary. Every node in the trie has a key associated with it, but not
262 all of the bits in that key are significant.
263
264 Consider a node 'n' and its parent 'tp'.
265
266 If n is a leaf, every bit in its key is significant. Its presence is
267 necessitaded by path compression, since during a tree traversal (when
268 searching for a leaf - unless we are doing an insertion) we will completely
269 ignore all skipped bits we encounter. Thus we need to verify, at the end of
270 a potentially successful search, that we have indeed been walking the
271 correct key path.
272
273 Note that we can never "miss" the correct key in the tree if present by
274 following the wrong path. Path compression ensures that segments of the key
275 that are the same for all keys with a given prefix are skipped, but the
276 skipped part *is* identical for each node in the subtrie below the skipped
277 bit! trie_insert() in this implementation takes care of that - note the
278 call to tkey_sub_equals() in trie_insert().
279
280 if n is an internal node - a 'tnode' here, the various parts of its key
281 have many different meanings.
282
283 Example:
284 _________________________________________________________________
285 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286 -----------------------------------------------------------------
287 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
288
289 _________________________________________________________________
290 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291 -----------------------------------------------------------------
292 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
293
294 tp->pos = 7
295 tp->bits = 3
296 n->pos = 15
297 n->bits=4
298
299 First, let's just ignore the bits that come before the parent tp, that is
300 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301 not use them for anything.
302
303 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304 index into the parent's child array. That is, they will be used to find
305 'n' among tp's children.
306
307 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308 for the node n.
309
310 All the bits we have seen so far are significant to the node n. The rest
311 of the bits are really not needed or indeed known in n->key.
312
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child.
315
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point.
318
319*/
320
321static void check_tnode(struct tnode *tn)
322{
323 if(tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 }
326}
327
328static int halve_threshold = 25;
329static int inflate_threshold = 50;
330
331static struct leaf *leaf_new(void)
332{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) {
335 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list);
337 }
338 return l;
339}
340
341static struct leaf_info *leaf_info_new(int plen)
342{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen;
345 INIT_LIST_HEAD(&li->falh);
346 return li;
347}
348
349static inline void free_leaf(struct leaf *l)
350{
351 kfree(l);
352}
353
354static inline void free_leaf_info(struct leaf_info *li)
355{
356 kfree(li);
357}
358
359static struct tnode* tnode_new(t_key key, int pos, int bits)
360{
361 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL);
364
365 if(tn) {
366 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos;
369 tn->bits = bits;
370 tn->key = key;
371 tn->full_children = 0;
372 tn->empty_children = 1<<bits;
373 }
374 if(trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn;
378}
379
380static void tnode_free(struct tnode *tn)
381{
382 if(!tn) {
383 trie_bug("tnode_free\n");
384 }
385 if(IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 )
388 printk("FL %p \n", tn);
389 }
390 else if(IS_TNODE(tn)) {
391 kfree(tn);
392 if(trie_debug > 0 )
393 printk("FT %p \n", tn);
394 }
395 else {
396 trie_bug("tnode_free\n");
397 }
398}
399
400/*
401 * Check whether a tnode 'n' is "full", i.e. it is an internal node
402 * and no bits are skipped. See discussion in dyntree paper p. 6
403 */
404
405static inline int tnode_full(struct tnode *tn, struct node *n)
406{
407 if(n == NULL || IS_LEAF(n))
408 return 0;
409
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411}
412
413static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414{
415 tnode_put_child_reorg(tn, i, n, -1);
416}
417
418 /*
419 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children.
421 */
422
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{
425 struct node *chi;
426 int isfull;
427
428 if(i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits");
431 }
432 write_lock_bh(&fib_lock);
433 chi = tn->child[i];
434
435 /* update emptyChildren */
436 if (n == NULL && chi != NULL)
437 tn->empty_children++;
438 else if (n != NULL && chi == NULL)
439 tn->empty_children--;
440
441 /* update fullChildren */
442 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi);
444
445 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull)
447 tn->full_children--;
448
449 else if (!wasfull && isfull)
450 tn->full_children++;
451 if(n)
452 NODE_SET_PARENT(n, tn);
453
454 tn->child[i] = n;
455 write_unlock_bh(&fib_lock);
456}
457
458static struct node *resize(struct trie *t, struct tnode *tn)
459{
460 int i;
461
462 if (!tn)
463 return NULL;
464
465 if(trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold);
468
469 /* No children */
470 if (tn->empty_children == tnode_child_length(tn)) {
471 tnode_free(tn);
472 return NULL;
473 }
474 /* One child */
475 if (tn->empty_children == tnode_child_length(tn) - 1)
476 for (i = 0; i < tnode_child_length(tn); i++) {
477
478 write_lock_bh(&fib_lock);
479 if (tn->child[i] != NULL) {
480
481 /* compress one level */
482 struct node *n = tn->child[i];
483 if(n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n));
485
486 write_unlock_bh(&fib_lock);
487 tnode_free(tn);
488 return n;
489 }
490 write_unlock_bh(&fib_lock);
491 }
492 /*
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
495 */
496
497 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
503 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
510 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
517 *
518 * A clearer way to write this would be:
519 *
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children;
523 *
524 * new_child_length = tnode_child_length(tn) * 2;
525 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold)
529 *
530 * ...and so on, tho it would mess up the while() loop.
531 *
532 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold
535 *
536 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
539 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length
543 *
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
548 *
549 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn)
553 *
554 */
555
556 check_tnode(tn);
557
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
561
562 tn = inflate(t, tn);
563 }
564
565 check_tnode(tn);
566
567 /*
568 * Halve as long as the number of empty children in this
569 * node is above threshold.
570 */
571 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn))
574
575 tn = halve(t, tn);
576
577 /* Only one child remains */
578
579 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) {
581
582 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) {
584 /* compress one level */
585 struct node *n = tn->child[i];
586
587 if(n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n));
589
590 write_unlock_bh(&fib_lock);
591 tnode_free(tn);
592 return n;
593 }
594 write_unlock_bh(&fib_lock);
595 }
596
597 return (struct node *) tn;
598}
599
600static struct tnode *inflate(struct trie *t, struct tnode *tn)
601{
602 struct tnode *inode;
603 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn);
605 int i;
606
607 if(trie_debug)
608 printk("In inflate\n");
609
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611
612 if (!tn)
613 trie_bug("tnode_new failed");
614
615 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i);
617
618 /* An empty child */
619 if (node == NULL)
620 continue;
621
622 /* A leaf or an internal node with skipped bits */
623
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
627 1) == 0)
628 put_child(t, tn, 2*i, node);
629 else
630 put_child(t, tn, 2*i+1, node);
631 continue;
632 }
633
634 /* An internal node with two children */
635 inode = (struct tnode *) node;
636
637 if (inode->bits == 1) {
638 put_child(t, tn, 2*i, inode->child[0]);
639 put_child(t, tn, 2*i+1, inode->child[1]);
640
641 tnode_free(inode);
642 }
643
644 /* An internal node with more than two children */
645 else {
646 struct tnode *left, *right;
647 int size, j;
648
649 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have
652 * a position one bit further down the key and this
653 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the
661 * two new keys.
662 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos)
664 */
665
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673
674 if(!left)
675 trie_bug("tnode_new failed");
676
677
678 /* Use the old key, but set the new significant
679 * bit to one.
680 */
681 right = tnode_new(inode->key|m, inode->pos + 1,
682 inode->bits - 1);
683
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]);
690 put_child(t, right, j, inode->child[j + size]);
691 }
692 put_child(t, tn, 2*i, resize(t, left));
693 put_child(t, tn, 2*i+1, resize(t, right));
694
695 tnode_free(inode);
696 }
697 }
698 tnode_free(oldtnode);
699 return tn;
700}
701
702static struct tnode *halve(struct trie *t, struct tnode *tn)
703{
704 struct tnode *oldtnode = tn;
705 struct node *left, *right;
706 int i;
707 int olen = tnode_child_length(tn);
708
709 if(trie_debug) printk("In halve\n");
710
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712
713 if(!tn)
714 trie_bug("tnode_new failed");
715
716 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1);
719
720 /* At least one of the children is empty */
721 if (left == NULL) {
722 if (right == NULL) /* Both are empty */
723 continue;
724 put_child(t, tn, i/2, right);
725 } else if (right == NULL)
726 put_child(t, tn, i/2, left);
727
728 /* Two nonempty children */
729 else {
730 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1);
732
733 if(!newBinNode)
734 trie_bug("tnode_new failed");
735
736 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right);
738 put_child(t, tn, i/2, resize(t, newBinNode));
739 }
740 }
741 tnode_free(oldtnode);
742 return tn;
743}
744
745static void *trie_init(struct trie *t)
746{
747 if(t) {
748 t->size = 0;
749 t->trie = NULL;
750 t->revision = 0;
751#ifdef CONFIG_IP_FIB_TRIE_STATS
752 memset(&t->stats, 0, sizeof(struct trie_use_stats));
753#endif
754 }
755 return t;
756}
757
758static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
759{
760 struct hlist_node *node;
761 struct leaf_info *li;
762
763 hlist_for_each_entry(li, node, head, hlist) {
764
765 if ( li->plen == plen )
766 return li;
767 }
768 return NULL;
769}
770
771static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{
773 struct list_head *fa_head=NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen);
775
776 if(li)
777 fa_head = &li->falh;
778
779 return fa_head;
780}
781
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{
784 struct leaf_info *li=NULL, *last=NULL;
785 struct hlist_node *node, *tmp;
786
787 write_lock_bh(&fib_lock);
788
789 if(hlist_empty(head))
790 hlist_add_head(&new->hlist, head);
791 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793
794 if (new->plen > li->plen)
795 break;
796
797 last = li;
798 }
799 if(last)
800 hlist_add_after(&last->hlist, &new->hlist);
801 else
802 hlist_add_before(&new->hlist, &li->hlist);
803 }
804 write_unlock_bh(&fib_lock);
805}
806
807static struct leaf *
808fib_find_node(struct trie *t, u32 key)
809{
810 int pos;
811 struct tnode *tn;
812 struct node *n;
813
814 pos = 0;
815 n=t->trie;
816
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n;
819
820 check_tnode(tn);
821
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 }
826 else
827 break;
828 }
829 /* Case we have found a leaf. Compare prefixes */
830
831 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
832 struct leaf *l = (struct leaf *) n;
833 return l;
834 }
835 return NULL;
836}
837
838static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
839{
840 int i = 0;
841 int wasfull;
842 t_key cindex, key;
843 struct tnode *tp = NULL;
844
845 if(!tn)
846 BUG();
847
848 key = tn->key;
849 i = 0;
850
851 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852
853 if( i > 10 ) {
854 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856
857 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 }
860
861 if( i > 12 ) BUG();
862 i++;
863
864 tp = NODE_PARENT(tn);
865 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869
870 if(!NODE_PARENT(tn))
871 break;
872
873 tn = NODE_PARENT(tn);
874 }
875 /* Handle last (top) tnode */
876 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878
879 return (struct node*) tn;
880}
881
882static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen)
884{
885 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n;
888 struct leaf *l;
889 int missbit;
890 struct list_head *fa_head=NULL;
891 struct leaf_info *li;
892 t_key cindex;
893
894 pos = 0;
895 n=t->trie;
896
897 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in if, or we have reached an empty child slot,
899 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'!
903 *
904 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree.
906 *
907 * If it doesn't, we have to replace it with a new T_TNODE.
908 *
909 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE.
913 */
914
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn;
922 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924
925 if(n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG();
928 }
929 }
930 else
931 break;
932 }
933
934 /*
935 * n ----> NULL, LEAF or TNODE
936 *
937 * tp is n's (parent) ----> NULL or TNODE
938 */
939
940 if(tp && IS_LEAF(tp))
941 BUG();
942
943 t->revision++;
944
945 /* Case 1: n is a leaf. Compare prefixes */
946
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n;
949
950 li = leaf_info_new(plen);
951
952 if(! li)
953 BUG();
954
955 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li);
957 goto done;
958 }
959 t->size++;
960 l = leaf_new();
961
962 if(! l)
963 BUG();
964
965 l->key = key;
966 li = leaf_info_new(plen);
967
968 if(! li)
969 BUG();
970
971 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li);
973
974 /* Case 2: n is NULL, and will just insert a new leaf */
975 if (t->trie && n == NULL) {
976
977 NODE_SET_PARENT(l, tp);
978
979 if (!tp)
980 BUG();
981
982 else {
983 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
984 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
985 }
986 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else {
989 /*
990 * Add a new tnode here
991 * first tnode need some special handling
992 */
993
994 if (tp)
995 pos=tp->pos+tp->bits;
996 else
997 pos=0;
998 if(n) {
999 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1);
1001 }
1002 else {
1003 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008
1009 NODE_SET_PARENT(tn, tp);
1010
1011 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n);
1014
1015 if(tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 }
1019 else {
1020 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn;
1022 }
1023 }
1024 if(tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen);
1027 }
1028 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp);
1030done:;
1031 return fa_head;
1032}
1033
1034static int
1035fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037{
1038 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL;
1041 struct fib_info *fi;
1042 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type;
1044 u8 tos = r->rtm_tos;
1045 u32 key, mask;
1046 int err;
1047 struct leaf *l;
1048
1049 if (plen > 32)
1050 return -EINVAL;
1051
1052 key = 0;
1053 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4);
1055
1056 key = ntohl(key);
1057
1058 if(trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060
1061 mask = ntohl( inet_make_mask(plen) );
1062
1063 if(key & ~mask)
1064 return -EINVAL;
1065
1066 key = key & mask;
1067
1068 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069 goto err;
1070
1071 l = fib_find_node(t, key);
1072 fa = NULL;
1073
1074 if(l) {
1075 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 }
1078
1079 /* Now fa, if non-NULL, points to the first fib alias
1080 * with the same keys [prefix,tos,priority], if such key already
1081 * exists or to the node before which we will insert new one.
1082 *
1083 * If fa is NULL, we will need to allocate a new one and
1084 * insert to the head of f.
1085 *
1086 * If f is NULL, no fib node matched the destination key
1087 * and we need to allocate a new one of those as well.
1088 */
1089
1090 if (fa &&
1091 fa->fa_info->fib_priority == fi->fib_priority) {
1092 struct fib_alias *fa_orig;
1093
1094 err = -EEXIST;
1095 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096 goto out;
1097
1098 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099 struct fib_info *fi_drop;
1100 u8 state;
1101
1102 write_lock_bh(&fib_lock);
1103
1104 fi_drop = fa->fa_info;
1105 fa->fa_info = fi;
1106 fa->fa_type = type;
1107 fa->fa_scope = r->rtm_scope;
1108 state = fa->fa_state;
1109 fa->fa_state &= ~FA_S_ACCESSED;
1110
1111 write_unlock_bh(&fib_lock);
1112
1113 fib_release_info(fi_drop);
1114 if (state & FA_S_ACCESSED)
1115 rt_cache_flush(-1);
1116
1117 goto succeeded;
1118 }
1119 /* Error if we find a perfect match which
1120 * uses the same scope, type, and nexthop
1121 * information.
1122 */
1123 fa_orig = fa;
1124 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125 if (fa->fa_tos != tos)
1126 break;
1127 if (fa->fa_info->fib_priority != fi->fib_priority)
1128 break;
1129 if (fa->fa_type == type &&
1130 fa->fa_scope == r->rtm_scope &&
1131 fa->fa_info == fi) {
1132 goto out;
1133 }
1134 }
1135 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136 fa = fa_orig;
1137 }
1138 err = -ENOENT;
1139 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140 goto out;
1141
1142 err = -ENOBUFS;
1143 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144 if (new_fa == NULL)
1145 goto out;
1146
1147 new_fa->fa_info = fi;
1148 new_fa->fa_tos = tos;
1149 new_fa->fa_type = type;
1150 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0;
1152#if 0
1153 new_fa->dst = NULL;
1154#endif
1155 /*
1156 * Insert new entry to the list.
1157 */
1158
1159 if(!fa_head)
1160 fa_head = fib_insert_node(t, key, plen);
1161
1162 write_lock_bh(&fib_lock);
1163
1164 list_add_tail(&new_fa->fa_list,
1165 (fa ? &fa->fa_list : fa_head));
1166
1167 write_unlock_bh(&fib_lock);
1168
1169 rt_cache_flush(-1);
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded:
1172 return 0;
1173out:
1174 fib_release_info(fi);
1175err:;
1176 return err;
1177}
1178
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err)
1181{
1182 int i;
1183 t_key mask;
1184 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node;
1187
1188 hlist_for_each_entry(li, node, hhead, hlist) {
1189
1190 i = li->plen;
1191 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask))
1193 continue;
1194
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++;
1199#endif
1200 return 1;
1201 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++;
1204#endif
1205 }
1206 return 0;
1207}
1208
1209static int
1210fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211{
1212 struct trie *t = (struct trie *) tb->tb_data;
1213 int plen, ret = 0;
1214 struct node *n;
1215 struct tnode *pn;
1216 int pos, bits;
1217 t_key key=ntohl(flp->fl4_dst);
1218 int chopped_off;
1219 t_key cindex = 0;
1220 int current_prefix_length = KEYLENGTH;
1221 n = t->trie;
1222
1223 read_lock(&fib_lock);
1224 if(!n)
1225 goto failed;
1226
1227#ifdef CONFIG_IP_FIB_TRIE_STATS
1228 t->stats.gets++;
1229#endif
1230
1231 /* Just a leaf? */
1232 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234 goto found;
1235 goto failed;
1236 }
1237 pn = (struct tnode *) n;
1238 chopped_off = 0;
1239
1240 while (pn) {
1241
1242 pos = pn->pos;
1243 bits = pn->bits;
1244
1245 if(!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247
1248 n = tnode_get_child(pn, cindex);
1249
1250 if (n == NULL) {
1251#ifdef CONFIG_IP_FIB_TRIE_STATS
1252 t->stats.null_node_hit++;
1253#endif
1254 goto backtrace;
1255 }
1256
1257 if (IS_TNODE(n)) {
1258#define HL_OPTIMIZE
1259#ifdef HL_OPTIMIZE
1260 struct tnode *cn = (struct tnode *)n;
1261 t_key node_prefix, key_prefix, pref_mismatch;
1262 int mp;
1263
1264 /*
1265 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (se previous
1274 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown.
1276 *
1277 * The skipped bits are key[pos+bits..cn->pos].
1278 */
1279
1280 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped of
1287 * *are* zero.
1288 */
1289
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291
1292 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 ||
1295 !(cn->child[0]))
1296 goto backtrace;
1297 }
1298
1299 /*
1300 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point.
1307 */
1308
1309 /* The only thing we can do at this point is to verify that
1310 * any such matching prefix can indeed be a prefix to our
1311 * key, and if the bits in the node we are inspecting that
1312 * do not match our key are not ZERO, this cannot be true.
1313 * Thus, find out where there is a mismatch (before cn->pos)
1314 * and verify that all the mismatching bits are zero in the
1315 * new tnode's key.
1316 */
1317
1318 /* Note: We aren't very concerned about the piece of the key
1319 * that precede pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix.
1326 */
1327
1328 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0;
1332
1333 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state.directly.
1335 */
1336 if (pref_mismatch) {
1337 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338 mp++;
1339 pref_mismatch = pref_mismatch <<1;
1340 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342
1343 if (key_prefix != 0)
1344 goto backtrace;
1345
1346 if (current_prefix_length >= cn->pos)
1347 current_prefix_length=mp;
1348 }
1349#endif
1350 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0;
1352 continue;
1353 }
1354 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356 goto found;
1357 }
1358backtrace:
1359 chopped_off++;
1360
1361 /* As zero don't change the child key (cindex) */
1362 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363 chopped_off++;
1364 }
1365
1366 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369
1370 /*
1371 * Either we do the actual chop off according or if we have
1372 * chopped off all bits in this tnode walk up to our parent.
1373 */
1374
1375 if(chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1));
1377 else {
1378 if( NODE_PARENT(pn) == NULL)
1379 goto failed;
1380
1381 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn);
1384 chopped_off = 0;
1385
1386#ifdef CONFIG_IP_FIB_TRIE_STATS
1387 t->stats.backtrack++;
1388#endif
1389 goto backtrace;
1390 }
1391 }
1392failed:
1393 ret = 1;
1394found:
1395 read_unlock(&fib_lock);
1396 return ret;
1397}
1398
1399static int trie_leaf_remove(struct trie *t, t_key key)
1400{
1401 t_key cindex;
1402 struct tnode *tp = NULL;
1403 struct node *n = t->trie;
1404 struct leaf *l;
1405
1406 if(trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n);
1408
1409 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key.
1412 */
1413
1414 while (n != NULL && IS_TNODE(n)) {
1415 struct tnode *tn = (struct tnode *) n;
1416 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418
1419 if(n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG();
1422 }
1423 }
1424 l = (struct leaf *) n;
1425
1426 if(!n || !tkey_equals(l->key, key))
1427 return 0;
1428
1429 /*
1430 * Key found.
1431 * Remove the leaf and rebalance the tree
1432 */
1433
1434 t->revision++;
1435 t->size--;
1436
1437 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n);
1439
1440 if(tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp);
1444 }
1445 else
1446 t->trie = NULL;
1447
1448 return 1;
1449}
1450
1451static int
1452fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454{
1455 struct trie *t = (struct trie *) tb->tb_data;
1456 u32 key, mask;
1457 int plen = r->rtm_dst_len;
1458 u8 tos = r->rtm_tos;
1459 struct fib_alias *fa, *fa_to_delete;
1460 struct list_head *fa_head;
1461 struct leaf *l;
1462
1463 if (plen > 32)
1464 return -EINVAL;
1465
1466 key = 0;
1467 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4);
1469
1470 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) );
1472
1473 if(key & ~mask)
1474 return -EINVAL;
1475
1476 key = key & mask;
1477 l = fib_find_node(t, key);
1478
1479 if(!l)
1480 return -ESRCH;
1481
1482 fa_head = get_fa_head(l, plen);
1483 fa = fib_find_alias(fa_head, tos, 0);
1484
1485 if (!fa)
1486 return -ESRCH;
1487
1488 if (trie_debug)
1489 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490
1491 fa_to_delete = NULL;
1492 fa_head = fa->fa_list.prev;
1493 list_for_each_entry(fa, fa_head, fa_list) {
1494 struct fib_info *fi = fa->fa_info;
1495
1496 if (fa->fa_tos != tos)
1497 break;
1498
1499 if ((!r->rtm_type ||
1500 fa->fa_type == r->rtm_type) &&
1501 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502 fa->fa_scope == r->rtm_scope) &&
1503 (!r->rtm_protocol ||
1504 fi->fib_protocol == r->rtm_protocol) &&
1505 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506 fa_to_delete = fa;
1507 break;
1508 }
1509 }
1510
1511 if (fa_to_delete) {
1512 int kill_li = 0;
1513 struct leaf_info *li;
1514
1515 fa = fa_to_delete;
1516 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517
1518 l = fib_find_node(t, key);
1519 li = find_leaf_info(&l->list, plen);
1520
1521 write_lock_bh(&fib_lock);
1522
1523 list_del(&fa->fa_list);
1524
1525 if(list_empty(fa_head)) {
1526 hlist_del(&li->hlist);
1527 kill_li = 1;
1528 }
1529 write_unlock_bh(&fib_lock);
1530
1531 if(kill_li)
1532 free_leaf_info(li);
1533
1534 if(hlist_empty(&l->list))
1535 trie_leaf_remove(t, key);
1536
1537 if (fa->fa_state & FA_S_ACCESSED)
1538 rt_cache_flush(-1);
1539
1540 fn_free_alias(fa);
1541 return 0;
1542 }
1543 return -ESRCH;
1544}
1545
1546static int trie_flush_list(struct trie *t, struct list_head *head)
1547{
1548 struct fib_alias *fa, *fa_node;
1549 int found = 0;
1550
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info;
1553
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555
1556 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock);
1559
1560 fn_free_alias(fa);
1561 found++;
1562 }
1563 }
1564 return found;
1565}
1566
1567static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568{
1569 int found = 0;
1570 struct hlist_head *lih = &l->list;
1571 struct hlist_node *node, *tmp;
1572 struct leaf_info *li = NULL;
1573
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575
1576 found += trie_flush_list(t, &li->falh);
1577
1578 if (list_empty(&li->falh)) {
1579
1580 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock);
1583
1584 free_leaf_info(li);
1585 }
1586 }
1587 return found;
1588}
1589
1590static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591{
1592 struct node *c = (struct node *) thisleaf;
1593 struct tnode *p;
1594 int idx;
1595
1596 if(c == NULL) {
1597 if(t->trie == NULL)
1598 return NULL;
1599
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1601 return (struct leaf *) t->trie;
1602
1603 p = (struct tnode*) t->trie; /* Start */
1604 }
1605 else
1606 p = (struct tnode *) NODE_PARENT(c);
1607 while (p) {
1608 int pos, last;
1609
1610 /* Find the next child of the parent */
1611 if(c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else
1614 pos = 0;
1615
1616 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) {
1619
1620 /* Decend if tnode */
1621
1622 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx];
1624 idx = 0;
1625
1626 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) )
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629
1630 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up;
1633 }
1634 return (struct leaf*) p->child[idx];
1635 }
1636 }
1637up:
1638 /* No more children go up one step */
1639 c = (struct node*) p;
1640 p = (struct tnode *) NODE_PARENT(p);
1641 }
1642 return NULL; /* Ready. Root of trie */
1643}
1644
1645static int fn_trie_flush(struct fib_table *tb)
1646{
1647 struct trie *t = (struct trie *) tb->tb_data;
1648 struct leaf *ll = NULL, *l = NULL;
1649 int found = 0, h;
1650
1651 t->revision++;
1652
1653 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654 found += trie_flush_leaf(t, l);
1655
1656 if (ll && hlist_empty(&ll->list))
1657 trie_leaf_remove(t, ll->key);
1658 ll = l;
1659 }
1660
1661 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key);
1663
1664 if(trie_debug)
1665 printk("trie_flush found=%d\n", found);
1666 return found;
1667}
1668
1669static int trie_last_dflt=-1;
1670
1671static void
1672fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1673{
1674 struct trie *t = (struct trie *) tb->tb_data;
1675 int order, last_idx;
1676 struct fib_info *fi = NULL;
1677 struct fib_info *last_resort;
1678 struct fib_alias *fa = NULL;
1679 struct list_head *fa_head;
1680 struct leaf *l;
1681
1682 last_idx = -1;
1683 last_resort = NULL;
1684 order = -1;
1685
1686 read_lock(&fib_lock);
1687
1688 l = fib_find_node(t, 0);
1689 if(!l)
1690 goto out;
1691
1692 fa_head = get_fa_head(l, 0);
1693 if(!fa_head)
1694 goto out;
1695
1696 if (list_empty(fa_head))
1697 goto out;
1698
1699 list_for_each_entry(fa, fa_head, fa_list) {
1700 struct fib_info *next_fi = fa->fa_info;
1701
1702 if (fa->fa_scope != res->scope ||
1703 fa->fa_type != RTN_UNICAST)
1704 continue;
1705
1706 if (next_fi->fib_priority > res->fi->fib_priority)
1707 break;
1708 if (!next_fi->fib_nh[0].nh_gw ||
1709 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 continue;
1711 fa->fa_state |= FA_S_ACCESSED;
1712
1713 if (fi == NULL) {
1714 if (next_fi != res->fi)
1715 break;
1716 } else if (!fib_detect_death(fi, order, &last_resort,
1717 &last_idx, &trie_last_dflt)) {
1718 if (res->fi)
1719 fib_info_put(res->fi);
1720 res->fi = fi;
1721 atomic_inc(&fi->fib_clntref);
1722 trie_last_dflt = order;
1723 goto out;
1724 }
1725 fi = next_fi;
1726 order++;
1727 }
1728 if (order <= 0 || fi == NULL) {
1729 trie_last_dflt = -1;
1730 goto out;
1731 }
1732
1733 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1734 if (res->fi)
1735 fib_info_put(res->fi);
1736 res->fi = fi;
1737 atomic_inc(&fi->fib_clntref);
1738 trie_last_dflt = order;
1739 goto out;
1740 }
1741 if (last_idx >= 0) {
1742 if (res->fi)
1743 fib_info_put(res->fi);
1744 res->fi = last_resort;
1745 if (last_resort)
1746 atomic_inc(&last_resort->fib_clntref);
1747 }
1748 trie_last_dflt = last_idx;
1749 out:;
1750 read_unlock(&fib_lock);
1751}
1752
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb)
1755{
1756 int i, s_i;
1757 struct fib_alias *fa;
1758
1759 u32 xkey=htonl(key);
1760
1761 s_i=cb->args[3];
1762 i = 0;
1763
1764 list_for_each_entry(fa, fah, fa_list) {
1765 if (i < s_i) {
1766 i++;
1767 continue;
1768 }
1769 if (fa->fa_info->fib_nh == NULL) {
1770 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771 i++;
1772 continue;
1773 }
1774 if (fa->fa_info == NULL) {
1775 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776 i++;
1777 continue;
1778 }
1779
1780 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781 cb->nlh->nlmsg_seq,
1782 RTM_NEWROUTE,
1783 tb->tb_id,
1784 fa->fa_type,
1785 fa->fa_scope,
1786 &xkey,
1787 plen,
1788 fa->fa_tos,
1789 fa->fa_info, 0) < 0) {
1790 cb->args[3] = i;
1791 return -1;
1792 }
1793 i++;
1794 }
1795 cb->args[3]=i;
1796 return skb->len;
1797}
1798
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb)
1801{
1802 int h, s_h;
1803 struct list_head *fa_head;
1804 struct leaf *l = NULL;
1805 s_h=cb->args[2];
1806
1807 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808
1809 if (h < s_h)
1810 continue;
1811 if (h > s_h)
1812 memset(&cb->args[3], 0,
1813 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814
1815 fa_head = get_fa_head(l, plen);
1816
1817 if(!fa_head)
1818 continue;
1819
1820 if(list_empty(fa_head))
1821 continue;
1822
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824 cb->args[2]=h;
1825 return -1;
1826 }
1827 }
1828 cb->args[2]=h;
1829 return skb->len;
1830}
1831
1832static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833{
1834 int m, s_m;
1835 struct trie *t = (struct trie *) tb->tb_data;
1836
1837 s_m = cb->args[1];
1838
1839 read_lock(&fib_lock);
1840 for (m=0; m<=32; m++) {
1841
1842 if (m < s_m)
1843 continue;
1844 if (m > s_m)
1845 memset(&cb->args[2], 0,
1846 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847
1848 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849 cb->args[1] = m;
1850 goto out;
1851 }
1852 }
1853 read_unlock(&fib_lock);
1854 cb->args[1] = m;
1855 return skb->len;
1856 out:
1857 read_unlock(&fib_lock);
1858 return -1;
1859}
1860
1861/* Fix more generic FIB names for init later */
1862
1863#ifdef CONFIG_IP_MULTIPLE_TABLES
1864struct fib_table * fib_hash_init(int id)
1865#else
1866struct fib_table * __init fib_hash_init(int id)
1867#endif
1868{
1869 struct fib_table *tb;
1870 struct trie *t;
1871
1872 if (fn_alias_kmem == NULL)
1873 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874 sizeof(struct fib_alias),
1875 0, SLAB_HWCACHE_ALIGN,
1876 NULL, NULL);
1877
1878 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879 GFP_KERNEL);
1880 if (tb == NULL)
1881 return NULL;
1882
1883 tb->tb_id = id;
1884 tb->tb_lookup = fn_trie_lookup;
1885 tb->tb_insert = fn_trie_insert;
1886 tb->tb_delete = fn_trie_delete;
1887 tb->tb_flush = fn_trie_flush;
1888 tb->tb_select_default = fn_trie_select_default;
1889 tb->tb_dump = fn_trie_dump;
1890 memset(tb->tb_data, 0, sizeof(struct trie));
1891
1892 t = (struct trie *) tb->tb_data;
1893
1894 trie_init(t);
1895
1896 if (id == RT_TABLE_LOCAL)
1897 trie_local=t;
1898 else if (id == RT_TABLE_MAIN)
1899 trie_main=t;
1900
1901 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903
1904 return tb;
1905}
1906
1907/* Trie dump functions */
1908
1909static void putspace_seq(struct seq_file *seq, int n)
1910{
1911 while (n--) seq_printf(seq, " ");
1912}
1913
1914static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1915{
1916 while (bits--)
1917 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918}
1919
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits)
1922{
1923 putspace_seq(seq, indent);
1924 if (IS_LEAF(n))
1925 seq_printf(seq, "|");
1926 else
1927 seq_printf(seq, "+");
1928 if (bits) {
1929 seq_printf(seq, "%d/", cindex);
1930 printbin_seq(seq, cindex, bits);
1931 seq_printf(seq, ": ");
1932 }
1933 else
1934 seq_printf(seq, "<root>: ");
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936
1937 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else {
1941 int plen=((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 }
1946 if (IS_LEAF(n)) {
1947 struct leaf *l=(struct leaf *)n;
1948 struct fib_alias *fa;
1949 int i;
1950 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) {
1952
1953 struct list_head *fa_head = get_fa_head(l, i);
1954
1955 if(!fa_head)
1956 continue;
1957
1958 if(list_empty(fa_head))
1959 continue;
1960
1961 putspace_seq(seq, indent+2);
1962 seq_printf(seq, "{/%d...dumping}\n", i);
1963
1964
1965 list_for_each_entry(fa, fa_head, fa_list) {
1966 putspace_seq(seq, indent+2);
1967 if (fa->fa_info->fib_nh == NULL) {
1968 seq_printf(seq, "Error _fib_nh=NULL\n");
1969 continue;
1970 }
1971 if (fa->fa_info == NULL) {
1972 seq_printf(seq, "Error fa_info=NULL\n");
1973 continue;
1974 }
1975
1976 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977 fa->fa_type,
1978 fa->fa_scope,
1979 fa->fa_tos);
1980 }
1981 }
1982 }
1983 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988 seq_printf(seq, "}\n");
1989 putspace_seq(seq, indent); seq_printf(seq, "| ");
1990 seq_printf(seq, "{pos=%d", tn->pos);
1991 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993 putspace_seq(seq, indent); seq_printf(seq, "| ");
1994 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995 }
1996}
1997
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{
2000 struct node *n=t->trie;
2001 int cindex=0;
2002 int indent=1;
2003 int pend=0;
2004 int depth = 0;
2005
2006 read_lock(&fib_lock);
2007
2008 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n;
2013 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3;
2016 depth++;
2017
2018 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) {
2020
2021 /* Got a child */
2022
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++;
2026
2027 }
2028 else {
2029 /*
2030 * New tnode. Decend one level
2031 */
2032
2033 depth++;
2034 n=tn->child[cindex];
2035 tn=(struct tnode *)n;
2036 pend=tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3;
2039 cindex=0;
2040 }
2041 }
2042 else
2043 cindex++;
2044
2045 /*
2046 * Test if we are done
2047 */
2048
2049 while (cindex >= (1 << tn->bits)) {
2050
2051 /*
2052 * Move upwards and test for root
2053 * pop off all traversed nodes
2054 */
2055
2056 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL;
2058 n = NULL;
2059 break;
2060 }
2061 else {
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn);
2064 cindex++;
2065 n=(struct node *)tn;
2066 pend=tn->pos+tn->bits;
2067 indent-=3;
2068 depth--;
2069 }
2070 }
2071 }
2072 }
2073 else n = NULL;
2074 }
2075 else seq_printf(seq, "------ trie is empty\n");
2076
2077 read_unlock(&fib_lock);
2078}
2079
2080static struct trie_stat *trie_stat_new(void)
2081{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i;
2084
2085 if(s) {
2086 s->totdepth = 0;
2087 s->maxdepth = 0;
2088 s->tnodes = 0;
2089 s->leaves = 0;
2090 s->nullpointers = 0;
2091
2092 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0;
2094 }
2095 return s;
2096}
2097
2098static struct trie_stat *trie_collect_stats(struct trie *t)
2099{
2100 struct node *n=t->trie;
2101 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0;
2103 int indent = 1;
2104 int pend = 0;
2105 int depth = 0;
2106
2107 read_lock(&fib_lock);
2108
2109 if (s) {
2110 if (n) {
2111 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits;
2114 indent += 3;
2115 s->nodesizes[tn->bits]++;
2116 depth++;
2117
2118 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) {
2120 /* Got a child */
2121
2122 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++;
2124
2125 /* stats */
2126 if (depth > s->maxdepth)
2127 s->maxdepth = depth;
2128 s->totdepth += depth;
2129 s->leaves++;
2130 }
2131
2132 else {
2133 /*
2134 * New tnode. Decend one level
2135 */
2136
2137 s->tnodes++;
2138 s->nodesizes[tn->bits]++;
2139 depth++;
2140
2141 n = tn->child[cindex];
2142 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits;
2144
2145 indent += 3;
2146 cindex = 0;
2147 }
2148 }
2149 else {
2150 cindex++;
2151 s->nullpointers++;
2152 }
2153
2154 /*
2155 * Test if we are done
2156 */
2157
2158 while (cindex >= (1 << tn->bits)) {
2159
2160 /*
2161 * Move upwards and test for root
2162 * pop off all traversed nodes
2163 */
2164
2165
2166 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL;
2168 n = NULL;
2169 break;
2170 }
2171 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn);
2174 cindex++;
2175 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits;
2177 indent -= 3;
2178 depth--;
2179 }
2180 }
2181 }
2182 }
2183 else n = NULL;
2184 }
2185 }
2186
2187 read_unlock(&fib_lock);
2188 return s;
2189}
2190
2191#ifdef CONFIG_PROC_FS
2192
2193static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2194{
2195 return NULL;
2196}
2197
2198static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2199{
2200 return NULL;
2201}
2202
2203static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204{
2205 void *v = NULL;
2206
2207 if (ip_fib_main_table)
2208 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209 return v;
2210}
2211
2212static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213{
2214 ++*pos;
2215 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216}
2217
2218static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2219{
2220
2221}
2222
2223/*
2224 * This outputs /proc/net/fib_triestats
2225 *
2226 * It always works in backward compatibility mode.
2227 * The format of the file is not supposed to be changed.
2228 */
2229
2230static void collect_and_show(struct trie *t, struct seq_file *seq)
2231{
2232 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233 int i, max, pointers;
2234 struct trie_stat *stat;
2235 int avdepth;
2236
2237 stat = trie_collect_stats(t);
2238
2239 bytes=0;
2240 seq_printf(seq, "trie=%p\n", t);
2241
2242 if (stat) {
2243 if (stat->leaves)
2244 avdepth=stat->totdepth*100 / stat->leaves;
2245 else
2246 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253 bytes += sizeof(struct tnode) * stat->tnodes;
2254
2255 max = MAX_CHILDS-1;
2256
2257 while (max >= 0 && stat->nodesizes[max] == 0)
2258 max--;
2259 pointers = 0;
2260
2261 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i];
2265 }
2266 seq_printf(seq, "\n");
2267 seq_printf(seq, "Pointers: %d\n", pointers);
2268 bytes += sizeof(struct node *) * pointers;
2269 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2271
2272 kfree(stat);
2273 }
2274
2275#ifdef CONFIG_IP_FIB_TRIE_STATS
2276 seq_printf(seq, "Counters:\n---------\n");
2277 seq_printf(seq,"gets = %d\n", t->stats.gets);
2278 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif
2285#endif /* CONFIG_IP_FIB_TRIE_STATS */
2286}
2287
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{
2290 char bf[128];
2291
2292 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local)
2296 collect_and_show(trie_local, seq);
2297
2298 if (trie_main)
2299 collect_and_show(trie_main, seq);
2300 }
2301 else {
2302 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400);
2304
2305 seq_printf(seq, "%-127s\n", bf);
2306 }
2307 return 0;
2308}
2309
2310static struct seq_operations fib_triestat_seq_ops = {
2311 .start = fib_triestat_seq_start,
2312 .next = fib_triestat_seq_next,
2313 .stop = fib_triestat_seq_stop,
2314 .show = fib_triestat_seq_show,
2315};
2316
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318{
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321
2322 rc = seq_open(file, &fib_triestat_seq_ops);
2323 if (rc)
2324 goto out_kfree;
2325
2326 seq = file->private_data;
2327out:
2328 return rc;
2329out_kfree:
2330 goto out;
2331}
2332
2333static struct file_operations fib_triestat_seq_fops = {
2334 .owner = THIS_MODULE,
2335 .open = fib_triestat_seq_open,
2336 .read = seq_read,
2337 .llseek = seq_lseek,
2338 .release = seq_release_private,
2339};
2340
2341int __init fib_stat_proc_init(void)
2342{
2343 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344 return -ENOMEM;
2345 return 0;
2346}
2347
2348void __init fib_stat_proc_exit(void)
2349{
2350 proc_net_remove("fib_triestat");
2351}
2352
2353static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2354{
2355 return NULL;
2356}
2357
2358static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2359{
2360 return NULL;
2361}
2362
2363static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364{
2365 void *v = NULL;
2366
2367 if (ip_fib_main_table)
2368 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369 return v;
2370}
2371
2372static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373{
2374 ++*pos;
2375 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376}
2377
2378static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2379{
2380
2381}
2382
2383/*
2384 * This outputs /proc/net/fib_trie.
2385 *
2386 * It always works in backward compatibility mode.
2387 * The format of the file is not supposed to be changed.
2388 */
2389
2390static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{
2392 char bf[128];
2393
2394 if (v == SEQ_START_TOKEN) {
2395 if (trie_local)
2396 trie_dump_seq(seq, trie_local);
2397
2398 if (trie_main)
2399 trie_dump_seq(seq, trie_main);
2400 }
2401
2402 else {
2403 snprintf(bf, sizeof(bf),
2404 "*\t%08X\t%08X", 200, 400);
2405 seq_printf(seq, "%-127s\n", bf);
2406 }
2407
2408 return 0;
2409}
2410
2411static struct seq_operations fib_trie_seq_ops = {
2412 .start = fib_trie_seq_start,
2413 .next = fib_trie_seq_next,
2414 .stop = fib_trie_seq_stop,
2415 .show = fib_trie_seq_show,
2416};
2417
2418static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419{
2420 struct seq_file *seq;
2421 int rc = -ENOMEM;
2422
2423 rc = seq_open(file, &fib_trie_seq_ops);
2424 if (rc)
2425 goto out_kfree;
2426
2427 seq = file->private_data;
2428out:
2429 return rc;
2430out_kfree:
2431 goto out;
2432}
2433
2434static struct file_operations fib_trie_seq_fops = {
2435 .owner = THIS_MODULE,
2436 .open = fib_trie_seq_open,
2437 .read = seq_read,
2438 .llseek = seq_lseek,
2439 .release = seq_release_private,
2440};
2441
2442int __init fib_proc_init(void)
2443{
2444 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445 return -ENOMEM;
2446 return 0;
2447}
2448
2449void __init fib_proc_exit(void)
2450{
2451 proc_net_remove("fib_trie");
2452}
2453
2454#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 85bf0d3e294b..cb759484979d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -207,6 +207,7 @@ int sysctl_icmp_ignore_bogus_error_responses;
207 207
208int sysctl_icmp_ratelimit = 1 * HZ; 208int sysctl_icmp_ratelimit = 1 * HZ;
209int sysctl_icmp_ratemask = 0x1818; 209int sysctl_icmp_ratemask = 0x1818;
210int sysctl_icmp_errors_use_inbound_ifaddr;
210 211
211/* 212/*
212 * ICMP control array. This specifies what to do with each ICMP. 213 * ICMP control array. This specifies what to do with each ICMP.
@@ -511,8 +512,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
511 */ 512 */
512 513
513 saddr = iph->daddr; 514 saddr = iph->daddr;
514 if (!(rt->rt_flags & RTCF_LOCAL)) 515 if (!(rt->rt_flags & RTCF_LOCAL)) {
515 saddr = 0; 516 if (sysctl_icmp_errors_use_inbound_ifaddr)
517 saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
518 else
519 saddr = 0;
520 }
516 521
517 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | 522 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
518 IPTOS_PREC_INTERNETCONTROL) : 523 IPTOS_PREC_INTERNETCONTROL) :
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7c..af2ec88bbb2f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
184 raw_rcv(last, skb2); 184 raw_rcv(last, skb2);
185 } 185 }
186 last = sk; 186 last = sk;
187 nf_reset(skb);
187 } 188 }
188 } 189 }
189 190
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{ 201{
201 int ihl = skb->nh.iph->ihl*4; 202 int ihl = skb->nh.iph->ihl*4;
202 203
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl); 204 __skb_pull(skb, ihl);
208 205
209 /* Free reference early: we don't need it any more, and it may 206 /* Free reference early: we don't need it any more, and it may
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d65..ee07aec215a0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 nf_reset(newskb); 110 nf_reset(newskb);
115 netif_rx(newskb); 111 netif_rx(newskb);
116 return 0; 112 return 0;
@@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
192 skb = skb2; 188 skb = skb2;
193 } 189 }
194 190
195#ifdef CONFIG_NETFILTER_DEBUG
196 nf_debug_ip_finish_output2(skb);
197#endif /*CONFIG_NETFILTER_DEBUG*/
198
199 nf_reset(skb); 191 nf_reset(skb);
200 192
201 if (hh) { 193 if (hh) {
@@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
415 to->nf_bridge = from->nf_bridge; 407 to->nf_bridge = from->nf_bridge;
416 nf_bridge_get(to->nf_bridge); 408 nf_bridge_get(to->nf_bridge);
417#endif 409#endif
418#ifdef CONFIG_NETFILTER_DEBUG
419 to->nf_debug = from->nf_debug;
420#endif
421#endif 410#endif
422} 411}
423 412
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47012b93cad2..f8b172f89811 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -360,14 +360,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
360 err = copied; 360 err = copied;
361 361
362 /* Reset and regenerate socket error */ 362 /* Reset and regenerate socket error */
363 spin_lock_irq(&sk->sk_error_queue.lock); 363 spin_lock_bh(&sk->sk_error_queue.lock);
364 sk->sk_err = 0; 364 sk->sk_err = 0;
365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { 365 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; 366 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
367 spin_unlock_irq(&sk->sk_error_queue.lock); 367 spin_unlock_bh(&sk->sk_error_queue.lock);
368 sk->sk_error_report(sk); 368 sk->sk_error_report(sk);
369 } else 369 } else
370 spin_unlock_irq(&sk->sk_error_queue.lock); 370 spin_unlock_bh(&sk->sk_error_queue.lock);
371 371
372out_free_skb: 372out_free_skb:
373 kfree_skb(skb); 373 kfree_skb(skb);
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c5263b99..2065944fd9e5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
236 t->props.mode = 1; 236 t->props.mode = 1;
237 t->props.saddr.a4 = x->props.saddr.a4; 237 t->props.saddr.a4 = x->props.saddr.a4;
238 t->props.flags = x->props.flags; 238 t->props.flags = x->props.flags;
239 239
240 t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); 240 if (xfrm_init_state(t))
241 if (t->type == NULL)
242 goto error;
243
244 if (t->type->init_state(t, NULL))
245 goto error; 241 goto error;
246 242
247 t->km.state = XFRM_STATE_VALID;
248 atomic_set(&t->tunnel_users, 1); 243 atomic_set(&t->tunnel_users, 1);
249out: 244out:
250 return t; 245 return t;
@@ -422,7 +417,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
422 kfree(ipcd); 417 kfree(ipcd);
423} 418}
424 419
425static int ipcomp_init_state(struct xfrm_state *x, void *args) 420static int ipcomp_init_state(struct xfrm_state *x)
426{ 421{
427 int err; 422 int err;
428 struct ipcomp_data *ipcd; 423 struct ipcomp_data *ipcd;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62a..e4f809a93f47 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb)
1350 */ 1350 */
1351 read_lock(&mrt_lock); 1351 read_lock(&mrt_lock);
1352 if (mroute_socket) { 1352 if (mroute_socket) {
1353 nf_reset(skb);
1353 raw_rcv(mroute_socket, skb); 1354 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock); 1355 read_unlock(&mrt_lock);
1355 return 0; 1356 return 0;
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057f..a8512a3fd08a 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index cf2e6bcf7973..c9cf8726051d 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -247,3 +248,4 @@ static void __exit drr_exit(void)
247 248
248module_init(drr_init); 249module_init(drr_init);
249module_exit(drr_exit); 250module_exit(drr_exit);
251MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
index 805a16e47de5..5249dbe7c559 100644
--- a/net/ipv4/multipath_random.c
+++ b/net/ipv4/multipath_random.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -126,3 +127,4 @@ static void __exit random_exit(void)
126 127
127module_init(random_init); 128module_init(random_init);
128module_exit(random_exit); 129module_exit(random_exit);
130MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 061b6b253982..b6cd2870478f 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -93,3 +94,4 @@ static void __exit rr_exit(void)
93 94
94module_init(rr_init); 95module_init(rr_init);
95module_exit(rr_exit); 96module_exit(rr_exit);
97MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index c3d2ca1a6781..bd7d75b6abe0 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -31,6 +31,7 @@
31#include <linux/igmp.h> 31#include <linux/igmp.h>
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/module.h>
34#include <linux/mroute.h> 35#include <linux/mroute.h>
35#include <linux/init.h> 36#include <linux/init.h>
36#include <net/ip.h> 37#include <net/ip.h>
@@ -342,3 +343,4 @@ static void __exit wrandom_exit(void)
342 343
343module_init(wrandom_init); 344module_init(wrandom_init);
344module_exit(wrandom_exit); 345module_exit(wrandom_exit);
346MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0a..fa1634256680 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
60 60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h> 63#include <linux/netfilter_ipv4/listhelp.h>
65 64
66struct arpt_table_info { 65struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd062605..a78a320eee08 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
26#include <net/checksum.h> 26#include <net/checksum.h>
27#include <net/udp.h> 27#include <net/udp.h>
28 28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 29#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 30#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32 31
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42 41
43/* This is slow, but it's simple. --RR */ 42/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536]; 43static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock); 44static DEFINE_SPINLOCK(amanda_buffer_lock);
46 45
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo, 47 enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
76 return NF_ACCEPT; 75 return NF_ACCEPT;
77 } 76 }
78 77
79 LOCK_BH(&amanda_buffer_lock); 78 spin_lock_bh(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 79 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer; 80 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff; 81 data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb,
134 } 133 }
135 134
136out: 135out:
137 UNLOCK_BH(&amanda_buffer_lock); 136 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 137 return ret;
139} 138}
140 139
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e824622977..4b78ebeb6635 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40 40
41/* This rwlock protects the main hash table, protocol/helper/expected 41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/ 42 registrations, conntrack timers*/
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 43#define ASSERT_READ_LOCK(x)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 44#define ASSERT_WRITE_LOCK(x)
45 45
46#include <linux/netfilter_ipv4/ip_conntrack.h> 46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
57#define DEBUGP(format, args...) 57#define DEBUGP(format, args...)
58#endif 58#endif
59 59
60DECLARE_RWLOCK(ip_conntrack_lock); 60DEFINE_RWLOCK(ip_conntrack_lock);
61 61
62/* ip_conntrack_standalone needs this */ 62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0); 63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp)
147 147
148static void unlink_expect(struct ip_conntrack_expect *exp) 148static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 150 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
151 list_del(&exp->list); 151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--; 153 exp->master->expecting--;
@@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect)
157{ 157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect; 158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159 159
160 WRITE_LOCK(&ip_conntrack_lock); 160 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock); 162 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 163 destroy_expect(exp);
164} 164}
165 165
@@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct)
209 unsigned int ho, hr; 209 unsigned int ho, hr;
210 210
211 DEBUGP("clean_from_lists(%p)\n", ct); 211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 212 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
213 213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
240 if (ip_conntrack_destroyed) 240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct); 241 ip_conntrack_destroyed(ct);
242 242
243 WRITE_LOCK(&ip_conntrack_lock); 243 write_lock_bh(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists, 244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet, 245 * except TFTP can create an expectation on the first packet,
246 * before connection is in the list, so we need to clean here, 246 * before connection is in the list, so we need to clean here,
@@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
254 } 254 }
255 255
256 CONNTRACK_STAT_INC(delete); 256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock); 257 write_unlock_bh(&ip_conntrack_lock);
258 258
259 if (ct->master) 259 if (ct->master)
260 ip_conntrack_put(ct->master); 260 ip_conntrack_put(ct->master);
@@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
268{ 268{
269 struct ip_conntrack *ct = (void *)ul_conntrack; 269 struct ip_conntrack *ct = (void *)ul_conntrack;
270 270
271 WRITE_LOCK(&ip_conntrack_lock); 271 write_lock_bh(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path. 272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */ 273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list); 274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct); 275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock); 276 write_unlock_bh(&ip_conntrack_lock);
277 ip_conntrack_put(ct); 277 ip_conntrack_put(ct);
278} 278}
279 279
@@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple, 282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack) 283 const struct ip_conntrack *ignored_conntrack)
284{ 284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 285 ASSERT_READ_LOCK(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack 286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple); 287 && ip_ct_tuple_equal(tuple, &i->tuple);
288} 288}
@@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
294 struct ip_conntrack_tuple_hash *h; 294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple); 295 unsigned int hash = hash_conntrack(tuple);
296 296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 297 ASSERT_READ_LOCK(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) { 298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { 299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found); 300 CONNTRACK_STAT_INC(found);
@@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
313{ 313{
314 struct ip_conntrack_tuple_hash *h; 314 struct ip_conntrack_tuple_hash *h;
315 315
316 READ_LOCK(&ip_conntrack_lock); 316 read_lock_bh(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack); 317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h) 318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock); 320 read_unlock_bh(&ip_conntrack_lock);
321 321
322 return h; 322 return h;
323} 323}
@@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
352 IP_NF_ASSERT(!is_confirmed(ct)); 352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct); 353 DEBUGP("Confirming conntrack %p\n", ct);
354 354
355 WRITE_LOCK(&ip_conntrack_lock); 355 write_lock_bh(&ip_conntrack_lock);
356 356
357 /* See if there's one in the list already, including reverse: 357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're 358 NAT could have grabbed it without realizing, since we're
@@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
380 atomic_inc(&ct->ct_general.use); 380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status); 381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert); 382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock); 383 write_unlock_bh(&ip_conntrack_lock);
384 return NF_ACCEPT; 384 return NF_ACCEPT;
385 } 385 }
386 386
387 CONNTRACK_STAT_INC(insert_failed); 387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock); 388 write_unlock_bh(&ip_conntrack_lock);
389 389
390 return NF_DROP; 390 return NF_DROP;
391} 391}
@@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
398{ 398{
399 struct ip_conntrack_tuple_hash *h; 399 struct ip_conntrack_tuple_hash *h;
400 400
401 READ_LOCK(&ip_conntrack_lock); 401 read_lock_bh(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack); 402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock); 403 read_unlock_bh(&ip_conntrack_lock);
404 404
405 return h != NULL; 405 return h != NULL;
406} 406}
@@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain)
419 struct ip_conntrack *ct = NULL; 419 struct ip_conntrack *ct = NULL;
420 int dropped = 0; 420 int dropped = 0;
421 421
422 READ_LOCK(&ip_conntrack_lock); 422 read_lock_bh(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); 423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) { 424 if (h) {
425 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use); 426 atomic_inc(&ct->ct_general.use);
427 } 427 }
428 READ_UNLOCK(&ip_conntrack_lock); 428 read_unlock_bh(&ip_conntrack_lock);
429 429
430 if (!ct) 430 if (!ct)
431 return dropped; 431 return dropped;
@@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
508 conntrack->timeout.data = (unsigned long)conntrack; 508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout; 509 conntrack->timeout.function = death_by_timeout;
510 510
511 WRITE_LOCK(&ip_conntrack_lock); 511 write_lock_bh(&ip_conntrack_lock);
512 exp = find_expectation(tuple); 512 exp = find_expectation(tuple);
513 513
514 if (exp) { 514 if (exp) {
@@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); 532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533 533
534 atomic_inc(&ip_conntrack_count); 534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock); 535 write_unlock_bh(&ip_conntrack_lock);
536 536
537 if (exp) { 537 if (exp) {
538 if (exp->expectfn) 538 if (exp->expectfn)
@@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{ 723{
724 struct ip_conntrack_expect *i; 724 struct ip_conntrack_expect *i;
725 725
726 WRITE_LOCK(&ip_conntrack_lock); 726 write_lock_bh(&ip_conntrack_lock);
727 /* choose the the oldest expectation to evict */ 727 /* choose the the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { 728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock); 731 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 732 destroy_expect(i);
733 return; 733 return;
734 } 734 }
735 } 735 }
736 WRITE_UNLOCK(&ip_conntrack_lock); 736 write_unlock_bh(&ip_conntrack_lock);
737} 737}
738 738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
@@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
760 exp->master->expecting++; 760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 761 list_add(&exp->list, &ip_conntrack_expect_list);
762 762
763 if (exp->master->helper->timeout) { 763 init_timer(&exp->timeout);
764 init_timer(&exp->timeout); 764 exp->timeout.data = (unsigned long)exp;
765 exp->timeout.data = (unsigned long)exp; 765 exp->timeout.function = expectation_timed_out;
766 exp->timeout.function = expectation_timed_out; 766 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
767 exp->timeout.expires 767 add_timer(&exp->timeout);
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772 768
773 CONNTRACK_STAT_INC(expect_create); 769 CONNTRACK_STAT_INC(expect_create);
774} 770}
@@ -808,7 +804,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); 804 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); 805 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810 806
811 WRITE_LOCK(&ip_conntrack_lock); 807 write_lock_bh(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) { 808 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) { 809 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */ 810 /* Refresh timer: if it's dying, ignore.. */
@@ -832,7 +828,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
832 ip_conntrack_expect_insert(expect); 828 ip_conntrack_expect_insert(expect);
833 ret = 0; 829 ret = 0;
834out: 830out:
835 WRITE_UNLOCK(&ip_conntrack_lock); 831 write_unlock_bh(&ip_conntrack_lock);
836 return ret; 832 return ret;
837} 833}
838 834
@@ -841,7 +837,7 @@ out:
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, 837void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply) 838 const struct ip_conntrack_tuple *newreply)
843{ 839{
844 WRITE_LOCK(&ip_conntrack_lock); 840 write_lock_bh(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */ 841 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack)); 842 IP_NF_ASSERT(!is_confirmed(conntrack));
847 843
@@ -851,15 +847,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 847 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0) 848 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply); 849 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock); 850 write_unlock_bh(&ip_conntrack_lock);
855} 851}
856 852
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me) 853int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{ 854{
859 BUG_ON(me->timeout == 0); 855 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock); 856 write_lock_bh(&ip_conntrack_lock);
861 list_prepend(&helpers, me); 857 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock); 858 write_unlock_bh(&ip_conntrack_lock);
863 859
864 return 0; 860 return 0;
865} 861}
@@ -878,7 +874,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
878 struct ip_conntrack_expect *exp, *tmp; 874 struct ip_conntrack_expect *exp, *tmp;
879 875
880 /* Need write lock here, to delete helper. */ 876 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock); 877 write_lock_bh(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me); 878 LIST_DELETE(&helpers, me);
883 879
884 /* Get rid of expectations */ 880 /* Get rid of expectations */
@@ -893,7 +889,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
893 for (i = 0; i < ip_conntrack_htable_size; i++) 889 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp, 890 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me); 891 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock); 892 write_unlock_bh(&ip_conntrack_lock);
897 893
898 /* Someone could be still looking at the helper in a bh. */ 894 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net(); 895 synchronize_net();
@@ -925,14 +921,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
925 ct->timeout.expires = extra_jiffies; 921 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb); 922 ct_add_counters(ct, ctinfo, skb);
927 } else { 923 } else {
928 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */ 925 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) { 926 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies; 927 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout); 928 add_timer(&ct->timeout);
933 } 929 }
934 ct_add_counters(ct, ctinfo, skb); 930 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
936 } 932 }
937} 933}
938 934
@@ -940,10 +936,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
940struct sk_buff * 936struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) 937ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{ 938{
943#ifdef CONFIG_NETFILTER_DEBUG
944 unsigned int olddebug = skb->nf_debug;
945#endif
946
947 skb_orphan(skb); 939 skb_orphan(skb);
948 940
949 local_bh_disable(); 941 local_bh_disable();
@@ -953,12 +945,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
953 if (skb) { 945 if (skb) {
954 ip_send_check(skb->nh.iph); 946 ip_send_check(skb->nh.iph);
955 skb->nfcache |= NFC_ALTERED; 947 skb->nfcache |= NFC_ALTERED;
956#ifdef CONFIG_NETFILTER_DEBUG
957 /* Packet path as if nothing had happened. */
958 skb->nf_debug = olddebug;
959#endif
960 } 948 }
961
962 return skb; 949 return skb;
963} 950}
964 951
@@ -997,7 +984,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
997{ 984{
998 struct ip_conntrack_tuple_hash *h = NULL; 985 struct ip_conntrack_tuple_hash *h = NULL;
999 986
1000 WRITE_LOCK(&ip_conntrack_lock); 987 write_lock_bh(&ip_conntrack_lock);
1001 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { 988 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1002 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, 989 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1003 struct ip_conntrack_tuple_hash *, iter, data); 990 struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +996,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1009 struct ip_conntrack_tuple_hash *, iter, data); 996 struct ip_conntrack_tuple_hash *, iter, data);
1010 if (h) 997 if (h)
1011 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 998 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1012 WRITE_UNLOCK(&ip_conntrack_lock); 999 write_unlock_bh(&ip_conntrack_lock);
1013 1000
1014 return h; 1001 return h;
1015} 1002}
@@ -1201,14 +1188,14 @@ int __init ip_conntrack_init(void)
1201 } 1188 }
1202 1189
1203 /* Don't NEED lock here, but good form anyway. */ 1190 /* Don't NEED lock here, but good form anyway. */
1204 WRITE_LOCK(&ip_conntrack_lock); 1191 write_lock_bh(&ip_conntrack_lock);
1205 for (i = 0; i < MAX_IP_CT_PROTO; i++) 1192 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1206 ip_ct_protos[i] = &ip_conntrack_generic_protocol; 1193 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1207 /* Sew in builtin protocols. */ 1194 /* Sew in builtin protocols. */
1208 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; 1195 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1209 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; 1196 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1210 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; 1197 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1211 WRITE_UNLOCK(&ip_conntrack_lock); 1198 write_unlock_bh(&ip_conntrack_lock);
1212 1199
1213 for (i = 0; i < ip_conntrack_htable_size; i++) 1200 for (i = 0; i < ip_conntrack_htable_size; i++)
1214 INIT_LIST_HEAD(&ip_conntrack_hash[i]); 1201 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa788..fea6dd2a00b6 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
16#include <net/checksum.h> 16#include <net/checksum.h>
17#include <net/tcp.h> 17#include <net/tcp.h>
18 18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> 20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h> 21#include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
28/* This is slow, but it's simple. --RR */ 27/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536]; 28static char ftp_buffer[65536];
30 29
31static DECLARE_LOCK(ip_ftp_lock); 30static DEFINE_SPINLOCK(ip_ftp_lock);
32 31
33#define MAX_PORTS 8 32#define MAX_PORTS 8
34static int ports[MAX_PORTS]; 33static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
319 } 318 }
320 datalen = (*pskb)->len - dataoff; 319 datalen = (*pskb)->len - dataoff;
321 320
322 LOCK_BH(&ip_ftp_lock); 321 spin_lock_bh(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff, 322 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer); 323 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL); 324 BUG_ON(fb_ptr == NULL);
@@ -442,7 +441,7 @@ out_update_nl:
442 if (ends_in_nl) 441 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir); 442 update_nl_seq(seq, ct_ftp_info,dir);
444 out: 443 out:
445 UNLOCK_BH(&ip_ftp_lock); 444 spin_unlock_bh(&ip_ftp_lock);
446 return ret; 445 return ret;
447} 446}
448 447
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6ee..cd98772cc332 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
29#include <net/checksum.h> 29#include <net/checksum.h>
30#include <net/tcp.h> 30#include <net/tcp.h>
31 31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h> 33#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536]; 42static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
45 44
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo, 46 enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
141 if (dataoff >= (*pskb)->len) 140 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT; 141 return NF_ACCEPT;
143 142
144 LOCK_BH(&irc_buffer_lock); 143 spin_lock_bh(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff, 144 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer); 145 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL); 146 BUG_ON(ib_ptr == NULL);
@@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb,
237 } /* while data < ... */ 236 } /* while data < ... */
238 237
239 out: 238 out:
240 UNLOCK_BH(&irc_buffer_lock); 239 spin_unlock_bh(&irc_buffer_lock);
241 return ret; 240 return ret;
242} 241}
243 242
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860ff..31d75390bf12 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/netfilter_ipv4/ip_conntrack.h> 27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30 29
31#if 0 30#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) 31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
35#endif 34#endif
36 35
37/* Protects conntrack->proto.sctp */ 36/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock); 37static DEFINE_RWLOCK(sctp_lock);
39 38
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR 40 closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
199 DEBUGP(__FUNCTION__); 198 DEBUGP(__FUNCTION__);
200 DEBUGP("\n"); 199 DEBUGP("\n");
201 200
202 READ_LOCK(&sctp_lock); 201 read_lock_bh(&sctp_lock);
203 state = conntrack->proto.sctp.state; 202 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock); 203 read_unlock_bh(&sctp_lock);
205 204
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]); 205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207} 206}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
343 342
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; 343 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) { 344 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock); 345 write_lock_bh(&sctp_lock);
347 346
348 /* Special cases of Verification tag check (Sec 8.5.1) */ 347 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) { 348 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */ 349 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) { 350 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock); 351 write_unlock_bh(&sctp_lock);
353 return -1; 352 return -1;
354 } 353 }
355 } else if (sch->type == SCTP_CID_ABORT) { 354 } else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) 356 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag 357 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) { 358 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock); 359 write_unlock_bh(&sctp_lock);
361 return -1; 360 return -1;
362 } 361 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { 362 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
366 && !(sh->vtag == conntrack->proto.sctp.vtag 365 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)] 366 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) { 367 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock); 368 write_unlock_bh(&sctp_lock);
370 return -1; 369 return -1;
371 } 370 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) { 371 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */ 372 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { 373 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock); 374 write_unlock_bh(&sctp_lock);
376 return -1; 375 return -1;
377 } 376 }
378 } 377 }
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
384 if (newconntrack == SCTP_CONNTRACK_MAX) { 383 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", 384 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate); 385 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock); 386 write_unlock_bh(&sctp_lock);
388 return -1; 387 return -1;
389 } 388 }
390 389
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), 395 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr); 396 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) { 397 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock); 398 write_unlock_bh(&sctp_lock);
400 return -1; 399 return -1;
401 } 400 }
402 DEBUGP("Setting vtag %x for dir %d\n", 401 DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
405 } 404 }
406 405
407 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock); 407 write_unlock_bh(&sctp_lock);
409 } 408 }
410 409
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); 410 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b4..809dfed766d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h> 37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40 39
41#if 0 40#if 0
42#define DEBUGP printk 41#define DEBUGP printk
@@ -46,7 +45,7 @@
46#endif 45#endif
47 46
48/* Protects conntrack->proto.tcp */ 47/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock); 48static DEFINE_RWLOCK(tcp_lock);
50 49
51/* "Be conservative in what you do, 50/* "Be conservative in what you do,
52 be liberal in what you accept from others." 51 be liberal in what you accept from others."
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
330{ 329{
331 enum tcp_conntrack state; 330 enum tcp_conntrack state;
332 331
333 READ_LOCK(&tcp_lock); 332 read_lock_bh(&tcp_lock);
334 state = conntrack->proto.tcp.state; 333 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock); 334 read_unlock_bh(&tcp_lock);
336 335
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338} 337}
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
738 737
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); 738 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740 739
741 WRITE_LOCK(&tcp_lock); 740 write_lock_bh(&tcp_lock);
742 /* 741 /*
743 * We have to worry for the ack in the reply packet only... 742 * We have to worry for the ack in the reply packet only...
744 */ 743 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end)) 744 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end; 745 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end; 746 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock); 747 write_unlock_bh(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " 748 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 749 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin, 750 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
857 sizeof(_tcph), &_tcph); 856 sizeof(_tcph), &_tcph);
858 BUG_ON(th == NULL); 857 BUG_ON(th == NULL);
859 858
860 WRITE_LOCK(&tcp_lock); 859 write_lock_bh(&tcp_lock);
861 old_state = conntrack->proto.tcp.state; 860 old_state = conntrack->proto.tcp.state;
862 dir = CTINFO2DIR(ctinfo); 861 dir = CTINFO2DIR(ctinfo);
863 index = get_conntrack_index(th); 862 index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
879 * that the client cannot but retransmit its SYN and 878 * that the client cannot but retransmit its SYN and
880 * thus initiate a clean new session. 879 * thus initiate a clean new session.
881 */ 880 */
882 WRITE_UNLOCK(&tcp_lock); 881 write_unlock_bh(&tcp_lock);
883 if (LOG_INVALID(IPPROTO_TCP)) 882 if (LOG_INVALID(IPPROTO_TCP))
884 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 883 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
885 "ip_ct_tcp: killing out of sync session "); 884 "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
894 conntrack->proto.tcp.last_end = 893 conntrack->proto.tcp.last_end =
895 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); 894 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
896 895
897 WRITE_UNLOCK(&tcp_lock); 896 write_unlock_bh(&tcp_lock);
898 if (LOG_INVALID(IPPROTO_TCP)) 897 if (LOG_INVALID(IPPROTO_TCP))
899 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 898 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
900 "ip_ct_tcp: invalid packet ignored "); 899 "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
904 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 903 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
905 dir, get_conntrack_index(th), 904 dir, get_conntrack_index(th),
906 old_state); 905 old_state);
907 WRITE_UNLOCK(&tcp_lock); 906 write_unlock_bh(&tcp_lock);
908 if (LOG_INVALID(IPPROTO_TCP)) 907 if (LOG_INVALID(IPPROTO_TCP))
909 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 908 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
910 "ip_ct_tcp: invalid state "); 909 "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
918 conntrack->proto.tcp.seen[dir].td_end)) { 917 conntrack->proto.tcp.seen[dir].td_end)) {
919 /* Attempt to reopen a closed connection. 918 /* Attempt to reopen a closed connection.
920 * Delete this connection and look up again. */ 919 * Delete this connection and look up again. */
921 WRITE_UNLOCK(&tcp_lock); 920 write_unlock_bh(&tcp_lock);
922 if (del_timer(&conntrack->timeout)) 921 if (del_timer(&conntrack->timeout))
923 conntrack->timeout.function((unsigned long) 922 conntrack->timeout.function((unsigned long)
924 conntrack); 923 conntrack);
925 return -NF_REPEAT; 924 return -NF_REPEAT;
926 } else { 925 } else {
927 WRITE_UNLOCK(&tcp_lock); 926 write_unlock_bh(&tcp_lock);
928 if (LOG_INVALID(IPPROTO_TCP)) 927 if (LOG_INVALID(IPPROTO_TCP))
929 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 928 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
930 "ip_ct_tcp: invalid SYN"); 929 "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
949 948
950 if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 949 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
951 skb, iph, th)) { 950 skb, iph, th)) {
952 WRITE_UNLOCK(&tcp_lock); 951 write_unlock_bh(&tcp_lock);
953 return -NF_ACCEPT; 952 return -NF_ACCEPT;
954 } 953 }
955 in_window: 954 in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
972 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans 971 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
973 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans 972 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
974 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
975 WRITE_UNLOCK(&tcp_lock); 974 write_unlock_bh(&tcp_lock);
976 975
977 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
978 /* If only reply is a RST, we can consider ourselves not to 977 /* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a224623..8c1eaba098d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * and moreover root might send raw packets. 120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */ 121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING 122 if (hooknum == NF_IP_PRE_ROUTING
123 && skb->ip_summed != CHECKSUM_UNNECESSARY
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, 124 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum 125 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 46ca45f74d85..42dc95102873 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
119 119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{ 121{
122 READ_LOCK(&ip_conntrack_lock); 122 read_lock_bh(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos); 123 return ct_get_idx(seq, *pos);
124} 124}
125 125
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
131 131
132static void ct_seq_stop(struct seq_file *s, void *v) 132static void ct_seq_stop(struct seq_file *s, void *v)
133{ 133{
134 READ_UNLOCK(&ip_conntrack_lock); 134 read_unlock_bh(&ip_conntrack_lock);
135} 135}
136 136
137static int ct_seq_show(struct seq_file *s, void *v) 137static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); 140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto; 141 struct ip_conntrack_protocol *proto;
142 142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 143 ASSERT_READ_LOCK(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack); 144 IP_NF_ASSERT(conntrack);
145 145
146 /* we only want to print DIR_ORIGINAL */ 146 /* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
239 239
240 /* strange seq_file api calls stop even if we fail, 240 /* strange seq_file api calls stop even if we fail,
241 * thus we need to grab lock since stop unlocks */ 241 * thus we need to grab lock since stop unlocks */
242 READ_LOCK(&ip_conntrack_lock); 242 read_lock_bh(&ip_conntrack_lock);
243 243
244 if (list_empty(e)) 244 if (list_empty(e))
245 return NULL; 245 return NULL;
@@ -256,6 +256,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
256{ 256{
257 struct list_head *e = v; 257 struct list_head *e = v;
258 258
259 ++*pos;
259 e = e->next; 260 e = e->next;
260 261
261 if (e == &ip_conntrack_expect_list) 262 if (e == &ip_conntrack_expect_list)
@@ -266,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
266 267
267static void exp_seq_stop(struct seq_file *s, void *v) 268static void exp_seq_stop(struct seq_file *s, void *v)
268{ 269{
269 READ_UNLOCK(&ip_conntrack_lock); 270 read_unlock_bh(&ip_conntrack_lock);
270} 271}
271 272
272static int exp_seq_show(struct seq_file *s, void *v) 273static int exp_seq_show(struct seq_file *s, void *v)
@@ -920,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
920{ 921{
921 int ret = 0; 922 int ret = 0;
922 923
923 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
924 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { 925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
925 ret = -EBUSY; 926 ret = -EBUSY;
926 goto out; 927 goto out;
927 } 928 }
928 ip_ct_protos[proto->proto] = proto; 929 ip_ct_protos[proto->proto] = proto;
929 out: 930 out:
930 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
931 return ret; 932 return ret;
932} 933}
933 934
934void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) 935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
935{ 936{
936 WRITE_LOCK(&ip_conntrack_lock); 937 write_lock_bh(&ip_conntrack_lock);
937 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; 938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
938 WRITE_UNLOCK(&ip_conntrack_lock); 939 write_unlock_bh(&ip_conntrack_lock);
939 940
940 /* Somebody could be still looking at the proto in bh. */ 941 /* Somebody could be still looking at the proto in bh. */
941 synchronize_net(); 942 synchronize_net();
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0dd..739b6dde1c82 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24 24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 25#define ASSERT_READ_LOCK(x)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 26#define ASSERT_WRITE_LOCK(x)
27 27
28#include <linux/netfilter_ipv4/ip_conntrack.h> 28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h> 29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
41#define DEBUGP(format, args...) 41#define DEBUGP(format, args...)
42#endif 42#endif
43 43
44DECLARE_RWLOCK(ip_nat_lock); 44DEFINE_RWLOCK(ip_nat_lock);
45 45
46/* Calculated at init based on memory size */ 46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
65 if (!(conn->status & IPS_NAT_DONE_MASK)) 65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return; 66 return;
67 67
68 WRITE_LOCK(&ip_nat_lock); 68 write_lock_bh(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource); 69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock); 70 write_unlock_bh(&ip_nat_lock);
71} 71}
72 72
73/* We do checksum mangling, so if they were wrong before they're still 73/* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
142 unsigned int h = hash_by_src(tuple); 142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct; 143 struct ip_conntrack *ct;
144 144
145 READ_LOCK(&ip_nat_lock); 145 read_lock_bh(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) { 146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) { 147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */ 148 /* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
151 result->dst = tuple->dst; 151 result->dst = tuple->dst;
152 152
153 if (in_range(result, range)) { 153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock); 154 read_unlock_bh(&ip_nat_lock);
155 return 1; 155 return 1;
156 } 156 }
157 } 157 }
158 } 158 }
159 READ_UNLOCK(&ip_nat_lock); 159 read_unlock_bh(&ip_nat_lock);
160 return 0; 160 return 0;
161} 161}
162 162
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
297 unsigned int srchash 297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple); 299 .tuple);
300 WRITE_LOCK(&ip_nat_lock); 300 write_lock_bh(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]); 301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock); 302 write_unlock_bh(&ip_nat_lock);
303 } 303 }
304 304
305 /* It's done. */ 305 /* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 476
477 WRITE_LOCK(&ip_nat_lock); 477 write_lock_bh(&ip_nat_lock);
478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { 478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
479 ret = -EBUSY; 479 ret = -EBUSY;
480 goto out; 480 goto out;
481 } 481 }
482 ip_nat_protos[proto->protonum] = proto; 482 ip_nat_protos[proto->protonum] = proto;
483 out: 483 out:
484 WRITE_UNLOCK(&ip_nat_lock); 484 write_unlock_bh(&ip_nat_lock);
485 return ret; 485 return ret;
486} 486}
487 487
488/* Noone stores the protocol anywhere; simply delete it. */ 488/* Noone stores the protocol anywhere; simply delete it. */
489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) 489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
490{ 490{
491 WRITE_LOCK(&ip_nat_lock); 491 write_lock_bh(&ip_nat_lock);
492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; 492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
493 WRITE_UNLOCK(&ip_nat_lock); 493 write_unlock_bh(&ip_nat_lock);
494 494
495 /* Someone could be still looking at the proto in a bh. */ 495 /* Someone could be still looking at the proto in a bh. */
496 synchronize_net(); 496 synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
509 return -ENOMEM; 509 return -ENOMEM;
510 510
511 /* Sew in builtin protocols. */ 511 /* Sew in builtin protocols. */
512 WRITE_LOCK(&ip_nat_lock); 512 write_lock_bh(&ip_nat_lock);
513 for (i = 0; i < MAX_IP_NAT_PROTO; i++) 513 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
514 ip_nat_protos[i] = &ip_nat_unknown_protocol; 514 ip_nat_protos[i] = &ip_nat_unknown_protocol;
515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; 515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; 516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; 517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
518 WRITE_UNLOCK(&ip_nat_lock); 518 write_unlock_bh(&ip_nat_lock);
519 519
520 for (i = 0; i < ip_nat_htable_size; i++) { 520 for (i = 0; i < ip_nat_htable_size; i++) {
521 INIT_LIST_HEAD(&bysource[i]); 521 INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c01..158f34f32c04 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
28#include <net/tcp.h> 28#include <net/tcp.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
47#define DUMP_OFFSET(x) 47#define DUMP_OFFSET(x)
48#endif 48#endif
49 49
50static DECLARE_LOCK(ip_nat_seqofs_lock); 50static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
51 51
52/* Setup TCP sequence correction given this change at this sequence */ 52/* Setup TCP sequence correction given this change at this sequence */
53static inline void 53static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
70 DEBUGP("ip_nat_resize_packet: Seq_offset before: "); 70 DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
71 DUMP_OFFSET(this_way); 71 DUMP_OFFSET(this_way);
72 72
73 LOCK_BH(&ip_nat_seqofs_lock); 73 spin_lock_bh(&ip_nat_seqofs_lock);
74 74
75 /* SYN adjust. If it's uninitialized, or this is after last 75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one 76 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
82 this_way->offset_before = this_way->offset_after; 82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff; 83 this_way->offset_after += sizediff;
84 } 84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock); 85 spin_unlock_bh(&ip_nat_seqofs_lock);
86 86
87 DEBUGP("ip_nat_resize_packet: Seq_offset after: "); 87 DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
88 DUMP_OFFSET(this_way); 88 DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
142 /* Transfer socket to new skb. */ 142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk) 143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk); 144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb); 145 kfree_skb(*pskb);
149 *pskb = nskb; 146 *pskb = nskb;
150 return 1; 147 return 1;
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a24..60d70fa41a15 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
19#include <net/route.h> 19#include <net/route.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21 21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 22#define ASSERT_READ_LOCK(x)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 23#define ASSERT_WRITE_LOCK(x)
24 24
25#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h> 26#include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b33..bc59d0d6e89e 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33 33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 34#define ASSERT_READ_LOCK(x)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 35#define ASSERT_WRITE_LOCK(x)
36 36
37#include <linux/netfilter_ipv4/ip_nat.h> 37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h> 38#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -373,7 +373,6 @@ static int init_or_cleanup(int init)
373 cleanup_rule_init: 373 cleanup_rule_init:
374 ip_nat_rule_cleanup(); 374 ip_nat_rule_cleanup();
375 cleanup_nothing: 375 cleanup_nothing:
376 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
377 return ret; 376 return ret;
378} 377}
379 378
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b8496..c88dfcd38c56 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
67/* Must have mutex */ 67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h> 70#include <linux/netfilter_ipv4/listhelp.h>
72 71
73#if 0 72#if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc73..9cde8c61f525 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,6 @@
29#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33 32
34#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.6"
35 34
@@ -41,6 +40,8 @@
41#define DEBUGP 40#define DEBUGP
42#endif 41#endif
43 42
43#define ASSERT_READ_LOCK(x)
44
44MODULE_LICENSE("GPL"); 45MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); 46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP"); 47MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
67 68
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable 69/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structurses (num_local_nodes, local_nodes[]) */ 70 * data within all structurses (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock); 71static DEFINE_RWLOCK(clusterip_lock);
71 72
72#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops; 74static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
82static inline void 83static inline void
83clusterip_config_put(struct clusterip_config *c) { 84clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) { 85 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock); 86 write_lock_bh(&clusterip_lock);
86 list_del(&c->list); 87 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock); 88 write_unlock_bh(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 89 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev); 90 dev_put(c->dev);
90 kfree(c); 91 kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
97{ 98{
98 struct list_head *pos; 99 struct list_head *pos;
99 100
100 MUST_BE_READ_LOCKED(&clusterip_lock); 101 ASSERT_READ_LOCK(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) { 102 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos, 103 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list); 104 struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
114{ 115{
115 struct clusterip_config *c; 116 struct clusterip_config *c;
116 117
117 READ_LOCK(&clusterip_lock); 118 read_lock_bh(&clusterip_lock);
118 c = __clusterip_config_find(clusterip); 119 c = __clusterip_config_find(clusterip);
119 if (!c) { 120 if (!c) {
120 READ_UNLOCK(&clusterip_lock); 121 read_unlock_bh(&clusterip_lock);
121 return NULL; 122 return NULL;
122 } 123 }
123 atomic_inc(&c->refcount); 124 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock); 125 read_unlock_bh(&clusterip_lock);
125 126
126 return c; 127 return c;
127} 128}
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
160 c->pde->data = c; 161 c->pde->data = c;
161#endif 162#endif
162 163
163 WRITE_LOCK(&clusterip_lock); 164 write_lock_bh(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs); 165 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock); 166 write_unlock_bh(&clusterip_lock);
166 167
167 return c; 168 return c;
168} 169}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{ 173{
173 int i; 174 int i;
174 175
175 WRITE_LOCK(&clusterip_lock); 176 write_lock_bh(&clusterip_lock);
176 177
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES 178 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) { 179 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock); 180 write_unlock_bh(&clusterip_lock);
180 return 1; 181 return 1;
181 } 182 }
182 183
183 /* check if we alrady have this number in our array */ 184 /* check if we alrady have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) { 185 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) { 186 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock); 187 write_unlock_bh(&clusterip_lock);
187 return 1; 188 return 1;
188 } 189 }
189 } 190 }
190 191
191 c->local_nodes[c->num_local_nodes++] = nodenum; 192 c->local_nodes[c->num_local_nodes++] = nodenum;
192 193
193 WRITE_UNLOCK(&clusterip_lock); 194 write_unlock_bh(&clusterip_lock);
194 return 0; 195 return 0;
195} 196}
196 197
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{ 200{
200 int i; 201 int i;
201 202
202 WRITE_LOCK(&clusterip_lock); 203 write_lock_bh(&clusterip_lock);
203 204
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { 205 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock); 206 write_unlock_bh(&clusterip_lock);
206 return 1; 207 return 1;
207 } 208 }
208 209
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); 212 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); 213 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--; 214 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock); 215 write_unlock_bh(&clusterip_lock);
215 return 0; 216 return 0;
216 } 217 }
217 } 218 }
218 219
219 WRITE_UNLOCK(&clusterip_lock); 220 write_unlock_bh(&clusterip_lock);
220 return 1; 221 return 1;
221} 222}
222 223
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{ 287{
287 int i; 288 int i;
288 289
289 READ_LOCK(&clusterip_lock); 290 read_lock_bh(&clusterip_lock);
290 291
291 if (config->num_local_nodes == 0) { 292 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock); 293 read_unlock_bh(&clusterip_lock);
293 return 0; 294 return 0;
294 } 295 }
295 296
296 for (i = 0; i < config->num_local_nodes; i++) { 297 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) { 298 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock); 299 read_unlock_bh(&clusterip_lock);
299 return 1; 300 return 1;
300 } 301 }
301 } 302 }
302 303
303 READ_UNLOCK(&clusterip_lock); 304 read_unlock_bh(&clusterip_lock);
304 305
305 return 0; 306 return 0;
306} 307}
@@ -338,7 +339,7 @@ target(struct sk_buff **pskb,
338 * error messages (RELATED) and information requests (see below) */ 339 * error messages (RELATED) and information requests (see below) */
339 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 340 if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
340 && (ctinfo == IP_CT_RELATED 341 && (ctinfo == IP_CT_RELATED
341 || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) 342 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
342 return IPT_CONTINUE; 343 return IPT_CONTINUE;
343 344
344 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 345 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
578 struct clusterip_config *c = pde->data; 579 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx; 580 unsigned int *nodeidx;
580 581
581 READ_LOCK(&clusterip_lock); 582 read_lock_bh(&clusterip_lock);
582 if (*pos >= c->num_local_nodes) 583 if (*pos >= c->num_local_nodes)
583 return NULL; 584 return NULL;
584 585
@@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
608{ 609{
609 kfree(v); 610 kfree(v);
610 611
611 READ_UNLOCK(&clusterip_lock); 612 read_unlock_bh(&clusterip_lock);
612} 613}
613 614
614static int clusterip_seq_show(struct seq_file *s, void *v) 615static int clusterip_seq_show(struct seq_file *s, void *v)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c36..91e74502c3d3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
33#endif 33#endif
34 34
35/* Lock protects masq region inside conntrack */ 35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock); 36static DEFINE_RWLOCK(masq_lock);
37 37
38/* FIXME: Multiple targets. --RR */ 38/* FIXME: Multiple targets. --RR */
39static int 39static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
103 return NF_DROP; 103 return NF_DROP;
104 } 104 }
105 105
106 WRITE_LOCK(&masq_lock); 106 write_lock_bh(&masq_lock);
107 ct->nat.masq_index = out->ifindex; 107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock); 108 write_unlock_bh(&masq_lock);
109 109
110 /* Transfer from original range. */ 110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range) 111 newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
122{ 122{
123 int ret; 123 int ret;
124 124
125 READ_LOCK(&masq_lock); 125 read_lock_bh(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex); 126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock); 127 read_unlock_bh(&masq_lock);
128 128
129 return ret; 129 return ret;
130} 130}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d64979286..915696446020 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
104static void send_reset(struct sk_buff *oldskb, int hook) 104static void send_reset(struct sk_buff *oldskb, int hook)
105{ 105{
106 struct sk_buff *nskb; 106 struct sk_buff *nskb;
107 struct iphdr *iph = oldskb->nh.iph;
107 struct tcphdr _otcph, *oth, *tcph; 108 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt; 109 struct rtable *rt;
109 u_int16_t tmp_port; 110 u_int16_t tmp_port;
110 u_int32_t tmp_addr; 111 u_int32_t tmp_addr;
112 unsigned int tcplen;
111 int needs_ack; 113 int needs_ack;
112 int hh_len; 114 int hh_len;
113 115
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
124 if (oth->rst) 126 if (oth->rst)
125 return; 127 return;
126 128
127 /* FIXME: Check checksum --RR */ 129 /* Check checksum */
130 tcplen = oldskb->len - iph->ihl * 4;
131 if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
132 (hook == NF_IP_LOCAL_IN &&
133 oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
134 csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
135 oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
136 skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
137 return;
138
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL) 139 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return; 140 return;
130 141
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16cd..52a0076302a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
56#include <linux/netfilter.h> 56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h> 57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h> 58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h> 59#include <net/sock.h>
61#include <linux/bitops.h> 60#include <linux/bitops.h>
62 61
@@ -99,8 +98,8 @@ typedef struct {
99 98
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ 99static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101 100
102static struct sock *nflognl; /* our socket */ 101static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */ 102static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
104 103
105/* send one ulog_buff_t to userspace */ 104/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum) 105static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
135 134
136 /* lock to protect against somebody modifying our structure 135 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */ 136 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock); 137 spin_lock_bh(&ulog_lock);
139 ulog_send(data); 138 ulog_send(data);
140 UNLOCK_BH(&ulog_lock); 139 spin_unlock_bh(&ulog_lock);
141} 140}
142 141
143static struct sk_buff *ulog_alloc_skb(unsigned int size) 142static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
193 192
194 ub = &ulog_buffers[groupnum]; 193 ub = &ulog_buffers[groupnum];
195 194
196 LOCK_BH(&ulog_lock); 195 spin_lock_bh(&ulog_lock);
197 196
198 if (!ub->skb) { 197 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size))) 198 if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
278 ulog_send(groupnum); 277 ulog_send(groupnum);
279 } 278 }
280 279
281 UNLOCK_BH(&ulog_lock); 280 spin_unlock_bh(&ulog_lock);
282 281
283 return; 282 return;
284 283
@@ -288,7 +287,7 @@ nlmsg_failure:
288alloc_failure: 287alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n"); 288 PRINTR("ipt_ULOG: Error building netlink message\n");
290 289
291 UNLOCK_BH(&ulog_lock); 290 spin_unlock_bh(&ulog_lock);
292} 291}
293 292
294static unsigned int ipt_ulog_target(struct sk_buff **pskb, 293static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd77..564b49bfebcf 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/netfilter_ipv4/ip_tables.h> 38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h> 39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41 40
42/* FIXME: this is just for IP_NF_ASSERRT */ 41/* FIXME: this is just for IP_NF_ASSERRT */
43#include <linux/netfilter_ipv4/ip_conntrack.h> 42#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
92 struct hlist_head hash[0]; /* hashtable itself */ 91 struct hlist_head hash[0]; /* hashtable itself */
93}; 92};
94 93
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
233 hinfo->timer.function = htable_gc; 232 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer); 233 add_timer(&hinfo->timer);
235 234
236 LOCK_BH(&hashlimit_lock); 235 spin_lock_bh(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables); 236 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock); 237 spin_unlock_bh(&hashlimit_lock);
239 238
240 return 0; 239 return 0;
241} 240}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
301 struct ipt_hashlimit_htable *hinfo; 300 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos; 301 struct hlist_node *pos;
303 302
304 LOCK_BH(&hashlimit_lock); 303 spin_lock_bh(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { 304 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) { 305 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use); 306 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock); 307 spin_unlock_bh(&hashlimit_lock);
309 return hinfo; 308 return hinfo;
310 } 309 }
311 } 310 }
312 UNLOCK_BH(&hashlimit_lock); 311 spin_unlock_bh(&hashlimit_lock);
313 312
314 return NULL; 313 return NULL;
315} 314}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
317static void htable_put(struct ipt_hashlimit_htable *hinfo) 316static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{ 317{
319 if (atomic_dec_and_test(&hinfo->use)) { 318 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock); 319 spin_lock_bh(&hashlimit_lock);
321 hlist_del(&hinfo->node); 320 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock); 321 spin_unlock_bh(&hashlimit_lock);
323 htable_destroy(hinfo); 322 htable_destroy(hinfo);
324 } 323 }
325} 324}
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d3..3e7dd014de43 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
53 return ret; 53 return ret;
54 } 54 }
55 55
56 READ_LOCK(&ip_conntrack_lock); 56 read_lock_bh(&ip_conntrack_lock);
57 if (!ct->master->helper) { 57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n", 58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 exp->expectant); 59 exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
69 ret ^= !strncmp(ct->master->helper->name, info->name, 69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name)); 70 strlen(ct->master->helper->name));
71out_unlock: 71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock); 72 read_unlock_bh(&ip_conntrack_lock);
73 return ret; 73 return ret;
74} 74}
75 75
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 25ab9fabdcba..2d44b07688af 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -223,7 +223,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
223 curr_table->table[count].last_seen = 0; 223 curr_table->table[count].last_seen = 0;
224 curr_table->table[count].addr = 0; 224 curr_table->table[count].addr = 0;
225 curr_table->table[count].ttl = 0; 225 curr_table->table[count].ttl = 0;
226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 226 memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
227 curr_table->table[count].oldest_pkt = 0; 227 curr_table->table[count].oldest_pkt = 0;
228 curr_table->table[count].time_pos = 0; 228 curr_table->table[count].time_pos = 0;
229 curr_table->time_info[count].position = count; 229 curr_table->time_info[count].position = count;
@@ -502,7 +502,7 @@ match(const struct sk_buff *skb,
502 location = time_info[curr_table->time_pos].position; 502 location = time_info[curr_table->time_pos].position;
503 hash_table[r_list[location].hash_entry] = -1; 503 hash_table[r_list[location].hash_entry] = -1;
504 hash_table[hash_result] = location; 504 hash_table[hash_result] = location;
505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 505 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
506 r_list[location].time_pos = curr_table->time_pos; 506 r_list[location].time_pos = curr_table->time_pos;
507 r_list[location].addr = addr; 507 r_list[location].addr = addr;
508 r_list[location].ttl = ttl; 508 r_list[location].ttl = ttl;
@@ -631,7 +631,7 @@ match(const struct sk_buff *skb,
631 r_list[location].last_seen = 0; 631 r_list[location].last_seen = 0;
632 r_list[location].addr = 0; 632 r_list[location].addr = 0;
633 r_list[location].ttl = 0; 633 r_list[location].ttl = 0;
634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); 634 memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
635 r_list[location].oldest_pkt = 0; 635 r_list[location].oldest_pkt = 0;
636 ans = !info->invert; 636 ans = !info->invert;
637 } 637 }
@@ -734,10 +734,10 @@ checkentry(const char *tablename,
734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); 734 memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
735#ifdef DEBUG 735#ifdef DEBUG
736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", 736 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
737 sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 737 sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
738#endif 738#endif
739 739
740 hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); 740 hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
741#ifdef DEBUG 741#ifdef DEBUG
742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); 742 if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
743#endif 743#endif
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5b1ec586bae6..d1835b1bc8c4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -259,7 +259,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
259 return 0; 259 return 0;
260} 260}
261 261
262static int raw_send_hdrinc(struct sock *sk, void *from, int length, 262static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
263 struct rtable *rt, 263 struct rtable *rt,
264 unsigned int flags) 264 unsigned int flags)
265{ 265{
@@ -298,7 +298,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, int length,
298 goto error_fault; 298 goto error_fault;
299 299
300 /* We don't modify invalid header */ 300 /* We don't modify invalid header */
301 if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { 301 if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
302 if (!iph->saddr) 302 if (!iph->saddr)
303 iph->saddr = rt->rt_src; 303 iph->saddr = rt->rt_src;
304 iph->check = 0; 304 iph->check = 0;
@@ -332,7 +332,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
332 u8 __user *type = NULL; 332 u8 __user *type = NULL;
333 u8 __user *code = NULL; 333 u8 __user *code = NULL;
334 int probed = 0; 334 int probed = 0;
335 int i; 335 unsigned int i;
336 336
337 if (!msg->msg_iov) 337 if (!msg->msg_iov)
338 return; 338 return;
@@ -384,7 +384,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
384 int err; 384 int err;
385 385
386 err = -EMSGSIZE; 386 err = -EMSGSIZE;
387 if (len < 0 || len > 0xFFFF) 387 if (len > 0xFFFF)
388 goto out; 388 goto out;
389 389
390 /* 390 /*
@@ -514,7 +514,10 @@ done:
514 kfree(ipc.opt); 514 kfree(ipc.opt);
515 ip_rt_put(rt); 515 ip_rt_put(rt);
516 516
517out: return err < 0 ? err : len; 517out:
518 if (err < 0)
519 return err;
520 return len;
518 521
519do_confirm: 522do_confirm:
520 dst_confirm(&rt->u.dst); 523 dst_confirm(&rt->u.dst);
@@ -610,7 +613,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
610 copied = skb->len; 613 copied = skb->len;
611done: 614done:
612 skb_free_datagram(sk, skb); 615 skb_free_datagram(sk, skb);
613out: return err ? err : copied; 616out:
617 if (err)
618 return err;
619 return copied;
614} 620}
615 621
616static int raw_init(struct sock *sk) 622static int raw_init(struct sock *sk)
@@ -691,11 +697,11 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
691 struct sk_buff *skb; 697 struct sk_buff *skb;
692 int amount = 0; 698 int amount = 0;
693 699
694 spin_lock_irq(&sk->sk_receive_queue.lock); 700 spin_lock_bh(&sk->sk_receive_queue.lock);
695 skb = skb_peek(&sk->sk_receive_queue); 701 skb = skb_peek(&sk->sk_receive_queue);
696 if (skb != NULL) 702 if (skb != NULL)
697 amount = skb->len; 703 amount = skb->len;
698 spin_unlock_irq(&sk->sk_receive_queue.lock); 704 spin_unlock_bh(&sk->sk_receive_queue.lock);
699 return put_user(amount, (int __user *)arg); 705 return put_user(amount, (int __user *)arg);
700 } 706 }
701 707
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a682d28e247b..80cf633d9f4a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1767,7 +1767,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
1767 struct in_device *in_dev, 1767 struct in_device *in_dev,
1768 u32 daddr, u32 saddr, u32 tos) 1768 u32 daddr, u32 saddr, u32 tos)
1769{ 1769{
1770 struct rtable* rth; 1770 struct rtable* rth = NULL;
1771 int err; 1771 int err;
1772 unsigned hash; 1772 unsigned hash;
1773 1773
@@ -1794,7 +1794,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
1794 u32 daddr, u32 saddr, u32 tos) 1794 u32 daddr, u32 saddr, u32 tos)
1795{ 1795{
1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1796#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1797 struct rtable* rth; 1797 struct rtable* rth = NULL;
1798 unsigned char hop, hopcount, lasthop; 1798 unsigned char hop, hopcount, lasthop;
1799 int err = -EINVAL; 1799 int err = -EINVAL;
1800 unsigned int hash; 1800 unsigned int hash;
@@ -2239,7 +2239,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
2239 struct net_device *dev_out, 2239 struct net_device *dev_out,
2240 unsigned flags) 2240 unsigned flags)
2241{ 2241{
2242 struct rtable *rth; 2242 struct rtable *rth = NULL;
2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2244 unsigned hash; 2244 unsigned hash;
2245 if (err == 0) { 2245 if (err == 0) {
@@ -2267,7 +2267,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
2267 unsigned char hop; 2267 unsigned char hop;
2268 unsigned hash; 2268 unsigned hash;
2269 int err = -EINVAL; 2269 int err = -EINVAL;
2270 struct rtable *rth; 2270 struct rtable *rth = NULL;
2271 2271
2272 if (res->fi && res->fi->fib_nhs > 1) { 2272 if (res->fi && res->fi->fib_nhs > 1) {
2273 unsigned char hopcount = res->fi->fib_nhs; 2273 unsigned char hopcount = res->fi->fib_nhs;
@@ -2581,7 +2581,7 @@ int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2581} 2581}
2582 2582
2583static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 2583static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2584 int nowait) 2584 int nowait, unsigned int flags)
2585{ 2585{
2586 struct rtable *rt = (struct rtable*)skb->dst; 2586 struct rtable *rt = (struct rtable*)skb->dst;
2587 struct rtmsg *r; 2587 struct rtmsg *r;
@@ -2591,9 +2591,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2591#ifdef CONFIG_IP_MROUTE 2591#ifdef CONFIG_IP_MROUTE
2592 struct rtattr *eptr; 2592 struct rtattr *eptr;
2593#endif 2593#endif
2594 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); 2594 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2595 r = NLMSG_DATA(nlh); 2595 r = NLMSG_DATA(nlh);
2596 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2597 r->rtm_family = AF_INET; 2596 r->rtm_family = AF_INET;
2598 r->rtm_dst_len = 32; 2597 r->rtm_dst_len = 32;
2599 r->rtm_src_len = 0; 2598 r->rtm_src_len = 0;
@@ -2744,7 +2743,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2744 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2745 2744
2746 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2747 RTM_NEWROUTE, 0); 2746 RTM_NEWROUTE, 0, 0);
2748 if (!err) 2747 if (!err)
2749 goto out_free; 2748 goto out_free;
2750 if (err < 0) { 2749 if (err < 0) {
@@ -2781,8 +2780,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2781 continue; 2780 continue;
2782 skb->dst = dst_clone(&rt->u.dst); 2781 skb->dst = dst_clone(&rt->u.dst);
2783 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784 cb->nlh->nlmsg_seq, 2783 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785 RTM_NEWROUTE, 1) <= 0) { 2784 1, NLM_F_MULTI) <= 0) {
2786 dst_release(xchg(&skb->dst, NULL)); 2785 dst_release(xchg(&skb->dst, NULL));
2787 rcu_read_unlock_bh(); 2786 rcu_read_unlock_bh();
2788 goto done; 2787 goto done;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e923d2f021aa..72d014442185 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,10 +169,10 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 169 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
170} 170}
171 171
172extern struct or_calltable or_ipv4; 172extern struct request_sock_ops tcp_request_sock_ops;
173 173
174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 174static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
175 struct open_request *req, 175 struct request_sock *req,
176 struct dst_entry *dst) 176 struct dst_entry *dst)
177{ 177{
178 struct tcp_sock *tp = tcp_sk(sk); 178 struct tcp_sock *tp = tcp_sk(sk);
@@ -182,7 +182,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
182 if (child) 182 if (child)
183 tcp_acceptq_queue(sk, req, child); 183 tcp_acceptq_queue(sk, req, child);
184 else 184 else
185 tcp_openreq_free(req); 185 reqsk_free(req);
186 186
187 return child; 187 return child;
188} 188}
@@ -190,10 +190,12 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 190struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
191 struct ip_options *opt) 191 struct ip_options *opt)
192{ 192{
193 struct inet_request_sock *ireq;
194 struct tcp_request_sock *treq;
193 struct tcp_sock *tp = tcp_sk(sk); 195 struct tcp_sock *tp = tcp_sk(sk);
194 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 196 __u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
195 struct sock *ret = sk; 197 struct sock *ret = sk;
196 struct open_request *req; 198 struct request_sock *req;
197 int mss; 199 int mss;
198 struct rtable *rt; 200 struct rtable *rt;
199 __u8 rcv_wscale; 201 __u8 rcv_wscale;
@@ -209,19 +211,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
209 211
210 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); 212 NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
211 213
212 req = tcp_openreq_alloc();
213 ret = NULL; 214 ret = NULL;
215 req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
214 if (!req) 216 if (!req)
215 goto out; 217 goto out;
216 218
217 req->rcv_isn = htonl(skb->h.th->seq) - 1; 219 ireq = inet_rsk(req);
218 req->snt_isn = cookie; 220 treq = tcp_rsk(req);
221 treq->rcv_isn = htonl(skb->h.th->seq) - 1;
222 treq->snt_isn = cookie;
219 req->mss = mss; 223 req->mss = mss;
220 req->rmt_port = skb->h.th->source; 224 ireq->rmt_port = skb->h.th->source;
221 req->af.v4_req.loc_addr = skb->nh.iph->daddr; 225 ireq->loc_addr = skb->nh.iph->daddr;
222 req->af.v4_req.rmt_addr = skb->nh.iph->saddr; 226 ireq->rmt_addr = skb->nh.iph->saddr;
223 req->class = &or_ipv4; /* for savety */ 227 ireq->opt = NULL;
224 req->af.v4_req.opt = NULL;
225 228
226 /* We throwed the options of the initial SYN away, so we hope 229 /* We throwed the options of the initial SYN away, so we hope
227 * the ACK carries the same options again (see RFC1122 4.2.3.8) 230 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -229,17 +232,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
229 if (opt && opt->optlen) { 232 if (opt && opt->optlen) {
230 int opt_size = sizeof(struct ip_options) + opt->optlen; 233 int opt_size = sizeof(struct ip_options) + opt->optlen;
231 234
232 req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); 235 ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
233 if (req->af.v4_req.opt) { 236 if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
234 if (ip_options_echo(req->af.v4_req.opt, skb)) { 237 kfree(ireq->opt);
235 kfree(req->af.v4_req.opt); 238 ireq->opt = NULL;
236 req->af.v4_req.opt = NULL;
237 }
238 } 239 }
239 } 240 }
240 241
241 req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; 242 ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
242 req->wscale_ok = req->sack_ok = 0; 243 ireq->wscale_ok = ireq->sack_ok = 0;
243 req->expires = 0UL; 244 req->expires = 0UL;
244 req->retrans = 0; 245 req->retrans = 0;
245 246
@@ -253,15 +254,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
253 struct flowi fl = { .nl_u = { .ip4_u = 254 struct flowi fl = { .nl_u = { .ip4_u =
254 { .daddr = ((opt && opt->srr) ? 255 { .daddr = ((opt && opt->srr) ?
255 opt->faddr : 256 opt->faddr :
256 req->af.v4_req.rmt_addr), 257 ireq->rmt_addr),
257 .saddr = req->af.v4_req.loc_addr, 258 .saddr = ireq->loc_addr,
258 .tos = RT_CONN_FLAGS(sk) } }, 259 .tos = RT_CONN_FLAGS(sk) } },
259 .proto = IPPROTO_TCP, 260 .proto = IPPROTO_TCP,
260 .uli_u = { .ports = 261 .uli_u = { .ports =
261 { .sport = skb->h.th->dest, 262 { .sport = skb->h.th->dest,
262 .dport = skb->h.th->source } } }; 263 .dport = skb->h.th->source } } };
263 if (ip_route_output_key(&rt, &fl)) { 264 if (ip_route_output_key(&rt, &fl)) {
264 tcp_openreq_free(req); 265 reqsk_free(req);
265 goto out; 266 goto out;
266 } 267 }
267 } 268 }
@@ -272,7 +273,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
272 &req->rcv_wnd, &req->window_clamp, 273 &req->rcv_wnd, &req->window_clamp,
273 0, &rcv_wscale); 274 0, &rcv_wscale);
274 /* BTW win scale with syncookies is 0 by definition */ 275 /* BTW win scale with syncookies is 0 by definition */
275 req->rcv_wscale = rcv_wscale; 276 ireq->rcv_wscale = rcv_wscale;
276 277
277 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 278 ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
278out: return ret; 279out: return ret;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3aafb298c1c1..23068bddbf0b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -23,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind;
23extern int sysctl_icmp_echo_ignore_all; 23extern int sysctl_icmp_echo_ignore_all;
24extern int sysctl_icmp_echo_ignore_broadcasts; 24extern int sysctl_icmp_echo_ignore_broadcasts;
25extern int sysctl_icmp_ignore_bogus_error_responses; 25extern int sysctl_icmp_ignore_bogus_error_responses;
26extern int sysctl_icmp_errors_use_inbound_ifaddr;
26 27
27/* From ip_fragment.c */ 28/* From ip_fragment.c */
28extern int sysctl_ipfrag_low_thresh; 29extern int sysctl_ipfrag_low_thresh;
@@ -396,6 +397,14 @@ ctl_table ipv4_table[] = {
396 .proc_handler = &proc_dointvec 397 .proc_handler = &proc_dointvec
397 }, 398 },
398 { 399 {
400 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
401 .procname = "icmp_errors_use_inbound_ifaddr",
402 .data = &sysctl_icmp_errors_use_inbound_ifaddr,
403 .maxlen = sizeof(int),
404 .mode = 0644,
405 .proc_handler = &proc_dointvec
406 },
407 {
399 .ctl_name = NET_IPV4_ROUTE, 408 .ctl_name = NET_IPV4_ROUTE,
400 .procname = "route", 409 .procname = "route",
401 .maxlen = 0, 410 .maxlen = 0,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a037bafcba3c..674bbd8cfd36 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,7 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271 271
272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); 272DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273 273
274kmem_cache_t *tcp_openreq_cachep;
275kmem_cache_t *tcp_bucket_cachep; 274kmem_cache_t *tcp_bucket_cachep;
276kmem_cache_t *tcp_timewait_cachep; 275kmem_cache_t *tcp_timewait_cachep;
277 276
@@ -317,7 +316,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure);
317static __inline__ unsigned int tcp_listen_poll(struct sock *sk, 316static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
318 poll_table *wait) 317 poll_table *wait)
319{ 318{
320 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0; 319 return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
321} 320}
322 321
323/* 322/*
@@ -463,28 +462,15 @@ int tcp_listen_start(struct sock *sk)
463{ 462{
464 struct inet_sock *inet = inet_sk(sk); 463 struct inet_sock *inet = inet_sk(sk);
465 struct tcp_sock *tp = tcp_sk(sk); 464 struct tcp_sock *tp = tcp_sk(sk);
466 struct tcp_listen_opt *lopt; 465 int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467 if (rc != 0)
468 return rc;
467 469
468 sk->sk_max_ack_backlog = 0; 470 sk->sk_max_ack_backlog = 0;
469 sk->sk_ack_backlog = 0; 471 sk->sk_ack_backlog = 0;
470 tp->accept_queue = tp->accept_queue_tail = NULL;
471 rwlock_init(&tp->syn_wait_lock);
472 tcp_delack_init(tp); 472 tcp_delack_init(tp);
473 473
474 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
475 if (!lopt)
476 return -ENOMEM;
477
478 memset(lopt, 0, sizeof(struct tcp_listen_opt));
479 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
480 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
481 break;
482 get_random_bytes(&lopt->hash_rnd, 4);
483
484 write_lock_bh(&tp->syn_wait_lock);
485 tp->listen_opt = lopt;
486 write_unlock_bh(&tp->syn_wait_lock);
487
488 /* There is race window here: we announce ourselves listening, 474 /* There is race window here: we announce ourselves listening,
489 * but this transition is still not validated by get_port(). 475 * but this transition is still not validated by get_port().
490 * It is OK, because this socket enters to hash table only 476 * It is OK, because this socket enters to hash table only
@@ -501,10 +487,7 @@ int tcp_listen_start(struct sock *sk)
501 } 487 }
502 488
503 sk->sk_state = TCP_CLOSE; 489 sk->sk_state = TCP_CLOSE;
504 write_lock_bh(&tp->syn_wait_lock); 490 reqsk_queue_destroy(&tp->accept_queue);
505 tp->listen_opt = NULL;
506 write_unlock_bh(&tp->syn_wait_lock);
507 kfree(lopt);
508 return -EADDRINUSE; 491 return -EADDRINUSE;
509} 492}
510 493
@@ -516,25 +499,23 @@ int tcp_listen_start(struct sock *sk)
516static void tcp_listen_stop (struct sock *sk) 499static void tcp_listen_stop (struct sock *sk)
517{ 500{
518 struct tcp_sock *tp = tcp_sk(sk); 501 struct tcp_sock *tp = tcp_sk(sk);
519 struct tcp_listen_opt *lopt = tp->listen_opt; 502 struct listen_sock *lopt;
520 struct open_request *acc_req = tp->accept_queue; 503 struct request_sock *acc_req;
521 struct open_request *req; 504 struct request_sock *req;
522 int i; 505 int i;
523 506
524 tcp_delete_keepalive_timer(sk); 507 tcp_delete_keepalive_timer(sk);
525 508
526 /* make all the listen_opt local to us */ 509 /* make all the listen_opt local to us */
527 write_lock_bh(&tp->syn_wait_lock); 510 lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
528 tp->listen_opt = NULL; 511 acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
529 write_unlock_bh(&tp->syn_wait_lock);
530 tp->accept_queue = tp->accept_queue_tail = NULL;
531 512
532 if (lopt->qlen) { 513 if (lopt->qlen) {
533 for (i = 0; i < TCP_SYNQ_HSIZE; i++) { 514 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
534 while ((req = lopt->syn_table[i]) != NULL) { 515 while ((req = lopt->syn_table[i]) != NULL) {
535 lopt->syn_table[i] = req->dl_next; 516 lopt->syn_table[i] = req->dl_next;
536 lopt->qlen--; 517 lopt->qlen--;
537 tcp_openreq_free(req); 518 reqsk_free(req);
538 519
539 /* Following specs, it would be better either to send FIN 520 /* Following specs, it would be better either to send FIN
540 * (and enter FIN-WAIT-1, it is normal close) 521 * (and enter FIN-WAIT-1, it is normal close)
@@ -574,7 +555,7 @@ static void tcp_listen_stop (struct sock *sk)
574 sock_put(child); 555 sock_put(child);
575 556
576 sk_acceptq_removed(sk); 557 sk_acceptq_removed(sk);
577 tcp_openreq_fastfree(req); 558 __reqsk_free(req);
578 } 559 }
579 BUG_TRAP(!sk->sk_ack_backlog); 560 BUG_TRAP(!sk->sk_ack_backlog);
580} 561}
@@ -1345,7 +1326,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1345 1326
1346 cleanup_rbuf(sk, copied); 1327 cleanup_rbuf(sk, copied);
1347 1328
1348 if (tp->ucopy.task == user_recv) { 1329 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1349 /* Install new reader */ 1330 /* Install new reader */
1350 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { 1331 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1351 user_recv = current; 1332 user_recv = current;
@@ -1868,11 +1849,11 @@ static int wait_for_connect(struct sock *sk, long timeo)
1868 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 1849 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1869 TASK_INTERRUPTIBLE); 1850 TASK_INTERRUPTIBLE);
1870 release_sock(sk); 1851 release_sock(sk);
1871 if (!tp->accept_queue) 1852 if (reqsk_queue_empty(&tp->accept_queue))
1872 timeo = schedule_timeout(timeo); 1853 timeo = schedule_timeout(timeo);
1873 lock_sock(sk); 1854 lock_sock(sk);
1874 err = 0; 1855 err = 0;
1875 if (tp->accept_queue) 1856 if (!reqsk_queue_empty(&tp->accept_queue))
1876 break; 1857 break;
1877 err = -EINVAL; 1858 err = -EINVAL;
1878 if (sk->sk_state != TCP_LISTEN) 1859 if (sk->sk_state != TCP_LISTEN)
@@ -1895,7 +1876,6 @@ static int wait_for_connect(struct sock *sk, long timeo)
1895struct sock *tcp_accept(struct sock *sk, int flags, int *err) 1876struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1896{ 1877{
1897 struct tcp_sock *tp = tcp_sk(sk); 1878 struct tcp_sock *tp = tcp_sk(sk);
1898 struct open_request *req;
1899 struct sock *newsk; 1879 struct sock *newsk;
1900 int error; 1880 int error;
1901 1881
@@ -1906,37 +1886,31 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1906 */ 1886 */
1907 error = -EINVAL; 1887 error = -EINVAL;
1908 if (sk->sk_state != TCP_LISTEN) 1888 if (sk->sk_state != TCP_LISTEN)
1909 goto out; 1889 goto out_err;
1910 1890
1911 /* Find already established connection */ 1891 /* Find already established connection */
1912 if (!tp->accept_queue) { 1892 if (reqsk_queue_empty(&tp->accept_queue)) {
1913 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 1893 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1914 1894
1915 /* If this is a non blocking socket don't sleep */ 1895 /* If this is a non blocking socket don't sleep */
1916 error = -EAGAIN; 1896 error = -EAGAIN;
1917 if (!timeo) 1897 if (!timeo)
1918 goto out; 1898 goto out_err;
1919 1899
1920 error = wait_for_connect(sk, timeo); 1900 error = wait_for_connect(sk, timeo);
1921 if (error) 1901 if (error)
1922 goto out; 1902 goto out_err;
1923 } 1903 }
1924 1904
1925 req = tp->accept_queue; 1905 newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1926 if ((tp->accept_queue = req->dl_next) == NULL)
1927 tp->accept_queue_tail = NULL;
1928
1929 newsk = req->sk;
1930 sk_acceptq_removed(sk);
1931 tcp_openreq_fastfree(req);
1932 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); 1906 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1933 release_sock(sk);
1934 return newsk;
1935
1936out: 1907out:
1937 release_sock(sk); 1908 release_sock(sk);
1909 return newsk;
1910out_err:
1911 newsk = NULL;
1938 *err = error; 1912 *err = error;
1939 return NULL; 1913 goto out;
1940} 1914}
1941 1915
1942/* 1916/*
@@ -2271,13 +2245,6 @@ void __init tcp_init(void)
2271 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), 2245 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2272 sizeof(skb->cb)); 2246 sizeof(skb->cb));
2273 2247
2274 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2275 sizeof(struct open_request),
2276 0, SLAB_HWCACHE_ALIGN,
2277 NULL, NULL);
2278 if (!tcp_openreq_cachep)
2279 panic("tcp_init: Cannot alloc open_request cache.");
2280
2281 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", 2248 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282 sizeof(struct tcp_bind_bucket), 2249 sizeof(struct tcp_bind_bucket),
2283 0, SLAB_HWCACHE_ALIGN, 2250 0, SLAB_HWCACHE_ALIGN,
@@ -2338,7 +2305,7 @@ void __init tcp_init(void)
2338 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); 2305 (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339 order++) 2306 order++)
2340 ; 2307 ;
2341 if (order > 4) { 2308 if (order >= 4) {
2342 sysctl_local_port_range[0] = 32768; 2309 sysctl_local_port_range[0] = 32768;
2343 sysctl_local_port_range[1] = 61000; 2310 sysctl_local_port_range[1] = 61000;
2344 sysctl_tcp_max_tw_buckets = 180000; 2311 sysctl_tcp_max_tw_buckets = 180000;
@@ -2374,7 +2341,6 @@ EXPORT_SYMBOL(tcp_destroy_sock);
2374EXPORT_SYMBOL(tcp_disconnect); 2341EXPORT_SYMBOL(tcp_disconnect);
2375EXPORT_SYMBOL(tcp_getsockopt); 2342EXPORT_SYMBOL(tcp_getsockopt);
2376EXPORT_SYMBOL(tcp_ioctl); 2343EXPORT_SYMBOL(tcp_ioctl);
2377EXPORT_SYMBOL(tcp_openreq_cachep);
2378EXPORT_SYMBOL(tcp_poll); 2344EXPORT_SYMBOL(tcp_poll);
2379EXPORT_SYMBOL(tcp_read_sock); 2345EXPORT_SYMBOL(tcp_read_sock);
2380EXPORT_SYMBOL(tcp_recvmsg); 2346EXPORT_SYMBOL(tcp_recvmsg);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 8faa8948f75c..634befc07921 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -455,9 +455,10 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
455} 455}
456 456
457static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, 457static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
458 struct open_request *req, 458 struct request_sock *req,
459 u32 pid, u32 seq) 459 u32 pid, u32 seq)
460{ 460{
461 const struct inet_request_sock *ireq = inet_rsk(req);
461 struct inet_sock *inet = inet_sk(sk); 462 struct inet_sock *inet = inet_sk(sk);
462 unsigned char *b = skb->tail; 463 unsigned char *b = skb->tail;
463 struct tcpdiagmsg *r; 464 struct tcpdiagmsg *r;
@@ -482,9 +483,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
482 tmo = 0; 483 tmo = 0;
483 484
484 r->id.tcpdiag_sport = inet->sport; 485 r->id.tcpdiag_sport = inet->sport;
485 r->id.tcpdiag_dport = req->rmt_port; 486 r->id.tcpdiag_dport = ireq->rmt_port;
486 r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr; 487 r->id.tcpdiag_src[0] = ireq->loc_addr;
487 r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr; 488 r->id.tcpdiag_dst[0] = ireq->rmt_addr;
488 r->tcpdiag_expires = jiffies_to_msecs(tmo), 489 r->tcpdiag_expires = jiffies_to_msecs(tmo),
489 r->tcpdiag_rqueue = 0; 490 r->tcpdiag_rqueue = 0;
490 r->tcpdiag_wqueue = 0; 491 r->tcpdiag_wqueue = 0;
@@ -493,9 +494,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
493#ifdef CONFIG_IP_TCPDIAG_IPV6 494#ifdef CONFIG_IP_TCPDIAG_IPV6
494 if (r->tcpdiag_family == AF_INET6) { 495 if (r->tcpdiag_family == AF_INET6) {
495 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, 496 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
496 &req->af.v6_req.loc_addr); 497 &tcp6_rsk(req)->loc_addr);
497 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, 498 ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
498 &req->af.v6_req.rmt_addr); 499 &tcp6_rsk(req)->rmt_addr);
499 } 500 }
500#endif 501#endif
501 nlh->nlmsg_len = skb->tail - b; 502 nlh->nlmsg_len = skb->tail - b;
@@ -513,7 +514,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
513 struct tcpdiag_entry entry; 514 struct tcpdiag_entry entry;
514 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); 515 struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
515 struct tcp_sock *tp = tcp_sk(sk); 516 struct tcp_sock *tp = tcp_sk(sk);
516 struct tcp_listen_opt *lopt; 517 struct listen_sock *lopt;
517 struct rtattr *bc = NULL; 518 struct rtattr *bc = NULL;
518 struct inet_sock *inet = inet_sk(sk); 519 struct inet_sock *inet = inet_sk(sk);
519 int j, s_j; 520 int j, s_j;
@@ -528,9 +529,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
528 529
529 entry.family = sk->sk_family; 530 entry.family = sk->sk_family;
530 531
531 read_lock_bh(&tp->syn_wait_lock); 532 read_lock_bh(&tp->accept_queue.syn_wait_lock);
532 533
533 lopt = tp->listen_opt; 534 lopt = tp->accept_queue.listen_opt;
534 if (!lopt || !lopt->qlen) 535 if (!lopt || !lopt->qlen)
535 goto out; 536 goto out;
536 537
@@ -541,13 +542,15 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
541 } 542 }
542 543
543 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { 544 for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
544 struct open_request *req, *head = lopt->syn_table[j]; 545 struct request_sock *req, *head = lopt->syn_table[j];
545 546
546 reqnum = 0; 547 reqnum = 0;
547 for (req = head; req; reqnum++, req = req->dl_next) { 548 for (req = head; req; reqnum++, req = req->dl_next) {
549 struct inet_request_sock *ireq = inet_rsk(req);
550
548 if (reqnum < s_reqnum) 551 if (reqnum < s_reqnum)
549 continue; 552 continue;
550 if (r->id.tcpdiag_dport != req->rmt_port && 553 if (r->id.tcpdiag_dport != ireq->rmt_port &&
551 r->id.tcpdiag_dport) 554 r->id.tcpdiag_dport)
552 continue; 555 continue;
553 556
@@ -555,16 +558,16 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
555 entry.saddr = 558 entry.saddr =
556#ifdef CONFIG_IP_TCPDIAG_IPV6 559#ifdef CONFIG_IP_TCPDIAG_IPV6
557 (entry.family == AF_INET6) ? 560 (entry.family == AF_INET6) ?
558 req->af.v6_req.loc_addr.s6_addr32 : 561 tcp6_rsk(req)->loc_addr.s6_addr32 :
559#endif 562#endif
560 &req->af.v4_req.loc_addr; 563 &ireq->loc_addr;
561 entry.daddr = 564 entry.daddr =
562#ifdef CONFIG_IP_TCPDIAG_IPV6 565#ifdef CONFIG_IP_TCPDIAG_IPV6
563 (entry.family == AF_INET6) ? 566 (entry.family == AF_INET6) ?
564 req->af.v6_req.rmt_addr.s6_addr32 : 567 tcp6_rsk(req)->rmt_addr.s6_addr32 :
565#endif 568#endif
566 &req->af.v4_req.rmt_addr; 569 &ireq->rmt_addr;
567 entry.dport = ntohs(req->rmt_port); 570 entry.dport = ntohs(ireq->rmt_port);
568 571
569 if (!tcpdiag_bc_run(RTA_DATA(bc), 572 if (!tcpdiag_bc_run(RTA_DATA(bc),
570 RTA_PAYLOAD(bc), &entry)) 573 RTA_PAYLOAD(bc), &entry))
@@ -585,7 +588,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
585 } 588 }
586 589
587out: 590out:
588 read_unlock_bh(&tp->syn_wait_lock); 591 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
589 592
590 return err; 593 return err;
591} 594}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dad98e4a5043..2d41d5d6ad19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -36,7 +36,7 @@
36 * ACK bit. 36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery. 37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the 38 * Fixed many serious bugs in the
39 * open_request handling and moved 39 * request_sock handling and moved
40 * most of it into the af independent code. 40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes. 41 * Added tail drop and some other bugfixes.
42 * Added new listen sematics. 42 * Added new listen sematics.
@@ -869,21 +869,23 @@ static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); 869 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
870} 870}
871 871
872static struct open_request *tcp_v4_search_req(struct tcp_sock *tp, 872static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
873 struct open_request ***prevp, 873 struct request_sock ***prevp,
874 __u16 rport, 874 __u16 rport,
875 __u32 raddr, __u32 laddr) 875 __u32 raddr, __u32 laddr)
876{ 876{
877 struct tcp_listen_opt *lopt = tp->listen_opt; 877 struct listen_sock *lopt = tp->accept_queue.listen_opt;
878 struct open_request *req, **prev; 878 struct request_sock *req, **prev;
879 879
880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; 880 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
881 (req = *prev) != NULL; 881 (req = *prev) != NULL;
882 prev = &req->dl_next) { 882 prev = &req->dl_next) {
883 if (req->rmt_port == rport && 883 const struct inet_request_sock *ireq = inet_rsk(req);
884 req->af.v4_req.rmt_addr == raddr && 884
885 req->af.v4_req.loc_addr == laddr && 885 if (ireq->rmt_port == rport &&
886 TCP_INET_FAMILY(req->class->family)) { 886 ireq->rmt_addr == raddr &&
887 ireq->loc_addr == laddr &&
888 TCP_INET_FAMILY(req->rsk_ops->family)) {
887 BUG_TRAP(!req->sk); 889 BUG_TRAP(!req->sk);
888 *prevp = prev; 890 *prevp = prev;
889 break; 891 break;
@@ -893,21 +895,13 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
893 return req; 895 return req;
894} 896}
895 897
896static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) 898static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
897{ 899{
898 struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
899 struct tcp_listen_opt *lopt = tp->listen_opt; 901 struct listen_sock *lopt = tp->accept_queue.listen_opt;
900 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); 902 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
901
902 req->expires = jiffies + TCP_TIMEOUT_INIT;
903 req->retrans = 0;
904 req->sk = NULL;
905 req->dl_next = lopt->syn_table[h];
906
907 write_lock(&tp->syn_wait_lock);
908 lopt->syn_table[h] = req;
909 write_unlock(&tp->syn_wait_lock);
910 903
904 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
911 tcp_synq_added(sk); 905 tcp_synq_added(sk);
912} 906}
913 907
@@ -1050,7 +1044,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1050 } 1044 }
1051 1045
1052 switch (sk->sk_state) { 1046 switch (sk->sk_state) {
1053 struct open_request *req, **prev; 1047 struct request_sock *req, **prev;
1054 case TCP_LISTEN: 1048 case TCP_LISTEN:
1055 if (sock_owned_by_user(sk)) 1049 if (sock_owned_by_user(sk))
1056 goto out; 1050 goto out;
@@ -1065,7 +1059,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
1065 */ 1059 */
1066 BUG_TRAP(!req->sk); 1060 BUG_TRAP(!req->sk);
1067 1061
1068 if (seq != req->snt_isn) { 1062 if (seq != tcp_rsk(req)->snt_isn) {
1069 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); 1063 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1070 goto out; 1064 goto out;
1071 } 1065 }
@@ -1254,28 +1248,29 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1254 tcp_tw_put(tw); 1248 tcp_tw_put(tw);
1255} 1249}
1256 1250
1257static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) 1251static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1258{ 1252{
1259 tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd, 1253 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1260 req->ts_recent); 1254 req->ts_recent);
1261} 1255}
1262 1256
1263static struct dst_entry* tcp_v4_route_req(struct sock *sk, 1257static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1264 struct open_request *req) 1258 struct request_sock *req)
1265{ 1259{
1266 struct rtable *rt; 1260 struct rtable *rt;
1267 struct ip_options *opt = req->af.v4_req.opt; 1261 const struct inet_request_sock *ireq = inet_rsk(req);
1262 struct ip_options *opt = inet_rsk(req)->opt;
1268 struct flowi fl = { .oif = sk->sk_bound_dev_if, 1263 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1269 .nl_u = { .ip4_u = 1264 .nl_u = { .ip4_u =
1270 { .daddr = ((opt && opt->srr) ? 1265 { .daddr = ((opt && opt->srr) ?
1271 opt->faddr : 1266 opt->faddr :
1272 req->af.v4_req.rmt_addr), 1267 ireq->rmt_addr),
1273 .saddr = req->af.v4_req.loc_addr, 1268 .saddr = ireq->loc_addr,
1274 .tos = RT_CONN_FLAGS(sk) } }, 1269 .tos = RT_CONN_FLAGS(sk) } },
1275 .proto = IPPROTO_TCP, 1270 .proto = IPPROTO_TCP,
1276 .uli_u = { .ports = 1271 .uli_u = { .ports =
1277 { .sport = inet_sk(sk)->sport, 1272 { .sport = inet_sk(sk)->sport,
1278 .dport = req->rmt_port } } }; 1273 .dport = ireq->rmt_port } } };
1279 1274
1280 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 1275 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 1276 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
@@ -1291,12 +1286,13 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1291 1286
1292/* 1287/*
1293 * Send a SYN-ACK after having received an ACK. 1288 * Send a SYN-ACK after having received an ACK.
1294 * This still operates on a open_request only, not on a big 1289 * This still operates on a request_sock only, not on a big
1295 * socket. 1290 * socket.
1296 */ 1291 */
1297static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, 1292static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1298 struct dst_entry *dst) 1293 struct dst_entry *dst)
1299{ 1294{
1295 const struct inet_request_sock *ireq = inet_rsk(req);
1300 int err = -1; 1296 int err = -1;
1301 struct sk_buff * skb; 1297 struct sk_buff * skb;
1302 1298
@@ -1310,14 +1306,14 @@ static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1310 struct tcphdr *th = skb->h.th; 1306 struct tcphdr *th = skb->h.th;
1311 1307
1312 th->check = tcp_v4_check(th, skb->len, 1308 th->check = tcp_v4_check(th, skb->len,
1313 req->af.v4_req.loc_addr, 1309 ireq->loc_addr,
1314 req->af.v4_req.rmt_addr, 1310 ireq->rmt_addr,
1315 csum_partial((char *)th, skb->len, 1311 csum_partial((char *)th, skb->len,
1316 skb->csum)); 1312 skb->csum));
1317 1313
1318 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, 1314 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319 req->af.v4_req.rmt_addr, 1315 ireq->rmt_addr,
1320 req->af.v4_req.opt); 1316 ireq->opt);
1321 if (err == NET_XMIT_CN) 1317 if (err == NET_XMIT_CN)
1322 err = 0; 1318 err = 0;
1323 } 1319 }
@@ -1328,12 +1324,12 @@ out:
1328} 1324}
1329 1325
1330/* 1326/*
1331 * IPv4 open_request destructor. 1327 * IPv4 request_sock destructor.
1332 */ 1328 */
1333static void tcp_v4_or_free(struct open_request *req) 1329static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334{ 1330{
1335 if (req->af.v4_req.opt) 1331 if (inet_rsk(req)->opt)
1336 kfree(req->af.v4_req.opt); 1332 kfree(inet_rsk(req)->opt);
1337} 1333}
1338 1334
1339static inline void syn_flood_warning(struct sk_buff *skb) 1335static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1349,7 +1345,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
1349} 1345}
1350 1346
1351/* 1347/*
1352 * Save and compile IPv4 options into the open_request if needed. 1348 * Save and compile IPv4 options into the request_sock if needed.
1353 */ 1349 */
1354static inline struct ip_options *tcp_v4_save_options(struct sock *sk, 1350static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355 struct sk_buff *skb) 1351 struct sk_buff *skb)
@@ -1370,33 +1366,20 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1370 return dopt; 1366 return dopt;
1371} 1367}
1372 1368
1373/* 1369struct request_sock_ops tcp_request_sock_ops = {
1374 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1375 * One SYN_RECV socket costs about 80bytes on a 32bit machine.
1376 * It would be better to replace it with a global counter for all sockets
1377 * but then some measure against one socket starving all other sockets
1378 * would be needed.
1379 *
1380 * It was 128 by default. Experiments with real servers show, that
1381 * it is absolutely not enough even at 100conn/sec. 256 cures most
1382 * of problems. This value is adjusted to 128 for very small machines
1383 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1384 * Further increasing requires to change hash table size.
1385 */
1386int sysctl_max_syn_backlog = 256;
1387
1388struct or_calltable or_ipv4 = {
1389 .family = PF_INET, 1370 .family = PF_INET,
1371 .obj_size = sizeof(struct tcp_request_sock),
1390 .rtx_syn_ack = tcp_v4_send_synack, 1372 .rtx_syn_ack = tcp_v4_send_synack,
1391 .send_ack = tcp_v4_or_send_ack, 1373 .send_ack = tcp_v4_reqsk_send_ack,
1392 .destructor = tcp_v4_or_free, 1374 .destructor = tcp_v4_reqsk_destructor,
1393 .send_reset = tcp_v4_send_reset, 1375 .send_reset = tcp_v4_send_reset,
1394}; 1376};
1395 1377
1396int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1378int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1397{ 1379{
1380 struct inet_request_sock *ireq;
1398 struct tcp_options_received tmp_opt; 1381 struct tcp_options_received tmp_opt;
1399 struct open_request *req; 1382 struct request_sock *req;
1400 __u32 saddr = skb->nh.iph->saddr; 1383 __u32 saddr = skb->nh.iph->saddr;
1401 __u32 daddr = skb->nh.iph->daddr; 1384 __u32 daddr = skb->nh.iph->daddr;
1402 __u32 isn = TCP_SKB_CB(skb)->when; 1385 __u32 isn = TCP_SKB_CB(skb)->when;
@@ -1433,7 +1416,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1433 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) 1416 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1434 goto drop; 1417 goto drop;
1435 1418
1436 req = tcp_openreq_alloc(); 1419 req = reqsk_alloc(&tcp_request_sock_ops);
1437 if (!req) 1420 if (!req)
1438 goto drop; 1421 goto drop;
1439 1422
@@ -1461,10 +1444,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1461 1444
1462 tcp_openreq_init(req, &tmp_opt, skb); 1445 tcp_openreq_init(req, &tmp_opt, skb);
1463 1446
1464 req->af.v4_req.loc_addr = daddr; 1447 ireq = inet_rsk(req);
1465 req->af.v4_req.rmt_addr = saddr; 1448 ireq->loc_addr = daddr;
1466 req->af.v4_req.opt = tcp_v4_save_options(sk, skb); 1449 ireq->rmt_addr = saddr;
1467 req->class = &or_ipv4; 1450 ireq->opt = tcp_v4_save_options(sk, skb);
1468 if (!want_cookie) 1451 if (!want_cookie)
1469 TCP_ECN_create_request(req, skb->h.th); 1452 TCP_ECN_create_request(req, skb->h.th);
1470 1453
@@ -1523,20 +1506,20 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1523 1506
1524 isn = tcp_v4_init_sequence(sk, skb); 1507 isn = tcp_v4_init_sequence(sk, skb);
1525 } 1508 }
1526 req->snt_isn = isn; 1509 tcp_rsk(req)->snt_isn = isn;
1527 1510
1528 if (tcp_v4_send_synack(sk, req, dst)) 1511 if (tcp_v4_send_synack(sk, req, dst))
1529 goto drop_and_free; 1512 goto drop_and_free;
1530 1513
1531 if (want_cookie) { 1514 if (want_cookie) {
1532 tcp_openreq_free(req); 1515 reqsk_free(req);
1533 } else { 1516 } else {
1534 tcp_v4_synq_add(sk, req); 1517 tcp_v4_synq_add(sk, req);
1535 } 1518 }
1536 return 0; 1519 return 0;
1537 1520
1538drop_and_free: 1521drop_and_free:
1539 tcp_openreq_free(req); 1522 reqsk_free(req);
1540drop: 1523drop:
1541 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 1524 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1542 return 0; 1525 return 0;
@@ -1548,9 +1531,10 @@ drop:
1548 * now create the new socket. 1531 * now create the new socket.
1549 */ 1532 */
1550struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, 1533struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1551 struct open_request *req, 1534 struct request_sock *req,
1552 struct dst_entry *dst) 1535 struct dst_entry *dst)
1553{ 1536{
1537 struct inet_request_sock *ireq;
1554 struct inet_sock *newinet; 1538 struct inet_sock *newinet;
1555 struct tcp_sock *newtp; 1539 struct tcp_sock *newtp;
1556 struct sock *newsk; 1540 struct sock *newsk;
@@ -1570,11 +1554,12 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1570 1554
1571 newtp = tcp_sk(newsk); 1555 newtp = tcp_sk(newsk);
1572 newinet = inet_sk(newsk); 1556 newinet = inet_sk(newsk);
1573 newinet->daddr = req->af.v4_req.rmt_addr; 1557 ireq = inet_rsk(req);
1574 newinet->rcv_saddr = req->af.v4_req.loc_addr; 1558 newinet->daddr = ireq->rmt_addr;
1575 newinet->saddr = req->af.v4_req.loc_addr; 1559 newinet->rcv_saddr = ireq->loc_addr;
1576 newinet->opt = req->af.v4_req.opt; 1560 newinet->saddr = ireq->loc_addr;
1577 req->af.v4_req.opt = NULL; 1561 newinet->opt = ireq->opt;
1562 ireq->opt = NULL;
1578 newinet->mc_index = tcp_v4_iif(skb); 1563 newinet->mc_index = tcp_v4_iif(skb);
1579 newinet->mc_ttl = skb->nh.iph->ttl; 1564 newinet->mc_ttl = skb->nh.iph->ttl;
1580 newtp->ext_header_len = 0; 1565 newtp->ext_header_len = 0;
@@ -1605,9 +1590,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1605 struct iphdr *iph = skb->nh.iph; 1590 struct iphdr *iph = skb->nh.iph;
1606 struct tcp_sock *tp = tcp_sk(sk); 1591 struct tcp_sock *tp = tcp_sk(sk);
1607 struct sock *nsk; 1592 struct sock *nsk;
1608 struct open_request **prev; 1593 struct request_sock **prev;
1609 /* Find possible connection requests. */ 1594 /* Find possible connection requests. */
1610 struct open_request *req = tcp_v4_search_req(tp, &prev, th->source, 1595 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1611 iph->saddr, iph->daddr); 1596 iph->saddr, iph->daddr);
1612 if (req) 1597 if (req)
1613 return tcp_check_req(sk, skb, req, prev); 1598 return tcp_check_req(sk, skb, req, prev);
@@ -2144,13 +2129,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2144 ++st->num; 2129 ++st->num;
2145 2130
2146 if (st->state == TCP_SEQ_STATE_OPENREQ) { 2131 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2147 struct open_request *req = cur; 2132 struct request_sock *req = cur;
2148 2133
2149 tp = tcp_sk(st->syn_wait_sk); 2134 tp = tcp_sk(st->syn_wait_sk);
2150 req = req->dl_next; 2135 req = req->dl_next;
2151 while (1) { 2136 while (1) {
2152 while (req) { 2137 while (req) {
2153 if (req->class->family == st->family) { 2138 if (req->rsk_ops->family == st->family) {
2154 cur = req; 2139 cur = req;
2155 goto out; 2140 goto out;
2156 } 2141 }
@@ -2159,17 +2144,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2159 if (++st->sbucket >= TCP_SYNQ_HSIZE) 2144 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2160 break; 2145 break;
2161get_req: 2146get_req:
2162 req = tp->listen_opt->syn_table[st->sbucket]; 2147 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2163 } 2148 }
2164 sk = sk_next(st->syn_wait_sk); 2149 sk = sk_next(st->syn_wait_sk);
2165 st->state = TCP_SEQ_STATE_LISTENING; 2150 st->state = TCP_SEQ_STATE_LISTENING;
2166 read_unlock_bh(&tp->syn_wait_lock); 2151 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2167 } else { 2152 } else {
2168 tp = tcp_sk(sk); 2153 tp = tcp_sk(sk);
2169 read_lock_bh(&tp->syn_wait_lock); 2154 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2170 if (tp->listen_opt && tp->listen_opt->qlen) 2155 if (reqsk_queue_len(&tp->accept_queue))
2171 goto start_req; 2156 goto start_req;
2172 read_unlock_bh(&tp->syn_wait_lock); 2157 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2173 sk = sk_next(sk); 2158 sk = sk_next(sk);
2174 } 2159 }
2175get_sk: 2160get_sk:
@@ -2179,8 +2164,8 @@ get_sk:
2179 goto out; 2164 goto out;
2180 } 2165 }
2181 tp = tcp_sk(sk); 2166 tp = tcp_sk(sk);
2182 read_lock_bh(&tp->syn_wait_lock); 2167 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2183 if (tp->listen_opt && tp->listen_opt->qlen) { 2168 if (reqsk_queue_len(&tp->accept_queue)) {
2184start_req: 2169start_req:
2185 st->uid = sock_i_uid(sk); 2170 st->uid = sock_i_uid(sk);
2186 st->syn_wait_sk = sk; 2171 st->syn_wait_sk = sk;
@@ -2188,7 +2173,7 @@ start_req:
2188 st->sbucket = 0; 2173 st->sbucket = 0;
2189 goto get_req; 2174 goto get_req;
2190 } 2175 }
2191 read_unlock_bh(&tp->syn_wait_lock); 2176 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2192 } 2177 }
2193 if (++st->bucket < TCP_LHTABLE_SIZE) { 2178 if (++st->bucket < TCP_LHTABLE_SIZE) {
2194 sk = sk_head(&tcp_listening_hash[st->bucket]); 2179 sk = sk_head(&tcp_listening_hash[st->bucket]);
@@ -2375,7 +2360,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2375 case TCP_SEQ_STATE_OPENREQ: 2360 case TCP_SEQ_STATE_OPENREQ:
2376 if (v) { 2361 if (v) {
2377 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); 2362 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2378 read_unlock_bh(&tp->syn_wait_lock); 2363 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2379 } 2364 }
2380 case TCP_SEQ_STATE_LISTENING: 2365 case TCP_SEQ_STATE_LISTENING:
2381 if (v != SEQ_START_TOKEN) 2366 if (v != SEQ_START_TOKEN)
@@ -2451,18 +2436,19 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2451 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 2436 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2452} 2437}
2453 2438
2454static void get_openreq4(struct sock *sk, struct open_request *req, 2439static void get_openreq4(struct sock *sk, struct request_sock *req,
2455 char *tmpbuf, int i, int uid) 2440 char *tmpbuf, int i, int uid)
2456{ 2441{
2442 const struct inet_request_sock *ireq = inet_rsk(req);
2457 int ttd = req->expires - jiffies; 2443 int ttd = req->expires - jiffies;
2458 2444
2459 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" 2445 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2460 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", 2446 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2461 i, 2447 i,
2462 req->af.v4_req.loc_addr, 2448 ireq->loc_addr,
2463 ntohs(inet_sk(sk)->sport), 2449 ntohs(inet_sk(sk)->sport),
2464 req->af.v4_req.rmt_addr, 2450 ireq->rmt_addr,
2465 ntohs(req->rmt_port), 2451 ntohs(ireq->rmt_port),
2466 TCP_SYN_RECV, 2452 TCP_SYN_RECV,
2467 0, 0, /* could print option size, but that is af dependent. */ 2453 0, 0, /* could print option size, but that is af dependent. */
2468 1, /* timers active (only the expire timer) */ 2454 1, /* timers active (only the expire timer) */
@@ -2618,6 +2604,7 @@ struct proto tcp_prot = {
2618 .sysctl_rmem = sysctl_tcp_rmem, 2604 .sysctl_rmem = sysctl_tcp_rmem,
2619 .max_header = MAX_TCP_HEADER, 2605 .max_header = MAX_TCP_HEADER,
2620 .obj_size = sizeof(struct tcp_sock), 2606 .obj_size = sizeof(struct tcp_sock),
2607 .rsk_prot = &tcp_request_sock_ops,
2621}; 2608};
2622 2609
2623 2610
@@ -2660,7 +2647,6 @@ EXPORT_SYMBOL(tcp_proc_register);
2660EXPORT_SYMBOL(tcp_proc_unregister); 2647EXPORT_SYMBOL(tcp_proc_unregister);
2661#endif 2648#endif
2662EXPORT_SYMBOL(sysctl_local_port_range); 2649EXPORT_SYMBOL(sysctl_local_port_range);
2663EXPORT_SYMBOL(sysctl_max_syn_backlog);
2664EXPORT_SYMBOL(sysctl_tcp_low_latency); 2650EXPORT_SYMBOL(sysctl_tcp_low_latency);
2665EXPORT_SYMBOL(sysctl_tcp_tw_reuse); 2651EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2666 2652
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eea1a17a9ac2..b3943e7562f3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -684,7 +684,7 @@ out:
684 * Actually, we could lots of memory writes here. tp of listening 684 * Actually, we could lots of memory writes here. tp of listening
685 * socket contains all necessary default parameters. 685 * socket contains all necessary default parameters.
686 */ 686 */
687struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) 687struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
688{ 688{
689 /* allocate the newsk from the same slab of the master sock, 689 /* allocate the newsk from the same slab of the master sock,
690 * if not, at sk_free time we'll try to free it from the wrong 690 * if not, at sk_free time we'll try to free it from the wrong
@@ -692,6 +692,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); 692 struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
693 693
694 if(newsk != NULL) { 694 if(newsk != NULL) {
695 struct inet_request_sock *ireq = inet_rsk(req);
696 struct tcp_request_sock *treq = tcp_rsk(req);
695 struct tcp_sock *newtp; 697 struct tcp_sock *newtp;
696 struct sk_filter *filter; 698 struct sk_filter *filter;
697 699
@@ -703,7 +705,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
703 tcp_sk(newsk)->bind_hash = NULL; 705 tcp_sk(newsk)->bind_hash = NULL;
704 706
705 /* Clone the TCP header template */ 707 /* Clone the TCP header template */
706 inet_sk(newsk)->dport = req->rmt_port; 708 inet_sk(newsk)->dport = ireq->rmt_port;
707 709
708 sock_lock_init(newsk); 710 sock_lock_init(newsk);
709 bh_lock_sock(newsk); 711 bh_lock_sock(newsk);
@@ -739,14 +741,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
739 /* Now setup tcp_sock */ 741 /* Now setup tcp_sock */
740 newtp = tcp_sk(newsk); 742 newtp = tcp_sk(newsk);
741 newtp->pred_flags = 0; 743 newtp->pred_flags = 0;
742 newtp->rcv_nxt = req->rcv_isn + 1; 744 newtp->rcv_nxt = treq->rcv_isn + 1;
743 newtp->snd_nxt = req->snt_isn + 1; 745 newtp->snd_nxt = treq->snt_isn + 1;
744 newtp->snd_una = req->snt_isn + 1; 746 newtp->snd_una = treq->snt_isn + 1;
745 newtp->snd_sml = req->snt_isn + 1; 747 newtp->snd_sml = treq->snt_isn + 1;
746 748
747 tcp_prequeue_init(newtp); 749 tcp_prequeue_init(newtp);
748 750
749 tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); 751 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
750 752
751 newtp->retransmits = 0; 753 newtp->retransmits = 0;
752 newtp->backoff = 0; 754 newtp->backoff = 0;
@@ -775,10 +777,10 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
775 tcp_set_ca_state(newtp, TCP_CA_Open); 777 tcp_set_ca_state(newtp, TCP_CA_Open);
776 tcp_init_xmit_timers(newsk); 778 tcp_init_xmit_timers(newsk);
777 skb_queue_head_init(&newtp->out_of_order_queue); 779 skb_queue_head_init(&newtp->out_of_order_queue);
778 newtp->rcv_wup = req->rcv_isn + 1; 780 newtp->rcv_wup = treq->rcv_isn + 1;
779 newtp->write_seq = req->snt_isn + 1; 781 newtp->write_seq = treq->snt_isn + 1;
780 newtp->pushed_seq = newtp->write_seq; 782 newtp->pushed_seq = newtp->write_seq;
781 newtp->copied_seq = req->rcv_isn + 1; 783 newtp->copied_seq = treq->rcv_isn + 1;
782 784
783 newtp->rx_opt.saw_tstamp = 0; 785 newtp->rx_opt.saw_tstamp = 0;
784 786
@@ -788,10 +790,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
788 newtp->probes_out = 0; 790 newtp->probes_out = 0;
789 newtp->rx_opt.num_sacks = 0; 791 newtp->rx_opt.num_sacks = 0;
790 newtp->urg_data = 0; 792 newtp->urg_data = 0;
791 newtp->listen_opt = NULL; 793 /* Deinitialize accept_queue to trap illegal accesses. */
792 newtp->accept_queue = newtp->accept_queue_tail = NULL; 794 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
793 /* Deinitialize syn_wait_lock to trap illegal accesses. */
794 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
795 795
796 /* Back to base struct sock members. */ 796 /* Back to base struct sock members. */
797 newsk->sk_err = 0; 797 newsk->sk_err = 0;
@@ -808,18 +808,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
808 newsk->sk_socket = NULL; 808 newsk->sk_socket = NULL;
809 newsk->sk_sleep = NULL; 809 newsk->sk_sleep = NULL;
810 810
811 newtp->rx_opt.tstamp_ok = req->tstamp_ok; 811 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
812 if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) { 812 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
813 if (sysctl_tcp_fack) 813 if (sysctl_tcp_fack)
814 newtp->rx_opt.sack_ok |= 2; 814 newtp->rx_opt.sack_ok |= 2;
815 } 815 }
816 newtp->window_clamp = req->window_clamp; 816 newtp->window_clamp = req->window_clamp;
817 newtp->rcv_ssthresh = req->rcv_wnd; 817 newtp->rcv_ssthresh = req->rcv_wnd;
818 newtp->rcv_wnd = req->rcv_wnd; 818 newtp->rcv_wnd = req->rcv_wnd;
819 newtp->rx_opt.wscale_ok = req->wscale_ok; 819 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
820 if (newtp->rx_opt.wscale_ok) { 820 if (newtp->rx_opt.wscale_ok) {
821 newtp->rx_opt.snd_wscale = req->snd_wscale; 821 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
822 newtp->rx_opt.rcv_wscale = req->rcv_wscale; 822 newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
823 } else { 823 } else {
824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; 824 newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
825 newtp->window_clamp = min(newtp->window_clamp, 65535U); 825 newtp->window_clamp = min(newtp->window_clamp, 65535U);
@@ -851,12 +851,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
851 851
852/* 852/*
853 * Process an incoming packet for SYN_RECV sockets represented 853 * Process an incoming packet for SYN_RECV sockets represented
854 * as an open_request. 854 * as a request_sock.
855 */ 855 */
856 856
857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, 857struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
858 struct open_request *req, 858 struct request_sock *req,
859 struct open_request **prev) 859 struct request_sock **prev)
860{ 860{
861 struct tcphdr *th = skb->h.th; 861 struct tcphdr *th = skb->h.th;
862 struct tcp_sock *tp = tcp_sk(sk); 862 struct tcp_sock *tp = tcp_sk(sk);
@@ -881,7 +881,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
881 } 881 }
882 882
883 /* Check for pure retransmitted SYN. */ 883 /* Check for pure retransmitted SYN. */
884 if (TCP_SKB_CB(skb)->seq == req->rcv_isn && 884 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
885 flg == TCP_FLAG_SYN && 885 flg == TCP_FLAG_SYN &&
886 !paws_reject) { 886 !paws_reject) {
887 /* 887 /*
@@ -901,7 +901,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
901 * Enforce "SYN-ACK" according to figure 8, figure 6 901 * Enforce "SYN-ACK" according to figure 8, figure 6
902 * of RFC793, fixed by RFC1122. 902 * of RFC793, fixed by RFC1122.
903 */ 903 */
904 req->class->rtx_syn_ack(sk, req, NULL); 904 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
905 return NULL; 905 return NULL;
906 } 906 }
907 907
@@ -959,7 +959,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
959 * Invalid ACK: reset will be sent by listening socket 959 * Invalid ACK: reset will be sent by listening socket
960 */ 960 */
961 if ((flg & TCP_FLAG_ACK) && 961 if ((flg & TCP_FLAG_ACK) &&
962 (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)) 962 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
963 return sk; 963 return sk;
964 964
965 /* Also, it would be not so bad idea to check rcv_tsecr, which 965 /* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -970,10 +970,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
970 /* RFC793: "first check sequence number". */ 970 /* RFC793: "first check sequence number". */
971 971
972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 972 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
973 req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { 973 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
974 /* Out of window: send ACK and drop. */ 974 /* Out of window: send ACK and drop. */
975 if (!(flg & TCP_FLAG_RST)) 975 if (!(flg & TCP_FLAG_RST))
976 req->class->send_ack(skb, req); 976 req->rsk_ops->send_ack(skb, req);
977 if (paws_reject) 977 if (paws_reject)
978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 978 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
979 return NULL; 979 return NULL;
@@ -981,12 +981,12 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
981 981
982 /* In sequence, PAWS is OK. */ 982 /* In sequence, PAWS is OK. */
983 983
984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) 984 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
985 req->ts_recent = tmp_opt.rcv_tsval; 985 req->ts_recent = tmp_opt.rcv_tsval;
986 986
987 if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { 987 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
988 /* Truncate SYN, it is out of window starting 988 /* Truncate SYN, it is out of window starting
989 at req->rcv_isn+1. */ 989 at tcp_rsk(req)->rcv_isn + 1. */
990 flg &= ~TCP_FLAG_SYN; 990 flg &= ~TCP_FLAG_SYN;
991 } 991 }
992 992
@@ -1003,8 +1003,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1003 return NULL; 1003 return NULL;
1004 1004
1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 1005 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { 1006 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
1007 req->acked = 1; 1007 inet_rsk(req)->acked = 1;
1008 return NULL; 1008 return NULL;
1009 } 1009 }
1010 1010
@@ -1026,14 +1026,14 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
1026 1026
1027 listen_overflow: 1027 listen_overflow:
1028 if (!sysctl_tcp_abort_on_overflow) { 1028 if (!sysctl_tcp_abort_on_overflow) {
1029 req->acked = 1; 1029 inet_rsk(req)->acked = 1;
1030 return NULL; 1030 return NULL;
1031 } 1031 }
1032 1032
1033 embryonic_reset: 1033 embryonic_reset:
1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); 1034 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
1035 if (!(flg & TCP_FLAG_RST)) 1035 if (!(flg & TCP_FLAG_RST))
1036 req->class->send_reset(skb); 1036 req->rsk_ops->send_reset(skb);
1037 1037
1038 tcp_synq_drop(sk, req, prev); 1038 tcp_synq_drop(sk, req, prev);
1039 return NULL; 1039 return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa24e7ae1f40..f17c6577e337 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1356,8 +1356,9 @@ int tcp_send_synack(struct sock *sk)
1356 * Prepare a SYN-ACK. 1356 * Prepare a SYN-ACK.
1357 */ 1357 */
1358struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, 1358struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1359 struct open_request *req) 1359 struct request_sock *req)
1360{ 1360{
1361 struct inet_request_sock *ireq = inet_rsk(req);
1361 struct tcp_sock *tp = tcp_sk(sk); 1362 struct tcp_sock *tp = tcp_sk(sk);
1362 struct tcphdr *th; 1363 struct tcphdr *th;
1363 int tcp_header_size; 1364 int tcp_header_size;
@@ -1373,47 +1374,47 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1373 skb->dst = dst_clone(dst); 1374 skb->dst = dst_clone(dst);
1374 1375
1375 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + 1376 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1376 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + 1377 (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1377 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + 1378 (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1378 /* SACK_PERM is in the place of NOP NOP of TS */ 1379 /* SACK_PERM is in the place of NOP NOP of TS */
1379 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); 1380 ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1380 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); 1381 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1381 1382
1382 memset(th, 0, sizeof(struct tcphdr)); 1383 memset(th, 0, sizeof(struct tcphdr));
1383 th->syn = 1; 1384 th->syn = 1;
1384 th->ack = 1; 1385 th->ack = 1;
1385 if (dst->dev->features&NETIF_F_TSO) 1386 if (dst->dev->features&NETIF_F_TSO)
1386 req->ecn_ok = 0; 1387 ireq->ecn_ok = 0;
1387 TCP_ECN_make_synack(req, th); 1388 TCP_ECN_make_synack(req, th);
1388 th->source = inet_sk(sk)->sport; 1389 th->source = inet_sk(sk)->sport;
1389 th->dest = req->rmt_port; 1390 th->dest = ireq->rmt_port;
1390 TCP_SKB_CB(skb)->seq = req->snt_isn; 1391 TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
1391 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; 1392 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1392 TCP_SKB_CB(skb)->sacked = 0; 1393 TCP_SKB_CB(skb)->sacked = 0;
1393 skb_shinfo(skb)->tso_segs = 1; 1394 skb_shinfo(skb)->tso_segs = 1;
1394 skb_shinfo(skb)->tso_size = 0; 1395 skb_shinfo(skb)->tso_size = 0;
1395 th->seq = htonl(TCP_SKB_CB(skb)->seq); 1396 th->seq = htonl(TCP_SKB_CB(skb)->seq);
1396 th->ack_seq = htonl(req->rcv_isn + 1); 1397 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
1397 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 1398 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1398 __u8 rcv_wscale; 1399 __u8 rcv_wscale;
1399 /* Set this up on the first call only */ 1400 /* Set this up on the first call only */
1400 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 1401 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
1401 /* tcp_full_space because it is guaranteed to be the first packet */ 1402 /* tcp_full_space because it is guaranteed to be the first packet */
1402 tcp_select_initial_window(tcp_full_space(sk), 1403 tcp_select_initial_window(tcp_full_space(sk),
1403 dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 1404 dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1404 &req->rcv_wnd, 1405 &req->rcv_wnd,
1405 &req->window_clamp, 1406 &req->window_clamp,
1406 req->wscale_ok, 1407 ireq->wscale_ok,
1407 &rcv_wscale); 1408 &rcv_wscale);
1408 req->rcv_wscale = rcv_wscale; 1409 ireq->rcv_wscale = rcv_wscale;
1409 } 1410 }
1410 1411
1411 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 1412 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1412 th->window = htons(req->rcv_wnd); 1413 th->window = htons(req->rcv_wnd);
1413 1414
1414 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1415 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1415 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, 1416 tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
1416 req->sack_ok, req->wscale_ok, req->rcv_wscale, 1417 ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
1417 TCP_SKB_CB(skb)->when, 1418 TCP_SKB_CB(skb)->when,
1418 req->ts_recent); 1419 req->ts_recent);
1419 1420
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 799ebe061e2c..b127b4498565 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -464,11 +464,11 @@ out_unlock:
464static void tcp_synack_timer(struct sock *sk) 464static void tcp_synack_timer(struct sock *sk)
465{ 465{
466 struct tcp_sock *tp = tcp_sk(sk); 466 struct tcp_sock *tp = tcp_sk(sk);
467 struct tcp_listen_opt *lopt = tp->listen_opt; 467 struct listen_sock *lopt = tp->accept_queue.listen_opt;
468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; 468 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469 int thresh = max_retries; 469 int thresh = max_retries;
470 unsigned long now = jiffies; 470 unsigned long now = jiffies;
471 struct open_request **reqp, *req; 471 struct request_sock **reqp, *req;
472 int i, budget; 472 int i, budget;
473 473
474 if (lopt == NULL || lopt->qlen == 0) 474 if (lopt == NULL || lopt->qlen == 0)
@@ -513,8 +513,8 @@ static void tcp_synack_timer(struct sock *sk)
513 while ((req = *reqp) != NULL) { 513 while ((req = *reqp) != NULL) {
514 if (time_after_eq(now, req->expires)) { 514 if (time_after_eq(now, req->expires)) {
515 if ((req->retrans < thresh || 515 if ((req->retrans < thresh ||
516 (req->acked && req->retrans < max_retries)) 516 (inet_rsk(req)->acked && req->retrans < max_retries))
517 && !req->class->rtx_syn_ack(sk, req, NULL)) { 517 && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
518 unsigned long timeo; 518 unsigned long timeo;
519 519
520 if (req->retrans++ == 0) 520 if (req->retrans++ == 0)
@@ -527,13 +527,9 @@ static void tcp_synack_timer(struct sock *sk)
527 } 527 }
528 528
529 /* Drop this request */ 529 /* Drop this request */
530 write_lock(&tp->syn_wait_lock); 530 tcp_synq_unlink(tp, req, reqp);
531 *reqp = req->dl_next; 531 reqsk_queue_removed(&tp->accept_queue, req);
532 write_unlock(&tp->syn_wait_lock); 532 reqsk_free(req);
533 lopt->qlen--;
534 if (req->retrans == 0)
535 lopt->qlen_young--;
536 tcp_openreq_free(req);
537 continue; 533 continue;
538 } 534 }
539 reqp = &req->dl_next; 535 reqp = &req->dl_next;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392ae5769..66620a95942a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
33 struct dst_entry *dst = skb->dst; 33 struct dst_entry *dst = skb->dst;
34 struct xfrm_state *x = dst->xfrm; 34 struct xfrm_state *x = dst->xfrm;
35 struct iphdr *iph, *top_iph; 35 struct iphdr *iph, *top_iph;
36 int flags;
36 37
37 iph = skb->nh.iph; 38 iph = skb->nh.iph;
38 skb->h.ipiph = iph; 39 skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
51 52
52 /* DS disclosed */ 53 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); 54 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54 if (x->props.flags & XFRM_STATE_NOECN) 55
56 flags = x->props.flags;
57 if (flags & XFRM_STATE_NOECN)
55 IP_ECN_clear(top_iph); 58 IP_ECN_clear(top_iph);
56 59
57 top_iph->frag_off = iph->frag_off & htons(IP_DF); 60 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
61 0 : (iph->frag_off & htons(IP_DF));
58 if (!top_iph->frag_off) 62 if (!top_iph->frag_off)
59 __ip_select_ident(top_iph, dst, 0); 63 __ip_select_ident(top_iph, dst, 0);
60 64
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e83853f..050611d7a967 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
7 * 7 *
8 */ 8 */
9 9
10#include <net/ip.h>
10#include <net/xfrm.h> 11#include <net/xfrm.h>
11#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
12#include <linux/ipsec.h> 13#include <linux/ipsec.h>
13 14
14static struct xfrm_state_afinfo xfrm4_state_afinfo; 15static struct xfrm_state_afinfo xfrm4_state_afinfo;
15 16
17static int xfrm4_init_flags(struct xfrm_state *x)
18{
19 if (ipv4_config.no_pmtu_disc)
20 x->props.flags |= XFRM_STATE_NOPMTUDISC;
21 return 0;
22}
23
16static void 24static void
17__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 25__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
18 struct xfrm_tmpl *tmpl, 26 struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
109static struct xfrm_state_afinfo xfrm4_state_afinfo = { 117static struct xfrm_state_afinfo xfrm4_state_afinfo = {
110 .family = AF_INET, 118 .family = AF_INET,
111 .lock = RW_LOCK_UNLOCKED, 119 .lock = RW_LOCK_UNLOCKED,
120 .init_flags = xfrm4_init_flags,
112 .init_tempsel = __xfrm4_init_tempsel, 121 .init_tempsel = __xfrm4_init_tempsel,
113 .state_lookup = __xfrm4_state_lookup, 122 .state_lookup = __xfrm4_state_lookup,
114 .find_acq = __xfrm4_find_acq, 123 .find_acq = __xfrm4_find_acq,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f585f6..e1fe360ed27a 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -84,7 +84,7 @@ static void ipip_err(struct sk_buff *skb, u32 info)
84 handler->err_handler(skb, &arg); 84 handler->err_handler(skb, &arg);
85} 85}
86 86
87static int ipip_init_state(struct xfrm_state *x, void *args) 87static int ipip_init_state(struct xfrm_state *x)
88{ 88{
89 if (!x->props.mode) 89 if (!x->props.mode)
90 return -EINVAL; 90 return -EINVAL;