aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-21 18:45:19 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-21 18:45:19 -0400
commit4a4f8fdba6f5a34ca90f426021e17491a30202da (patch)
tree6eb8be6c9542845321252b1d64394a2c2ea84dd0
parent2c6e5a839f92591a4bc6cac4a575d42151645af3 (diff)
parent90f66914c89b0be63548d4387d1211280aa7bc8e (diff)
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
-rw-r--r--include/linux/netfilter_ipv4.h6
-rw-r--r--include/linux/netfilter_ipv4/ip_conntrack_core.h3
-rw-r--r--include/linux/netfilter_ipv4/ip_nat.h3
-rw-r--r--include/linux/netfilter_ipv4/listhelp.h1
-rw-r--r--include/linux/netfilter_ipv4/lockhelp.h129
-rw-r--r--include/linux/netlink.h2
-rw-r--r--include/linux/skbuff.h13
-rw-r--r--include/net/ip6_fib.h9
-rw-r--r--include/net/ip6_route.h9
-rw-r--r--net/bridge/br_forward.c3
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_netfilter.c38
-rw-r--r--net/core/netfilter.c138
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/ipv4/Kconfig26
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c12
-rw-r--r--net/ipv4/fib_trie.c2454
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c11
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c1
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c107
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c23
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c27
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c22
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c32
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c49
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c10
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c13
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c15
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c17
-rw-r--r--net/ipv4/netfilter/ipt_helper.c4
-rw-r--r--net/ipv6/addrconf.c14
-rw-r--r--net/ipv6/anycast.c4
-rw-r--r--net/ipv6/ip6_fib.c19
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/ipv6/ipv6_sockglue.c5
-rw-r--r--net/ipv6/mcast.c68
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c54
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c6
-rw-r--r--net/ipv6/route.c78
53 files changed, 2823 insertions, 676 deletions
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 9e5750079e09..3ebc36afae1a 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -75,12 +75,6 @@ enum nf_ip_hook_priorities {
75#define SO_ORIGINAL_DST 80 75#define SO_ORIGINAL_DST 80
76 76
77#ifdef __KERNEL__ 77#ifdef __KERNEL__
78#ifdef CONFIG_NETFILTER_DEBUG
79void nf_debug_ip_local_deliver(struct sk_buff *skb);
80void nf_debug_ip_loopback_xmit(struct sk_buff *newskb);
81void nf_debug_ip_finish_output2(struct sk_buff *skb);
82#endif /*CONFIG_NETFILTER_DEBUG*/
83
84extern int ip_route_me_harder(struct sk_buff **pskb); 78extern int ip_route_me_harder(struct sk_buff **pskb);
85 79
86/* Call this before modifying an existing IP packet: ensures it is 80/* Call this before modifying an existing IP packet: ensures it is
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h
index d84be02cb4fc..694aec9b4784 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_core.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h
@@ -1,7 +1,6 @@
1#ifndef _IP_CONNTRACK_CORE_H 1#ifndef _IP_CONNTRACK_CORE_H
2#define _IP_CONNTRACK_CORE_H 2#define _IP_CONNTRACK_CORE_H
3#include <linux/netfilter.h> 3#include <linux/netfilter.h>
4#include <linux/netfilter_ipv4/lockhelp.h>
5 4
6/* This header is used to share core functionality between the 5/* This header is used to share core functionality between the
7 standalone connection tracking module, and the compatibility layer's use 6 standalone connection tracking module, and the compatibility layer's use
@@ -47,6 +46,6 @@ static inline int ip_conntrack_confirm(struct sk_buff **pskb)
47 46
48extern struct list_head *ip_conntrack_hash; 47extern struct list_head *ip_conntrack_hash;
49extern struct list_head ip_conntrack_expect_list; 48extern struct list_head ip_conntrack_expect_list;
50DECLARE_RWLOCK_EXTERN(ip_conntrack_lock); 49extern rwlock_t ip_conntrack_lock;
51#endif /* _IP_CONNTRACK_CORE_H */ 50#endif /* _IP_CONNTRACK_CORE_H */
52 51
diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index 2b72b86176f0..e201ec6e9905 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -50,10 +50,9 @@ struct ip_nat_multi_range_compat
50 50
51#ifdef __KERNEL__ 51#ifdef __KERNEL__
52#include <linux/list.h> 52#include <linux/list.h>
53#include <linux/netfilter_ipv4/lockhelp.h>
54 53
55/* Protects NAT hash tables, and NAT-private part of conntracks. */ 54/* Protects NAT hash tables, and NAT-private part of conntracks. */
56DECLARE_RWLOCK_EXTERN(ip_nat_lock); 55extern rwlock_t ip_nat_lock;
57 56
58/* The structure embedded in the conntrack structure. */ 57/* The structure embedded in the conntrack structure. */
59struct ip_nat_info 58struct ip_nat_info
diff --git a/include/linux/netfilter_ipv4/listhelp.h b/include/linux/netfilter_ipv4/listhelp.h
index f2ae7c5e57bb..360429f48737 100644
--- a/include/linux/netfilter_ipv4/listhelp.h
+++ b/include/linux/netfilter_ipv4/listhelp.h
@@ -2,7 +2,6 @@
2#define _LISTHELP_H 2#define _LISTHELP_H
3#include <linux/config.h> 3#include <linux/config.h>
4#include <linux/list.h> 4#include <linux/list.h>
5#include <linux/netfilter_ipv4/lockhelp.h>
6 5
7/* Header to do more comprehensive job than linux/list.h; assume list 6/* Header to do more comprehensive job than linux/list.h; assume list
8 is first entry in structure. */ 7 is first entry in structure. */
diff --git a/include/linux/netfilter_ipv4/lockhelp.h b/include/linux/netfilter_ipv4/lockhelp.h
deleted file mode 100644
index a3288633ab46..000000000000
--- a/include/linux/netfilter_ipv4/lockhelp.h
+++ /dev/null
@@ -1,129 +0,0 @@
1#ifndef _LOCKHELP_H
2#define _LOCKHELP_H
3#include <linux/config.h>
4
5#include <linux/spinlock.h>
6#include <asm/atomic.h>
7#include <linux/interrupt.h>
8#include <linux/smp.h>
9
10/* Header to do help in lock debugging. */
11
12#ifdef CONFIG_NETFILTER_DEBUG
13struct spinlock_debug
14{
15 spinlock_t l;
16 atomic_t locked_by;
17};
18
19struct rwlock_debug
20{
21 rwlock_t l;
22 long read_locked_map;
23 long write_locked_map;
24};
25
26#define DECLARE_LOCK(l) \
27struct spinlock_debug l = { SPIN_LOCK_UNLOCKED, ATOMIC_INIT(-1) }
28#define DECLARE_LOCK_EXTERN(l) \
29extern struct spinlock_debug l
30#define DECLARE_RWLOCK(l) \
31struct rwlock_debug l = { RW_LOCK_UNLOCKED, 0, 0 }
32#define DECLARE_RWLOCK_EXTERN(l) \
33extern struct rwlock_debug l
34
35#define MUST_BE_LOCKED(l) \
36do { if (atomic_read(&(l)->locked_by) != smp_processor_id()) \
37 printk("ASSERT %s:%u %s unlocked\n", __FILE__, __LINE__, #l); \
38} while(0)
39
40#define MUST_BE_UNLOCKED(l) \
41do { if (atomic_read(&(l)->locked_by) == smp_processor_id()) \
42 printk("ASSERT %s:%u %s locked\n", __FILE__, __LINE__, #l); \
43} while(0)
44
45/* Write locked OK as well. */
46#define MUST_BE_READ_LOCKED(l) \
47do { if (!((l)->read_locked_map & (1UL << smp_processor_id())) \
48 && !((l)->write_locked_map & (1UL << smp_processor_id()))) \
49 printk("ASSERT %s:%u %s not readlocked\n", __FILE__, __LINE__, #l); \
50} while(0)
51
52#define MUST_BE_WRITE_LOCKED(l) \
53do { if (!((l)->write_locked_map & (1UL << smp_processor_id()))) \
54 printk("ASSERT %s:%u %s not writelocked\n", __FILE__, __LINE__, #l); \
55} while(0)
56
57#define MUST_BE_READ_WRITE_UNLOCKED(l) \
58do { if ((l)->read_locked_map & (1UL << smp_processor_id())) \
59 printk("ASSERT %s:%u %s readlocked\n", __FILE__, __LINE__, #l); \
60 else if ((l)->write_locked_map & (1UL << smp_processor_id())) \
61 printk("ASSERT %s:%u %s writelocked\n", __FILE__, __LINE__, #l); \
62} while(0)
63
64#define LOCK_BH(lk) \
65do { \
66 MUST_BE_UNLOCKED(lk); \
67 spin_lock_bh(&(lk)->l); \
68 atomic_set(&(lk)->locked_by, smp_processor_id()); \
69} while(0)
70
71#define UNLOCK_BH(lk) \
72do { \
73 MUST_BE_LOCKED(lk); \
74 atomic_set(&(lk)->locked_by, -1); \
75 spin_unlock_bh(&(lk)->l); \
76} while(0)
77
78#define READ_LOCK(lk) \
79do { \
80 MUST_BE_READ_WRITE_UNLOCKED(lk); \
81 read_lock_bh(&(lk)->l); \
82 set_bit(smp_processor_id(), &(lk)->read_locked_map); \
83} while(0)
84
85#define WRITE_LOCK(lk) \
86do { \
87 MUST_BE_READ_WRITE_UNLOCKED(lk); \
88 write_lock_bh(&(lk)->l); \
89 set_bit(smp_processor_id(), &(lk)->write_locked_map); \
90} while(0)
91
92#define READ_UNLOCK(lk) \
93do { \
94 if (!((lk)->read_locked_map & (1UL << smp_processor_id()))) \
95 printk("ASSERT: %s:%u %s not readlocked\n", \
96 __FILE__, __LINE__, #lk); \
97 clear_bit(smp_processor_id(), &(lk)->read_locked_map); \
98 read_unlock_bh(&(lk)->l); \
99} while(0)
100
101#define WRITE_UNLOCK(lk) \
102do { \
103 MUST_BE_WRITE_LOCKED(lk); \
104 clear_bit(smp_processor_id(), &(lk)->write_locked_map); \
105 write_unlock_bh(&(lk)->l); \
106} while(0)
107
108#else
109#define DECLARE_LOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
110#define DECLARE_LOCK_EXTERN(l) extern spinlock_t l
111#define DECLARE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
112#define DECLARE_RWLOCK_EXTERN(l) extern rwlock_t l
113
114#define MUST_BE_LOCKED(l)
115#define MUST_BE_UNLOCKED(l)
116#define MUST_BE_READ_LOCKED(l)
117#define MUST_BE_WRITE_LOCKED(l)
118#define MUST_BE_READ_WRITE_UNLOCKED(l)
119
120#define LOCK_BH(l) spin_lock_bh(l)
121#define UNLOCK_BH(l) spin_unlock_bh(l)
122
123#define READ_LOCK(l) read_lock_bh(l)
124#define WRITE_LOCK(l) write_lock_bh(l)
125#define READ_UNLOCK(l) read_unlock_bh(l)
126#define WRITE_UNLOCK(l) write_unlock_bh(l)
127#endif /*CONFIG_NETFILTER_DEBUG*/
128
129#endif /* _LOCKHELP_H */
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 561d4dc75836..3029cad63a01 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -147,7 +147,7 @@ struct netlink_callback
147 int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); 147 int (*dump)(struct sk_buff * skb, struct netlink_callback *cb);
148 int (*done)(struct netlink_callback *cb); 148 int (*done)(struct netlink_callback *cb);
149 int family; 149 int family;
150 long args[4]; 150 long args[5];
151}; 151};
152 152
153struct netlink_notify 153struct netlink_notify
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cc04f5cd2286..d7c839a21842 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -193,7 +193,6 @@ struct skb_shared_info {
193 * @nfcache: Cache info 193 * @nfcache: Cache info
194 * @nfct: Associated connection, if any 194 * @nfct: Associated connection, if any
195 * @nfctinfo: Relationship of this skb to the connection 195 * @nfctinfo: Relationship of this skb to the connection
196 * @nf_debug: Netfilter debugging
197 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c 196 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
198 * @private: Data which is private to the HIPPI implementation 197 * @private: Data which is private to the HIPPI implementation
199 * @tc_index: Traffic control index 198 * @tc_index: Traffic control index
@@ -264,9 +263,6 @@ struct sk_buff {
264 __u32 nfcache; 263 __u32 nfcache;
265 __u32 nfctinfo; 264 __u32 nfctinfo;
266 struct nf_conntrack *nfct; 265 struct nf_conntrack *nfct;
267#ifdef CONFIG_NETFILTER_DEBUG
268 unsigned int nf_debug;
269#endif
270#ifdef CONFIG_BRIDGE_NETFILTER 266#ifdef CONFIG_BRIDGE_NETFILTER
271 struct nf_bridge_info *nf_bridge; 267 struct nf_bridge_info *nf_bridge;
272#endif 268#endif
@@ -1219,15 +1215,6 @@ static inline void nf_reset(struct sk_buff *skb)
1219{ 1215{
1220 nf_conntrack_put(skb->nfct); 1216 nf_conntrack_put(skb->nfct);
1221 skb->nfct = NULL; 1217 skb->nfct = NULL;
1222#ifdef CONFIG_NETFILTER_DEBUG
1223 skb->nf_debug = 0;
1224#endif
1225}
1226static inline void nf_reset_debug(struct sk_buff *skb)
1227{
1228#ifdef CONFIG_NETFILTER_DEBUG
1229 skb->nf_debug = 0;
1230#endif
1231} 1218}
1232 1219
1233#ifdef CONFIG_BRIDGE_NETFILTER 1220#ifdef CONFIG_BRIDGE_NETFILTER
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 319904518194..a66e9de16a6c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -167,14 +167,17 @@ extern int fib6_walk_continue(struct fib6_walker_t *w);
167extern int fib6_add(struct fib6_node *root, 167extern int fib6_add(struct fib6_node *root,
168 struct rt6_info *rt, 168 struct rt6_info *rt,
169 struct nlmsghdr *nlh, 169 struct nlmsghdr *nlh,
170 void *rtattr); 170 void *rtattr,
171 struct netlink_skb_parms *req);
171 172
172extern int fib6_del(struct rt6_info *rt, 173extern int fib6_del(struct rt6_info *rt,
173 struct nlmsghdr *nlh, 174 struct nlmsghdr *nlh,
174 void *rtattr); 175 void *rtattr,
176 struct netlink_skb_parms *req);
175 177
176extern void inet6_rt_notify(int event, struct rt6_info *rt, 178extern void inet6_rt_notify(int event, struct rt6_info *rt,
177 struct nlmsghdr *nlh); 179 struct nlmsghdr *nlh,
180 struct netlink_skb_parms *req);
178 181
179extern void fib6_run_gc(unsigned long dummy); 182extern void fib6_run_gc(unsigned long dummy);
180 183
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d5d1dd10cdb8..f920706d526b 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -41,13 +41,16 @@ extern int ipv6_route_ioctl(unsigned int cmd, void __user *arg);
41 41
42extern int ip6_route_add(struct in6_rtmsg *rtmsg, 42extern int ip6_route_add(struct in6_rtmsg *rtmsg,
43 struct nlmsghdr *, 43 struct nlmsghdr *,
44 void *rtattr); 44 void *rtattr,
45 struct netlink_skb_parms *req);
45extern int ip6_ins_rt(struct rt6_info *, 46extern int ip6_ins_rt(struct rt6_info *,
46 struct nlmsghdr *, 47 struct nlmsghdr *,
47 void *rtattr); 48 void *rtattr,
49 struct netlink_skb_parms *req);
48extern int ip6_del_rt(struct rt6_info *, 50extern int ip6_del_rt(struct rt6_info *,
49 struct nlmsghdr *, 51 struct nlmsghdr *,
50 void *rtattr); 52 void *rtattr,
53 struct netlink_skb_parms *req);
51 54
52extern int ip6_rt_addr_add(struct in6_addr *addr, 55extern int ip6_rt_addr_add(struct in6_addr *addr,
53 struct net_device *dev, 56 struct net_device *dev,
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f2095f96e..069253f830c1 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
57static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) 57static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
58{ 58{
59 skb->dev = to->dev; 59 skb->dev = to->dev;
60#ifdef CONFIG_NETFILTER_DEBUG
61 skb->nf_debug = 0;
62#endif
63 NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, 60 NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
64 br_forward_finish); 61 br_forward_finish);
65} 62}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8f5f2e730992..9a45e6279c57 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
23 23
24static int br_pass_frame_up_finish(struct sk_buff *skb) 24static int br_pass_frame_up_finish(struct sk_buff *skb)
25{ 25{
26#ifdef CONFIG_NETFILTER_DEBUG
27 skb->nf_debug = 0;
28#endif
29 netif_receive_skb(skb); 26 netif_receive_skb(skb);
30
31 return 0; 27 return 0;
32} 28}
33 29
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3ad2648..03ae4edddac3 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
102{ 102{
103 struct nf_bridge_info *nf_bridge = skb->nf_bridge; 103 struct nf_bridge_info *nf_bridge = skb->nf_bridge;
104 104
105#ifdef CONFIG_NETFILTER_DEBUG
106 skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
107#endif
108
109 if (nf_bridge->mask & BRNF_PKT_TYPE) { 105 if (nf_bridge->mask & BRNF_PKT_TYPE) {
110 skb->pkt_type = PACKET_OTHERHOST; 106 skb->pkt_type = PACKET_OTHERHOST;
111 nf_bridge->mask ^= BRNF_PKT_TYPE; 107 nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
182 * --Bart, 20021007 (updated) */ 178 * --Bart, 20021007 (updated) */
183static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) 179static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
184{ 180{
185#ifdef CONFIG_NETFILTER_DEBUG
186 skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
187#endif
188
189 if (skb->pkt_type == PACKET_OTHERHOST) { 181 if (skb->pkt_type == PACKET_OTHERHOST) {
190 skb->pkt_type = PACKET_HOST; 182 skb->pkt_type = PACKET_HOST;
191 skb->nf_bridge->mask |= BRNF_PKT_TYPE; 183 skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
207 struct iphdr *iph = skb->nh.iph; 199 struct iphdr *iph = skb->nh.iph;
208 struct nf_bridge_info *nf_bridge = skb->nf_bridge; 200 struct nf_bridge_info *nf_bridge = skb->nf_bridge;
209 201
210#ifdef CONFIG_NETFILTER_DEBUG
211 skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
212#endif
213
214 if (nf_bridge->mask & BRNF_PKT_TYPE) { 202 if (nf_bridge->mask & BRNF_PKT_TYPE) {
215 skb->pkt_type = PACKET_OTHERHOST; 203 skb->pkt_type = PACKET_OTHERHOST;
216 nf_bridge->mask ^= BRNF_PKT_TYPE; 204 nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
382 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) 370 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
383 goto inhdr_error; 371 goto inhdr_error;
384 372
385#ifdef CONFIG_NETFILTER_DEBUG
386 skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
387#endif
388 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) 373 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
389 return NF_DROP; 374 return NF_DROP;
390 setup_pre_routing(skb); 375 setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
468 skb->ip_summed = CHECKSUM_NONE; 453 skb->ip_summed = CHECKSUM_NONE;
469 } 454 }
470 455
471#ifdef CONFIG_NETFILTER_DEBUG
472 skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
473#endif
474 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) 456 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
475 return NF_DROP; 457 return NF_DROP;
476 setup_pre_routing(skb); 458 setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
517 struct net_device *in; 499 struct net_device *in;
518 struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); 500 struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
519 501
520#ifdef CONFIG_NETFILTER_DEBUG
521 skb->nf_debug ^= (1 << NF_BR_FORWARD);
522#endif
523
524 if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) { 502 if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
525 in = nf_bridge->physindev; 503 in = nf_bridge->physindev;
526 if (nf_bridge->mask & BRNF_PKT_TYPE) { 504 if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
566 (*pskb)->nh.raw += VLAN_HLEN; 544 (*pskb)->nh.raw += VLAN_HLEN;
567 } 545 }
568 546
569#ifdef CONFIG_NETFILTER_DEBUG
570 skb->nf_debug ^= (1 << NF_BR_FORWARD);
571#endif
572 nf_bridge = skb->nf_bridge; 547 nf_bridge = skb->nf_bridge;
573 if (skb->pkt_type == PACKET_OTHERHOST) { 548 if (skb->pkt_type == PACKET_OTHERHOST) {
574 skb->pkt_type = PACKET_HOST; 549 skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
605 (*pskb)->nh.raw += VLAN_HLEN; 580 (*pskb)->nh.raw += VLAN_HLEN;
606 } 581 }
607 582
608#ifdef CONFIG_NETFILTER_DEBUG
609 skb->nf_debug ^= (1 << NF_BR_FORWARD);
610#endif
611
612 if (skb->nh.arph->ar_pln != 4) { 583 if (skb->nh.arph->ar_pln != 4) {
613 if (IS_VLAN_ARP) { 584 if (IS_VLAN_ARP) {
614 skb_push(*pskb, VLAN_HLEN); 585 skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
627/* PF_BRIDGE/LOCAL_OUT ***********************************************/ 598/* PF_BRIDGE/LOCAL_OUT ***********************************************/
628static int br_nf_local_out_finish(struct sk_buff *skb) 599static int br_nf_local_out_finish(struct sk_buff *skb)
629{ 600{
630#ifdef CONFIG_NETFILTER_DEBUG
631 skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
632#endif
633 if (skb->protocol == __constant_htons(ETH_P_8021Q)) { 601 if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
634 skb_push(skb, VLAN_HLEN); 602 skb_push(skb, VLAN_HLEN);
635 skb->nh.raw -= VLAN_HLEN; 603 skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
731 realoutdev, br_nf_local_out_finish, 699 realoutdev, br_nf_local_out_finish,
732 NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1); 700 NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
733 } else { 701 } else {
734#ifdef CONFIG_NETFILTER_DEBUG
735 skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
736#endif
737
738 NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev, 702 NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
739 realoutdev, br_nf_local_out_finish, 703 realoutdev, br_nf_local_out_finish,
740 NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1); 704 NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
779 printk(KERN_CRIT "br_netfilter: skb->dst == NULL."); 743 printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
780 goto print_error; 744 goto print_error;
781 } 745 }
782
783 skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
784#endif 746#endif
785 747
786 /* We assume any code from br_dev_queue_push_xmit onwards doesn't care 748 /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 22a8f127c4aa..076c156d5eda 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
141 up(&nf_sockopt_mutex); 141 up(&nf_sockopt_mutex);
142} 142}
143 143
144#ifdef CONFIG_NETFILTER_DEBUG
145#include <net/ip.h>
146#include <net/tcp.h>
147#include <linux/netfilter_ipv4.h>
148
149static void debug_print_hooks_ip(unsigned int nf_debug)
150{
151 if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
152 printk("PRE_ROUTING ");
153 nf_debug ^= (1 << NF_IP_PRE_ROUTING);
154 }
155 if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
156 printk("LOCAL_IN ");
157 nf_debug ^= (1 << NF_IP_LOCAL_IN);
158 }
159 if (nf_debug & (1 << NF_IP_FORWARD)) {
160 printk("FORWARD ");
161 nf_debug ^= (1 << NF_IP_FORWARD);
162 }
163 if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
164 printk("LOCAL_OUT ");
165 nf_debug ^= (1 << NF_IP_LOCAL_OUT);
166 }
167 if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
168 printk("POST_ROUTING ");
169 nf_debug ^= (1 << NF_IP_POST_ROUTING);
170 }
171 if (nf_debug)
172 printk("Crap bits: 0x%04X", nf_debug);
173 printk("\n");
174}
175
176static void nf_dump_skb(int pf, struct sk_buff *skb)
177{
178 printk("skb: pf=%i %s dev=%s len=%u\n",
179 pf,
180 skb->sk ? "(owned)" : "(unowned)",
181 skb->dev ? skb->dev->name : "(no dev)",
182 skb->len);
183 switch (pf) {
184 case PF_INET: {
185 const struct iphdr *ip = skb->nh.iph;
186 __u32 *opt = (__u32 *) (ip + 1);
187 int opti;
188 __u16 src_port = 0, dst_port = 0;
189
190 if (ip->protocol == IPPROTO_TCP
191 || ip->protocol == IPPROTO_UDP) {
192 struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
193 src_port = ntohs(tcp->source);
194 dst_port = ntohs(tcp->dest);
195 }
196
197 printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
198 " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
199 ip->protocol, NIPQUAD(ip->saddr),
200 src_port, NIPQUAD(ip->daddr),
201 dst_port,
202 ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
203 ntohs(ip->frag_off), ip->ttl);
204
205 for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
206 printk(" O=0x%8.8X", *opt++);
207 printk("\n");
208 }
209 }
210}
211
212void nf_debug_ip_local_deliver(struct sk_buff *skb)
213{
214 /* If it's a loopback packet, it must have come through
215 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
216 * NF_IP_LOCAL_IN. Otherwise, must have gone through
217 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */
218 if (!skb->dev) {
219 printk("ip_local_deliver: skb->dev is NULL.\n");
220 } else {
221 if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
222 | (1<<NF_IP_LOCAL_IN))) {
223 printk("ip_local_deliver: bad skb: ");
224 debug_print_hooks_ip(skb->nf_debug);
225 nf_dump_skb(PF_INET, skb);
226 }
227 }
228}
229
230void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
231{
232 if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
233 | (1 << NF_IP_POST_ROUTING))) {
234 printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
235 newskb);
236 debug_print_hooks_ip(newskb->nf_debug);
237 nf_dump_skb(PF_INET, newskb);
238 }
239}
240
241void nf_debug_ip_finish_output2(struct sk_buff *skb)
242{
243 /* If it's owned, it must have gone through the
244 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
245 * Otherwise, must have gone through
246 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
247 */
248 if (skb->sk) {
249 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
250 | (1 << NF_IP_POST_ROUTING))) {
251 printk("ip_finish_output: bad owned skb = %p: ", skb);
252 debug_print_hooks_ip(skb->nf_debug);
253 nf_dump_skb(PF_INET, skb);
254 }
255 } else {
256 if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
257 | (1 << NF_IP_FORWARD)
258 | (1 << NF_IP_POST_ROUTING))) {
259 /* Fragments, entunnelled packets, TCP RSTs
260 generated by ipt_REJECT will have no
261 owners, but still may be local */
262 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
263 | (1 << NF_IP_POST_ROUTING))){
264 printk("ip_finish_output:"
265 " bad unowned skb = %p: ",skb);
266 debug_print_hooks_ip(skb->nf_debug);
267 nf_dump_skb(PF_INET, skb);
268 }
269 }
270 }
271}
272#endif /*CONFIG_NETFILTER_DEBUG*/
273
274/* Call get/setsockopt() */ 144/* Call get/setsockopt() */
275static int nf_sockopt(struct sock *sk, int pf, int val, 145static int nf_sockopt(struct sock *sk, int pf, int val,
276 char __user *opt, int *len, int get) 146 char __user *opt, int *len, int get)
@@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
488 /* We may already have this, but read-locks nest anyway */ 358 /* We may already have this, but read-locks nest anyway */
489 rcu_read_lock(); 359 rcu_read_lock();
490 360
491#ifdef CONFIG_NETFILTER_DEBUG
492 if (unlikely((*pskb)->nf_debug & (1 << hook))) {
493 printk("nf_hook: hook %i already set.\n", hook);
494 nf_dump_skb(pf, *pskb);
495 }
496 (*pskb)->nf_debug |= (1 << hook);
497#endif
498
499 elem = &nf_hooks[pf][hook]; 361 elem = &nf_hooks[pf][hook];
500next_hook: 362next_hook:
501 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, 363 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de590a9..6d68c03bc051 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
365 C(nfct); 365 C(nfct);
366 nf_conntrack_get(skb->nfct); 366 nf_conntrack_get(skb->nfct);
367 C(nfctinfo); 367 C(nfctinfo);
368#ifdef CONFIG_NETFILTER_DEBUG
369 C(nf_debug);
370#endif
371#ifdef CONFIG_BRIDGE_NETFILTER 368#ifdef CONFIG_BRIDGE_NETFILTER
372 C(nf_bridge); 369 C(nf_bridge);
373 nf_bridge_get(skb->nf_bridge); 370 nf_bridge_get(skb->nf_bridge);
@@ -432,9 +429,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
432 new->nfct = old->nfct; 429 new->nfct = old->nfct;
433 nf_conntrack_get(old->nfct); 430 nf_conntrack_get(old->nfct);
434 new->nfctinfo = old->nfctinfo; 431 new->nfctinfo = old->nfctinfo;
435#ifdef CONFIG_NETFILTER_DEBUG
436 new->nf_debug = old->nf_debug;
437#endif
438#ifdef CONFIG_BRIDGE_NETFILTER 432#ifdef CONFIG_BRIDGE_NETFILTER
439 new->nf_bridge = old->nf_bridge; 433 new->nf_bridge = old->nf_bridge;
440 nf_bridge_get(old->nf_bridge); 434 nf_bridge_get(old->nf_bridge);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f2..05107e0dc145 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,6 +1,32 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algorithm.
18 This improves lookup performance.
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
4config IP_MULTICAST 30config IP_MULTICAST
5 bool "IP: multicasting" 31 bool "IP: multicasting"
6 depends on INET 32 depends on INET
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb6..65d57d8e1add 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,8 +7,10 @@ obj-y := utils.o route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o 10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 11
12obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
13obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
12obj-$(CONFIG_PROC_FS) += proc.o 14obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 15obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o 16obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 03942f133944..658e7977924d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1119,6 +1119,10 @@ module_init(inet_init);
1119#ifdef CONFIG_PROC_FS 1119#ifdef CONFIG_PROC_FS
1120extern int fib_proc_init(void); 1120extern int fib_proc_init(void);
1121extern void fib_proc_exit(void); 1121extern void fib_proc_exit(void);
1122#ifdef CONFIG_IP_FIB_TRIE
1123extern int fib_stat_proc_init(void);
1124extern void fib_stat_proc_exit(void);
1125#endif
1122extern int ip_misc_proc_init(void); 1126extern int ip_misc_proc_init(void);
1123extern int raw_proc_init(void); 1127extern int raw_proc_init(void);
1124extern void raw_proc_exit(void); 1128extern void raw_proc_exit(void);
@@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void)
1139 goto out_udp; 1143 goto out_udp;
1140 if (fib_proc_init()) 1144 if (fib_proc_init())
1141 goto out_fib; 1145 goto out_fib;
1146#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init())
1148 goto out_fib_stat;
1149 #endif
1142 if (ip_misc_proc_init()) 1150 if (ip_misc_proc_init())
1143 goto out_misc; 1151 goto out_misc;
1144out: 1152out:
1145 return rc; 1153 return rc;
1146out_misc: 1154out_misc:
1155#ifdef CONFIG_IP_FIB_TRIE
1156 fib_stat_proc_exit();
1157out_fib_stat:
1158#endif
1147 fib_proc_exit(); 1159 fib_proc_exit();
1148out_fib: 1160out_fib:
1149 udp4_proc_exit(); 1161 udp4_proc_exit();
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000000..0671569ee6f0
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2454 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.323"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
/* Extract 'n' bits of 'str', starting 'p' bits from the top; assumes a 32-bit operand. */
#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
/* Width of a trie key in bits. */
#define KEYLENGTH (8*sizeof(t_key))
/*
 * Keep the top 'l' bits of 'k' and clear the rest; evaluates to 0 when
 * l == 0. Arguments are parenthesized so expression arguments such as
 * "a + b" yield the intended shift count.
 */
#define MASK_PFX(k, l) (((l)==0)?0:((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))
/*
 * Mask with 'bits' consecutive one bits, the first of them 'offset' bits
 * from the most significant end; evaluates to 0 when bits == 0.
 */
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
/*
 * Node type tag. It is stored in the bit of the _parent word selected by
 * NODE_TYPE_MASK; the remaining bits of that word hold the parent pointer.
 */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
/* Parent pointer of _node with the type tag masked off. */
#define NODE_PARENT(_node) \
((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
/* Point _node at a new parent while keeping its type tag. */
#define NODE_SET_PARENT(_node, _ptr) \
((_node)->_parent = (((unsigned long)(_ptr)) | \
 ((_node)->_parent & NODE_TYPE_MASK)))
/* Overwrite the whole _parent word with just a type tag (no parent). */
#define NODE_INIT_PARENT(_node, _type) \
((_node)->_parent = (_type))
/* Type tag (T_TNODE or T_LEAF) of _node. */
#define NODE_TYPE(_node) \
((_node)->_parent & NODE_TYPE_MASK)

/* Type predicates; the argument is parenthesized so any pointer expression works. */
#define IS_TNODE(n) (!((n)->_parent & T_LEAF))
#define IS_LEAF(n) ((n)->_parent & T_LEAF)
104
/*
 * Header common to every trie node. struct leaf and struct tnode start
 * with the same two fields, and the code casts between the three types.
 */
105struct node {
106 t_key key;
107 unsigned long _parent; /* parent pointer; the NODE_TYPE_MASK bit holds T_TNODE/T_LEAF */
108};
109
/*
 * Trie leaf for one key. 'list' chains the struct leaf_info entries
 * attached to this key, one per prefix length.
 */
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
/*
 * Per-prefix-length entry hanging off a leaf: 'hlist' links it into
 * leaf->list, 'plen' is the prefix length, and 'falh' is the head of the
 * fib_alias list for that prefix.
 */
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
/*
 * Internal trie node. 'pos' is the key bit position at which this node's
 * child index starts and 'bits' is the width of that index, so the node
 * has 1 << bits child slots. tnode_put_child_reorg() keeps the two
 * counters below up to date.
 */
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
/* number of children for which tnode_full() holds */
127 unsigned short full_children; /* KEYLENGTH bits needed */
/* number of NULL child slots */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
/*
 * Optional lookup counters, compiled in only with CONFIG_IP_FIB_TRIE_STATS.
 * check_leaf() bumps the two semantic_match_* counters; 'gets' and
 * 'null_node_hit' are bumped in fn_trie_lookup().
 */
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139};
140#endif
141
/*
 * Counters describing the shape of a trie. 'nodesizes' has MAX_CHILDS
 * slots. NOTE(review): the code that fills these in is not in this part
 * of the file -- presumably a statistics/dump routine; confirm there.
 */
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
149};
150
/*
 * One routing trie. 'trie' is the root node (NULL when empty),
 * 'size' is incremented for every leaf fib_insert_node() creates and
 * 'revision' is incremented on every fib_insert_node() call.
 */
151struct trie {
152 struct node *trie;
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
156 int size;
157 unsigned int revision;
158};
159
160static int trie_debug = 0;
161
162static int tnode_full(struct tnode *tn, struct node *n);
163static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173 struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176 struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
/* Report an internal trie inconsistency: print 'err', then call BUG(). */
181static void trie_bug(char *err)
182{
183 printk("Trie Bug: %s\n", err);
184 BUG();
185}
186
187static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{
189 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child");
191
192 return tn->child[i];
193}
194
195static inline int tnode_child_length(struct tnode *tn)
196{
197 return 1<<tn->bits;
198}
199
200/*
201 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205
206 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208 -----------------------------------------------------------------
209 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
210
211 tp->pos = 7
212 tp->bits = 3
213 n->pos = 15
214 n->bits=4
215 KEYLENGTH=32
216*/
217
218static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219{
220 if (offset < KEYLENGTH)
221 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222 else
223 return 0;
224}
225
226static inline int tkey_equals(t_key a, t_key b)
227{
228 return a == b;
229}
230
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{
233 if (bits == 0 || offset >= KEYLENGTH)
234 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237}
238
239static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{
241 t_key diff = a ^ b;
242 int i = offset;
243
244 if(!diff)
245 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0)
247 i++;
248 return i;
249}
250
251/* Candidate for fib_semantics */
252
253static void fn_free_alias(struct fib_alias *fa)
254{
255 fib_release_info(fa->fa_info);
256 kmem_cache_free(fn_alias_kmem, fa);
257}
258
259/*
260 To understand this stuff, an understanding of keys and all their bits is
261 necessary. Every node in the trie has a key associated with it, but not
262 all of the bits in that key are significant.
263
264 Consider a node 'n' and its parent 'tp'.
265
266 If n is a leaf, every bit in its key is significant. Its presence is
267 necessitated by path compression, since during a tree traversal (when
268 searching for a leaf - unless we are doing an insertion) we will completely
269 ignore all skipped bits we encounter. Thus we need to verify, at the end of
270 a potentially successful search, that we have indeed been walking the
271 correct key path.
272
273 Note that we can never "miss" the correct key in the tree if present by
274 following the wrong path. Path compression ensures that segments of the key
275 that are the same for all keys with a given prefix are skipped, but the
276 skipped part *is* identical for each node in the subtrie below the skipped
277 bit! trie_insert() in this implementation takes care of that - note the
278 call to tkey_sub_equals() in trie_insert().
279
280 if n is an internal node - a 'tnode' here, the various parts of its key
281 have many different meanings.
282
283 Example:
284 _________________________________________________________________
285 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286 -----------------------------------------------------------------
287 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
288
289 _________________________________________________________________
290 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291 -----------------------------------------------------------------
292 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
293
294 tp->pos = 7
295 tp->bits = 3
296 n->pos = 15
297 n->bits=4
298
299 First, let's just ignore the bits that come before the parent tp, that is
300 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301 not use them for anything.
302
303 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304 index into the parent's child array. That is, they will be used to find
305 'n' among tp's children.
306
307 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308 for the node n.
309
310 All the bits we have seen so far are significant to the node n. The rest
311 of the bits are really not needed or indeed known in n->key.
312
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child.
315
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point.
318
319*/
320
321static void check_tnode(struct tnode *tn)
322{
323 if(tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 }
326}
327
328static int halve_threshold = 25;
329static int inflate_threshold = 50;
330
331static struct leaf *leaf_new(void)
332{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) {
335 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list);
337 }
338 return l;
339}
340
341static struct leaf_info *leaf_info_new(int plen)
342{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen;
345 INIT_LIST_HEAD(&li->falh);
346 return li;
347}
348
/* Free a leaf with kfree(). */
349static inline void free_leaf(struct leaf *l)
350{
351 kfree(l);
352}
353
/* Free a leaf_info with kfree(). */
354static inline void free_leaf_info(struct leaf_info *li)
355{
356 kfree(li);
357}
358
359static struct tnode* tnode_new(t_key key, int pos, int bits)
360{
361 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL);
364
365 if(tn) {
366 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos;
369 tn->bits = bits;
370 tn->key = key;
371 tn->full_children = 0;
372 tn->empty_children = 1<<bits;
373 }
374 if(trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn;
378}
379
380static void tnode_free(struct tnode *tn)
381{
382 if(!tn) {
383 trie_bug("tnode_free\n");
384 }
385 if(IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 )
388 printk("FL %p \n", tn);
389 }
390 else if(IS_TNODE(tn)) {
391 kfree(tn);
392 if(trie_debug > 0 )
393 printk("FT %p \n", tn);
394 }
395 else {
396 trie_bug("tnode_free\n");
397 }
398}
399
400/*
401 * Check whether a tnode 'n' is "full", i.e. it is an internal node
402 * and no bits are skipped. See discussion in dyntree paper p. 6
403 */
404
405static inline int tnode_full(struct tnode *tn, struct node *n)
406{
407 if(n == NULL || IS_LEAF(n))
408 return 0;
409
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411}
412
/*
 * Store 'n' in child slot 'i' of 'tn'. Passing -1 as 'wasfull' makes
 * tnode_put_child_reorg() work out itself whether the old child was full.
 * The trie argument 't' is not used.
 */
413static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414{
415 tnode_put_child_reorg(tn, i, n, -1);
416}
417
418 /*
419 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children.
421 */
422
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{
425 struct node *chi;
426 int isfull;
427
428 if(i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits");
431 }
432 write_lock_bh(&fib_lock);
433 chi = tn->child[i];
434
435 /* update emptyChildren */
436 if (n == NULL && chi != NULL)
437 tn->empty_children++;
438 else if (n != NULL && chi == NULL)
439 tn->empty_children--;
440
441 /* update fullChildren */
442 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi);
444
445 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull)
447 tn->full_children--;
448
449 else if (!wasfull && isfull)
450 tn->full_children++;
451 if(n)
452 NODE_SET_PARENT(n, tn);
453
454 tn->child[i] = n;
455 write_unlock_bh(&fib_lock);
456}
457
/*
 * Decide what should stand in tn's place in the trie and return it:
 *
 *  - NULL when tn is NULL, or when tn has no children (tn is freed);
 *  - tn's only child when exactly one slot is occupied (tn is freed and
 *    the child's parent pointer is cleared, keeping its type tag);
 *  - otherwise a tnode obtained by repeatedly inflating tn while the
 *    inflate test below holds, then repeatedly halving it while its
 *    fill is below halve_threshold. The single-child collapse is tried
 *    once more after that.
 *
 * The caller is responsible for linking the returned node into tn's
 * parent. The trie argument 't' is only passed on to inflate()/halve().
 */
458static struct node *resize(struct trie *t, struct tnode *tn)
459{
460 int i;
461
462 if (!tn)
463 return NULL;
464
465 if(trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold);
468
469 /* No children */
470 if (tn->empty_children == tnode_child_length(tn)) {
471 tnode_free(tn);
472 return NULL;
473 }
474 /* One child */
475 if (tn->empty_children == tnode_child_length(tn) - 1)
476 for (i = 0; i < tnode_child_length(tn); i++) {
477
478 write_lock_bh(&fib_lock);
479 if (tn->child[i] != NULL) {
480
481 /* compress one level */
482 struct node *n = tn->child[i];
483 if(n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n));
485
486 write_unlock_bh(&fib_lock);
487 tnode_free(tn);
488 return n;
489 }
490 write_unlock_bh(&fib_lock);
491 }
492 /*
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
495 */
496
497 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
503 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
510 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
517 *
518 * A clearer way to write this would be:
519 *
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children;
523 *
524 * new_child_length = tnode_child_length(tn) * 2;
525 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold)
529 *
530 * ...and so on, tho it would mess up the while() loop.
531 *
532 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold
535 *
536 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
539 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length
543 *
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
548 *
549 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn)
553 *
554 */
555
556 check_tnode(tn);
557
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
561
562 tn = inflate(t, tn);
563 }
564
565 check_tnode(tn);
566
567 /*
568 * Halve as long as the number of empty children in this
569 * node is above threshold.
570 */
571 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn))
574
575 tn = halve(t, tn);
576
577 /* Only one child remains */
578
579 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) {
581
582 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) {
584 /* compress one level */
585 struct node *n = tn->child[i];
586
587 if(n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n));
589
590 write_unlock_bh(&fib_lock);
591 tnode_free(tn);
592 return n;
593 }
594 write_unlock_bh(&fib_lock);
595 }
596
597 return (struct node *) tn;
598}
599
/*
 * Build a replacement for 'tn' that indexes one more key bit (twice as
 * many child slots) and move tn's children into it:
 *
 *  - a leaf, or a tnode that skips bits, goes into slot 2*i or 2*i+1
 *    depending on the key bit the new node additionally indexes;
 *  - a non-skipping tnode with two children is dissolved and its
 *    children take slots 2*i and 2*i+1;
 *  - a non-skipping tnode with more children is split into 'left' and
 *    'right' halves, each passed through resize() before being linked.
 *
 * Dissolved/split tnodes and the old 'tn' are freed. Returns the new
 * tnode; a failed tnode_new() is reported through trie_bug().
 */
600static struct tnode *inflate(struct trie *t, struct tnode *tn)
601{
602 struct tnode *inode;
603 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn);
605 int i;
606
607 if(trie_debug)
608 printk("In inflate\n");
609
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611
612 if (!tn)
613 trie_bug("tnode_new failed");
614
615 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i);
617
618 /* An empty child */
619 if (node == NULL)
620 continue;
621
622 /* A leaf or an internal node with skipped bits */
623
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
627 1) == 0)
628 put_child(t, tn, 2*i, node);
629 else
630 put_child(t, tn, 2*i+1, node);
631 continue;
632 }
633
634 /* An internal node with two children */
635 inode = (struct tnode *) node;
636
637 if (inode->bits == 1) {
638 put_child(t, tn, 2*i, inode->child[0]);
639 put_child(t, tn, 2*i+1, inode->child[1]);
640
641 tnode_free(inode);
642 }
643
644 /* An internal node with more than two children */
645 else {
646 struct tnode *left, *right;
647 int size, j;
648
649 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have
652 * a position one bit further down the key and this
653 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the
661 * two new keys.
662 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos)
664 */
665
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673
674 if(!left)
675 trie_bug("tnode_new failed");
676
677
678 /* Use the old key, but set the new significant
679 * bit to one.
680 */
681 right = tnode_new(inode->key|m, inode->pos + 1,
682 inode->bits - 1);
683
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]);
690 put_child(t, right, j, inode->child[j + size]);
691 }
692 put_child(t, tn, 2*i, resize(t, left));
693 put_child(t, tn, 2*i+1, resize(t, right));
694
695 tnode_free(inode);
696 }
697 }
698 tnode_free(oldtnode);
699 return tn;
700}
701
702static struct tnode *halve(struct trie *t, struct tnode *tn)
703{
704 struct tnode *oldtnode = tn;
705 struct node *left, *right;
706 int i;
707 int olen = tnode_child_length(tn);
708
709 if(trie_debug) printk("In halve\n");
710
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712
713 if(!tn)
714 trie_bug("tnode_new failed");
715
716 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1);
719
720 /* At least one of the children is empty */
721 if (left == NULL) {
722 if (right == NULL) /* Both are empty */
723 continue;
724 put_child(t, tn, i/2, right);
725 } else if (right == NULL)
726 put_child(t, tn, i/2, left);
727
728 /* Two nonempty children */
729 else {
730 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1);
732
733 if(!newBinNode)
734 trie_bug("tnode_new failed");
735
736 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right);
738 put_child(t, tn, i/2, resize(t, newBinNode));
739 }
740 }
741 tnode_free(oldtnode);
742 return tn;
743}
744
745static void *trie_init(struct trie *t)
746{
747 if(t) {
748 t->size = 0;
749 t->trie = NULL;
750 t->revision = 0;
751#ifdef CONFIG_IP_FIB_TRIE_STATS
752 memset(&t->stats, 0, sizeof(struct trie_use_stats));
753#endif
754 }
755 return t;
756}
757
758static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
759{
760 struct hlist_node *node;
761 struct leaf_info *li;
762
763 hlist_for_each_entry(li, node, head, hlist) {
764
765 if ( li->plen == plen )
766 return li;
767 }
768 return NULL;
769}
770
771static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{
773 struct list_head *fa_head=NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen);
775
776 if(li)
777 fa_head = &li->falh;
778
779 return fa_head;
780}
781
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{
784 struct leaf_info *li=NULL, *last=NULL;
785 struct hlist_node *node, *tmp;
786
787 write_lock_bh(&fib_lock);
788
789 if(hlist_empty(head))
790 hlist_add_head(&new->hlist, head);
791 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793
794 if (new->plen > li->plen)
795 break;
796
797 last = li;
798 }
799 if(last)
800 hlist_add_after(&last->hlist, &new->hlist);
801 else
802 hlist_add_before(&new->hlist, &li->hlist);
803 }
804 write_unlock_bh(&fib_lock);
805}
806
807static struct leaf *
808fib_find_node(struct trie *t, u32 key)
809{
810 int pos;
811 struct tnode *tn;
812 struct node *n;
813
814 pos = 0;
815 n=t->trie;
816
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n;
819
820 check_tnode(tn);
821
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 }
826 else
827 break;
828 }
829 /* Case we have found a leaf. Compare prefixes */
830
831 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
832 struct leaf *l = (struct leaf *) n;
833 return l;
834 }
835 return NULL;
836}
837
/*
 * Walk from 'tn' up towards the root. Each node on the way is passed
 * through resize() and the result is stored back into its parent's child
 * slot (the slot is found from the key of the starting node). The node
 * that ends up without a parent is resized as well (if it is a tnode)
 * and returned, so the caller can install it as the new root.
 *
 * A NULL 'tn' triggers BUG().
 *
 * NOTE(review): the loop calls BUG() once 'i' exceeds 12, i.e. after
 * more than 13 levels, and prints diagnostics from i > 10 on. This looks
 * like a debugging limit rather than a structural bound -- confirm.
 * NOTE(review): NODE_PARENT(tn) is evaluated right after resize(), which
 * can return NULL for a childless node; confirm callers never reach
 * this with such a node.
 */
838static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
839{
840 int i = 0;
841 int wasfull;
842 t_key cindex, key;
843 struct tnode *tp = NULL;
844
845 if(!tn)
846 BUG();
847
848 key = tn->key;
849 i = 0;
850
851 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852
853 if( i > 10 ) {
854 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856
857 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 }
860
861 if( i > 12 ) BUG();
862 i++;
863
864 tp = NODE_PARENT(tn);
865 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
/* sample the old "full" state before resize() may free or replace tn */
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869
870 if(!NODE_PARENT(tn))
871 break;
872
873 tn = NODE_PARENT(tn);
874 }
875 /* Handle last (top) tnode */
876 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878
879 return (struct node*) tn;
880}
881
/*
 * Make sure trie 't' has a leaf for 'key', attach a new leaf_info for
 * prefix length 'plen' to it, and return the head of that leaf_info's
 * (still empty) alias list.
 *
 * When the leaf already exists only the leaf_info is added and the trie
 * is left as it is. Otherwise a new leaf is created and linked in --
 * into an empty child slot, or below a new one-bit tnode placed where
 * the key first differs from the node found -- and the trie is then
 * rebalanced from the affected tnode upwards.
 *
 * A new leaf_info is always allocated; this function does not look for
 * an existing one with the same 'plen'. Allocation failures end in
 * BUG()/trie_bug(). t->revision is incremented on every call.
 */
882static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen)
884{
885 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n;
888 struct leaf *l;
889 int missbit;
890 struct list_head *fa_head=NULL;
891 struct leaf_info *li;
892 t_key cindex;
893
894 pos = 0;
895 n=t->trie;
896
897 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in it, or we have reached an empty child slot,
899 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'!
903 *
904 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree.
906 *
907 * If it doesn't, we have to replace it with a new T_TNODE.
908 *
909 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE.
913 */
914
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn;
922 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924
925 if(n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG();
928 }
929 }
930 else
931 break;
932 }
933
934 /*
935 * n ----> NULL, LEAF or TNODE
936 *
937 * tp is n's (parent) ----> NULL or TNODE
938 */
939
940 if(tp && IS_LEAF(tp))
941 BUG();
942
943 t->revision++;
944
945 /* Case 1: n is a leaf. Compare prefixes */
946
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n;
949
950 li = leaf_info_new(plen);
951
952 if(! li)
953 BUG();
954
955 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li);
957 goto done;
958 }
959 t->size++;
960 l = leaf_new();
961
962 if(! l)
963 BUG();
964
965 l->key = key;
966 li = leaf_info_new(plen);
967
968 if(! li)
969 BUG();
970
971 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li);
973
974 /* Case 2: n is NULL, and will just insert a new leaf */
975 if (t->trie && n == NULL) {
976
977 NODE_SET_PARENT(l, tp);
978
979 if (!tp)
980 BUG();
981
982 else {
983 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
984 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
985 }
986 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else {
989 /*
990 * Add a new tnode here
991 * first tnode need some special handling
992 */
993
994 if (tp)
995 pos=tp->pos+tp->bits;
996 else
997 pos=0;
998 if(n) {
999 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1);
1001 }
1002 else {
1003 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008
1009 NODE_SET_PARENT(tn, tp);
1010
/* the new leaf takes the slot selected by its bit at newpos, n the other */
1011 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n);
1014
1015 if(tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 }
1019 else {
1020 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn;
1022 }
1023 }
1024 if(tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen);
1027 }
1028 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp);
1030done:;
1031 return fa_head;
1032}
1033
/*
 * Add the route described by 'r'/'rta' to the trie behind table 'tb'.
 *
 * The destination is taken from rta->rta_dst (0 when absent) and must
 * have no bits set outside the prefix length r->rtm_dst_len. If an alias
 * with the same tos and priority already exists, the netlink flags in
 * 'nlhdr' decide: NLM_F_EXCL fails, NLM_F_REPLACE swaps the fib_info in
 * place, otherwise an identical entry is rejected and a new alias is
 * added next to the existing ones (position chosen by NLM_F_APPEND).
 * A new alias is only created when NLM_F_CREATE is set; a route
 * notification is sent for it and the route cache is flushed.
 *
 * Returns 0 on success, otherwise:
 *   -EINVAL   prefix length above 32, or key bits outside the mask
 *   -EEXIST   matching alias exists and NLM_F_EXCL was given, or an
 *             identical entry (type, scope, fib_info) is already there
 *   -ENOENT   a new alias is needed but NLM_F_CREATE was not given
 *   -ENOBUFS  the alias could not be allocated
 *   or the error reported by fib_create_info().
 * On the error paths taken after fib_create_info() succeeded, the new
 * fib_info is released again.
 */
1034static int
1035fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037{
1038 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL;
1041 struct fib_info *fi;
1042 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type;
1044 u8 tos = r->rtm_tos;
1045 u32 key, mask;
1046 int err;
1047 struct leaf *l;
1048
1049 if (plen > 32)
1050 return -EINVAL;
1051
1052 key = 0;
1053 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4);
1055
1056 key = ntohl(key);
1057
1058 if(trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060
1061 mask = ntohl( inet_make_mask(plen) );
1062
1063 if(key & ~mask)
1064 return -EINVAL;
1065
1066 key = key & mask;
1067
1068 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069 goto err;
1070
1071 l = fib_find_node(t, key);
1072 fa = NULL;
1073
1074 if(l) {
1075 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 }
1078
1079 /* Now fa, if non-NULL, points to the first fib alias
1080 * with the same keys [prefix,tos,priority], if such key already
1081 * exists or to the node before which we will insert new one.
1082 *
1083 * If fa is NULL, we will need to allocate a new one and
1084 * insert to the head of f.
1085 *
1086 * If f is NULL, no fib node matched the destination key
1087 * and we need to allocate a new one of those as well.
1088 */
1089
1090 if (fa &&
1091 fa->fa_info->fib_priority == fi->fib_priority) {
1092 struct fib_alias *fa_orig;
1093
1094 err = -EEXIST;
1095 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096 goto out;
1097
1098 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099 struct fib_info *fi_drop;
1100 u8 state;
1101
1102 write_lock_bh(&fib_lock);
1103
1104 fi_drop = fa->fa_info;
1105 fa->fa_info = fi;
1106 fa->fa_type = type;
1107 fa->fa_scope = r->rtm_scope;
1108 state = fa->fa_state;
1109 fa->fa_state &= ~FA_S_ACCESSED;
1110
1111 write_unlock_bh(&fib_lock);
1112
1113 fib_release_info(fi_drop);
1114 if (state & FA_S_ACCESSED)
1115 rt_cache_flush(-1);
1116
1117 goto succeeded;
1118 }
1119 /* Error if we find a perfect match which
1120 * uses the same scope, type, and nexthop
1121 * information.
1122 */
1123 fa_orig = fa;
1124 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125 if (fa->fa_tos != tos)
1126 break;
1127 if (fa->fa_info->fib_priority != fi->fib_priority)
1128 break;
1129 if (fa->fa_type == type &&
1130 fa->fa_scope == r->rtm_scope &&
1131 fa->fa_info == fi) {
1132 goto out;
1133 }
1134 }
1135 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136 fa = fa_orig;
1137 }
1138 err = -ENOENT;
1139 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140 goto out;
1141
1142 err = -ENOBUFS;
1143 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144 if (new_fa == NULL)
1145 goto out;
1146
1147 new_fa->fa_info = fi;
1148 new_fa->fa_tos = tos;
1149 new_fa->fa_type = type;
1150 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0;
1152#if 0
1153 new_fa->dst = NULL;
1154#endif
1155 /*
1156 * Insert new entry to the list.
1157 */
1158
/* no alias list for this key/plen yet: create leaf and/or leaf_info now */
1159 if(!fa_head)
1160 fa_head = fib_insert_node(t, key, plen);
1161
1162 write_lock_bh(&fib_lock);
1163
1164 list_add_tail(&new_fa->fa_list,
1165 (fa ? &fa->fa_list : fa_head));
1166
1167 write_unlock_bh(&fib_lock);
1168
1169 rt_cache_flush(-1);
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded:
1172 return 0;
1173out:
1174 fib_release_info(fi);
1175err:;
1176 return err;
1177}
1178
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err)
1181{
1182 int i;
1183 t_key mask;
1184 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node;
1187
1188 hlist_for_each_entry(li, node, hhead, hlist) {
1189
1190 i = li->plen;
1191 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask))
1193 continue;
1194
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++;
1199#endif
1200 return 1;
1201 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++;
1204#endif
1205 }
1206 return 0;
1207}
1208
1209static int
1210fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211{
1212 struct trie *t = (struct trie *) tb->tb_data;
1213 int plen, ret = 0;
1214 struct node *n;
1215 struct tnode *pn;
1216 int pos, bits;
1217 t_key key=ntohl(flp->fl4_dst);
1218 int chopped_off;
1219 t_key cindex = 0;
1220 int current_prefix_length = KEYLENGTH;
1221 n = t->trie;
1222
1223 read_lock(&fib_lock);
1224 if(!n)
1225 goto failed;
1226
1227#ifdef CONFIG_IP_FIB_TRIE_STATS
1228 t->stats.gets++;
1229#endif
1230
1231 /* Just a leaf? */
1232 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234 goto found;
1235 goto failed;
1236 }
1237 pn = (struct tnode *) n;
1238 chopped_off = 0;
1239
1240 while (pn) {
1241
1242 pos = pn->pos;
1243 bits = pn->bits;
1244
1245 if(!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247
1248 n = tnode_get_child(pn, cindex);
1249
1250 if (n == NULL) {
1251#ifdef CONFIG_IP_FIB_TRIE_STATS
1252 t->stats.null_node_hit++;
1253#endif
1254 goto backtrace;
1255 }
1256
1257 if (IS_TNODE(n)) {
1258#define HL_OPTIMIZE
1259#ifdef HL_OPTIMIZE
1260 struct tnode *cn = (struct tnode *)n;
1261 t_key node_prefix, key_prefix, pref_mismatch;
1262 int mp;
1263
1264 /*
1265 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (se previous
1274 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown.
1276 *
1277 * The skipped bits are key[pos+bits..cn->pos].
1278 */
1279
1280 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped of
1287 * *are* zero.
1288 */
1289
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291
1292 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 ||
1295 !(cn->child[0]))
1296 goto backtrace;
1297 }
1298
1299 /*
1300 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point.
1307 */
1308
1309 /* The only thing we can do at this point is to verify that
1310 * any such matching prefix can indeed be a prefix to our
1311 * key, and if the bits in the node we are inspecting that
1312 * do not match our key are not ZERO, this cannot be true.
1313 * Thus, find out where there is a mismatch (before cn->pos)
1314 * and verify that all the mismatching bits are zero in the
1315 * new tnode's key.
1316 */
1317
1318 /* Note: We aren't very concerned about the piece of the key
1319 * that precede pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix.
1326 */
1327
1328 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0;
1332
1333 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state.directly.
1335 */
1336 if (pref_mismatch) {
1337 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338 mp++;
1339 pref_mismatch = pref_mismatch <<1;
1340 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342
1343 if (key_prefix != 0)
1344 goto backtrace;
1345
1346 if (current_prefix_length >= cn->pos)
1347 current_prefix_length=mp;
1348 }
1349#endif
1350 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0;
1352 continue;
1353 }
1354 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356 goto found;
1357 }
1358backtrace:
1359 chopped_off++;
1360
1361 /* As zero don't change the child key (cindex) */
1362 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363 chopped_off++;
1364 }
1365
1366 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369
1370 /*
1371 * Either we do the actual chop off according or if we have
1372 * chopped off all bits in this tnode walk up to our parent.
1373 */
1374
1375 if(chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1));
1377 else {
1378 if( NODE_PARENT(pn) == NULL)
1379 goto failed;
1380
1381 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn);
1384 chopped_off = 0;
1385
1386#ifdef CONFIG_IP_FIB_TRIE_STATS
1387 t->stats.backtrack++;
1388#endif
1389 goto backtrace;
1390 }
1391 }
1392failed:
1393 ret = 1;
1394found:
1395 read_unlock(&fib_lock);
1396 return ret;
1397}
1398
1399static int trie_leaf_remove(struct trie *t, t_key key)
1400{
1401 t_key cindex;
1402 struct tnode *tp = NULL;
1403 struct node *n = t->trie;
1404 struct leaf *l;
1405
1406 if(trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n);
1408
1409 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key.
1412 */
1413
1414 while (n != NULL && IS_TNODE(n)) {
1415 struct tnode *tn = (struct tnode *) n;
1416 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418
1419 if(n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG();
1422 }
1423 }
1424 l = (struct leaf *) n;
1425
1426 if(!n || !tkey_equals(l->key, key))
1427 return 0;
1428
1429 /*
1430 * Key found.
1431 * Remove the leaf and rebalance the tree
1432 */
1433
1434 t->revision++;
1435 t->size--;
1436
1437 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n);
1439
1440 if(tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp);
1444 }
1445 else
1446 t->trie = NULL;
1447
1448 return 1;
1449}
1450
1451static int
1452fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454{
1455 struct trie *t = (struct trie *) tb->tb_data;
1456 u32 key, mask;
1457 int plen = r->rtm_dst_len;
1458 u8 tos = r->rtm_tos;
1459 struct fib_alias *fa, *fa_to_delete;
1460 struct list_head *fa_head;
1461 struct leaf *l;
1462
1463 if (plen > 32)
1464 return -EINVAL;
1465
1466 key = 0;
1467 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4);
1469
1470 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) );
1472
1473 if(key & ~mask)
1474 return -EINVAL;
1475
1476 key = key & mask;
1477 l = fib_find_node(t, key);
1478
1479 if(!l)
1480 return -ESRCH;
1481
1482 fa_head = get_fa_head(l, plen);
1483 fa = fib_find_alias(fa_head, tos, 0);
1484
1485 if (!fa)
1486 return -ESRCH;
1487
1488 if (trie_debug)
1489 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490
1491 fa_to_delete = NULL;
1492 fa_head = fa->fa_list.prev;
1493 list_for_each_entry(fa, fa_head, fa_list) {
1494 struct fib_info *fi = fa->fa_info;
1495
1496 if (fa->fa_tos != tos)
1497 break;
1498
1499 if ((!r->rtm_type ||
1500 fa->fa_type == r->rtm_type) &&
1501 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502 fa->fa_scope == r->rtm_scope) &&
1503 (!r->rtm_protocol ||
1504 fi->fib_protocol == r->rtm_protocol) &&
1505 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506 fa_to_delete = fa;
1507 break;
1508 }
1509 }
1510
1511 if (fa_to_delete) {
1512 int kill_li = 0;
1513 struct leaf_info *li;
1514
1515 fa = fa_to_delete;
1516 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517
1518 l = fib_find_node(t, key);
1519 li = find_leaf_info(&l->list, plen);
1520
1521 write_lock_bh(&fib_lock);
1522
1523 list_del(&fa->fa_list);
1524
1525 if(list_empty(fa_head)) {
1526 hlist_del(&li->hlist);
1527 kill_li = 1;
1528 }
1529 write_unlock_bh(&fib_lock);
1530
1531 if(kill_li)
1532 free_leaf_info(li);
1533
1534 if(hlist_empty(&l->list))
1535 trie_leaf_remove(t, key);
1536
1537 if (fa->fa_state & FA_S_ACCESSED)
1538 rt_cache_flush(-1);
1539
1540 fn_free_alias(fa);
1541 return 0;
1542 }
1543 return -ESRCH;
1544}
1545
1546static int trie_flush_list(struct trie *t, struct list_head *head)
1547{
1548 struct fib_alias *fa, *fa_node;
1549 int found = 0;
1550
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info;
1553
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555
1556 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock);
1559
1560 fn_free_alias(fa);
1561 found++;
1562 }
1563 }
1564 return found;
1565}
1566
1567static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568{
1569 int found = 0;
1570 struct hlist_head *lih = &l->list;
1571 struct hlist_node *node, *tmp;
1572 struct leaf_info *li = NULL;
1573
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575
1576 found += trie_flush_list(t, &li->falh);
1577
1578 if (list_empty(&li->falh)) {
1579
1580 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock);
1583
1584 free_leaf_info(li);
1585 }
1586 }
1587 return found;
1588}
1589
1590static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591{
1592 struct node *c = (struct node *) thisleaf;
1593 struct tnode *p;
1594 int idx;
1595
1596 if(c == NULL) {
1597 if(t->trie == NULL)
1598 return NULL;
1599
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1601 return (struct leaf *) t->trie;
1602
1603 p = (struct tnode*) t->trie; /* Start */
1604 }
1605 else
1606 p = (struct tnode *) NODE_PARENT(c);
1607 while (p) {
1608 int pos, last;
1609
1610 /* Find the next child of the parent */
1611 if(c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else
1614 pos = 0;
1615
1616 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) {
1619
1620 /* Decend if tnode */
1621
1622 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx];
1624 idx = 0;
1625
1626 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) )
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629
1630 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up;
1633 }
1634 return (struct leaf*) p->child[idx];
1635 }
1636 }
1637up:
1638 /* No more children go up one step */
1639 c = (struct node*) p;
1640 p = (struct tnode *) NODE_PARENT(p);
1641 }
1642 return NULL; /* Ready. Root of trie */
1643}
1644
1645static int fn_trie_flush(struct fib_table *tb)
1646{
1647 struct trie *t = (struct trie *) tb->tb_data;
1648 struct leaf *ll = NULL, *l = NULL;
1649 int found = 0, h;
1650
1651 t->revision++;
1652
1653 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654 found += trie_flush_leaf(t, l);
1655
1656 if (ll && hlist_empty(&ll->list))
1657 trie_leaf_remove(t, ll->key);
1658 ll = l;
1659 }
1660
1661 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key);
1663
1664 if(trie_debug)
1665 printk("trie_flush found=%d\n", found);
1666 return found;
1667}
1668
1669static int trie_last_dflt=-1;
1670
1671static void
1672fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1673{
1674 struct trie *t = (struct trie *) tb->tb_data;
1675 int order, last_idx;
1676 struct fib_info *fi = NULL;
1677 struct fib_info *last_resort;
1678 struct fib_alias *fa = NULL;
1679 struct list_head *fa_head;
1680 struct leaf *l;
1681
1682 last_idx = -1;
1683 last_resort = NULL;
1684 order = -1;
1685
1686 read_lock(&fib_lock);
1687
1688 l = fib_find_node(t, 0);
1689 if(!l)
1690 goto out;
1691
1692 fa_head = get_fa_head(l, 0);
1693 if(!fa_head)
1694 goto out;
1695
1696 if (list_empty(fa_head))
1697 goto out;
1698
1699 list_for_each_entry(fa, fa_head, fa_list) {
1700 struct fib_info *next_fi = fa->fa_info;
1701
1702 if (fa->fa_scope != res->scope ||
1703 fa->fa_type != RTN_UNICAST)
1704 continue;
1705
1706 if (next_fi->fib_priority > res->fi->fib_priority)
1707 break;
1708 if (!next_fi->fib_nh[0].nh_gw ||
1709 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 continue;
1711 fa->fa_state |= FA_S_ACCESSED;
1712
1713 if (fi == NULL) {
1714 if (next_fi != res->fi)
1715 break;
1716 } else if (!fib_detect_death(fi, order, &last_resort,
1717 &last_idx, &trie_last_dflt)) {
1718 if (res->fi)
1719 fib_info_put(res->fi);
1720 res->fi = fi;
1721 atomic_inc(&fi->fib_clntref);
1722 trie_last_dflt = order;
1723 goto out;
1724 }
1725 fi = next_fi;
1726 order++;
1727 }
1728 if (order <= 0 || fi == NULL) {
1729 trie_last_dflt = -1;
1730 goto out;
1731 }
1732
1733 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1734 if (res->fi)
1735 fib_info_put(res->fi);
1736 res->fi = fi;
1737 atomic_inc(&fi->fib_clntref);
1738 trie_last_dflt = order;
1739 goto out;
1740 }
1741 if (last_idx >= 0) {
1742 if (res->fi)
1743 fib_info_put(res->fi);
1744 res->fi = last_resort;
1745 if (last_resort)
1746 atomic_inc(&last_resort->fib_clntref);
1747 }
1748 trie_last_dflt = last_idx;
1749 out:;
1750 read_unlock(&fib_lock);
1751}
1752
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb)
1755{
1756 int i, s_i;
1757 struct fib_alias *fa;
1758
1759 u32 xkey=htonl(key);
1760
1761 s_i=cb->args[3];
1762 i = 0;
1763
1764 list_for_each_entry(fa, fah, fa_list) {
1765 if (i < s_i) {
1766 i++;
1767 continue;
1768 }
1769 if (fa->fa_info->fib_nh == NULL) {
1770 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771 i++;
1772 continue;
1773 }
1774 if (fa->fa_info == NULL) {
1775 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776 i++;
1777 continue;
1778 }
1779
1780 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781 cb->nlh->nlmsg_seq,
1782 RTM_NEWROUTE,
1783 tb->tb_id,
1784 fa->fa_type,
1785 fa->fa_scope,
1786 &xkey,
1787 plen,
1788 fa->fa_tos,
1789 fa->fa_info, 0) < 0) {
1790 cb->args[3] = i;
1791 return -1;
1792 }
1793 i++;
1794 }
1795 cb->args[3]=i;
1796 return skb->len;
1797}
1798
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb)
1801{
1802 int h, s_h;
1803 struct list_head *fa_head;
1804 struct leaf *l = NULL;
1805 s_h=cb->args[2];
1806
1807 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808
1809 if (h < s_h)
1810 continue;
1811 if (h > s_h)
1812 memset(&cb->args[3], 0,
1813 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814
1815 fa_head = get_fa_head(l, plen);
1816
1817 if(!fa_head)
1818 continue;
1819
1820 if(list_empty(fa_head))
1821 continue;
1822
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824 cb->args[2]=h;
1825 return -1;
1826 }
1827 }
1828 cb->args[2]=h;
1829 return skb->len;
1830}
1831
1832static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833{
1834 int m, s_m;
1835 struct trie *t = (struct trie *) tb->tb_data;
1836
1837 s_m = cb->args[1];
1838
1839 read_lock(&fib_lock);
1840 for (m=0; m<=32; m++) {
1841
1842 if (m < s_m)
1843 continue;
1844 if (m > s_m)
1845 memset(&cb->args[2], 0,
1846 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847
1848 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849 cb->args[1] = m;
1850 goto out;
1851 }
1852 }
1853 read_unlock(&fib_lock);
1854 cb->args[1] = m;
1855 return skb->len;
1856 out:
1857 read_unlock(&fib_lock);
1858 return -1;
1859}
1860
1861/* Fix more generic FIB names for init later */
1862
1863#ifdef CONFIG_IP_MULTIPLE_TABLES
1864struct fib_table * fib_hash_init(int id)
1865#else
1866struct fib_table * __init fib_hash_init(int id)
1867#endif
1868{
1869 struct fib_table *tb;
1870 struct trie *t;
1871
1872 if (fn_alias_kmem == NULL)
1873 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874 sizeof(struct fib_alias),
1875 0, SLAB_HWCACHE_ALIGN,
1876 NULL, NULL);
1877
1878 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879 GFP_KERNEL);
1880 if (tb == NULL)
1881 return NULL;
1882
1883 tb->tb_id = id;
1884 tb->tb_lookup = fn_trie_lookup;
1885 tb->tb_insert = fn_trie_insert;
1886 tb->tb_delete = fn_trie_delete;
1887 tb->tb_flush = fn_trie_flush;
1888 tb->tb_select_default = fn_trie_select_default;
1889 tb->tb_dump = fn_trie_dump;
1890 memset(tb->tb_data, 0, sizeof(struct trie));
1891
1892 t = (struct trie *) tb->tb_data;
1893
1894 trie_init(t);
1895
1896 if (id == RT_TABLE_LOCAL)
1897 trie_local=t;
1898 else if (id == RT_TABLE_MAIN)
1899 trie_main=t;
1900
1901 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903
1904 return tb;
1905}
1906
1907/* Trie dump functions */
1908
/* Emit the indentation string @n times into @seq. */
static void putspace_seq(struct seq_file *seq, int n)
{
	while (n-- != 0) {
		seq_printf(seq, " ");
	}
}
1913
/* Print the low @bits bits of @v as '0'/'1' characters, highest bit first. */
static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
{
	while (bits-- != 0) {
		const char *digit = (v & (1<<bits)) ? "1" : "0";

		seq_printf(seq, "%s", digit);
	}
}
1919
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits)
1922{
1923 putspace_seq(seq, indent);
1924 if (IS_LEAF(n))
1925 seq_printf(seq, "|");
1926 else
1927 seq_printf(seq, "+");
1928 if (bits) {
1929 seq_printf(seq, "%d/", cindex);
1930 printbin_seq(seq, cindex, bits);
1931 seq_printf(seq, ": ");
1932 }
1933 else
1934 seq_printf(seq, "<root>: ");
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936
1937 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else {
1941 int plen=((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 }
1946 if (IS_LEAF(n)) {
1947 struct leaf *l=(struct leaf *)n;
1948 struct fib_alias *fa;
1949 int i;
1950 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) {
1952
1953 struct list_head *fa_head = get_fa_head(l, i);
1954
1955 if(!fa_head)
1956 continue;
1957
1958 if(list_empty(fa_head))
1959 continue;
1960
1961 putspace_seq(seq, indent+2);
1962 seq_printf(seq, "{/%d...dumping}\n", i);
1963
1964
1965 list_for_each_entry(fa, fa_head, fa_list) {
1966 putspace_seq(seq, indent+2);
1967 if (fa->fa_info->fib_nh == NULL) {
1968 seq_printf(seq, "Error _fib_nh=NULL\n");
1969 continue;
1970 }
1971 if (fa->fa_info == NULL) {
1972 seq_printf(seq, "Error fa_info=NULL\n");
1973 continue;
1974 }
1975
1976 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977 fa->fa_type,
1978 fa->fa_scope,
1979 fa->fa_tos);
1980 }
1981 }
1982 }
1983 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988 seq_printf(seq, "}\n");
1989 putspace_seq(seq, indent); seq_printf(seq, "| ");
1990 seq_printf(seq, "{pos=%d", tn->pos);
1991 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993 putspace_seq(seq, indent); seq_printf(seq, "| ");
1994 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995 }
1996}
1997
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{
2000 struct node *n=t->trie;
2001 int cindex=0;
2002 int indent=1;
2003 int pend=0;
2004 int depth = 0;
2005
2006 read_lock(&fib_lock);
2007
2008 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n;
2013 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3;
2016 depth++;
2017
2018 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) {
2020
2021 /* Got a child */
2022
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++;
2026
2027 }
2028 else {
2029 /*
2030 * New tnode. Decend one level
2031 */
2032
2033 depth++;
2034 n=tn->child[cindex];
2035 tn=(struct tnode *)n;
2036 pend=tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3;
2039 cindex=0;
2040 }
2041 }
2042 else
2043 cindex++;
2044
2045 /*
2046 * Test if we are done
2047 */
2048
2049 while (cindex >= (1 << tn->bits)) {
2050
2051 /*
2052 * Move upwards and test for root
2053 * pop off all traversed nodes
2054 */
2055
2056 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL;
2058 n = NULL;
2059 break;
2060 }
2061 else {
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn);
2064 cindex++;
2065 n=(struct node *)tn;
2066 pend=tn->pos+tn->bits;
2067 indent-=3;
2068 depth--;
2069 }
2070 }
2071 }
2072 }
2073 else n = NULL;
2074 }
2075 else seq_printf(seq, "------ trie is empty\n");
2076
2077 read_unlock(&fib_lock);
2078}
2079
2080static struct trie_stat *trie_stat_new(void)
2081{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i;
2084
2085 if(s) {
2086 s->totdepth = 0;
2087 s->maxdepth = 0;
2088 s->tnodes = 0;
2089 s->leaves = 0;
2090 s->nullpointers = 0;
2091
2092 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0;
2094 }
2095 return s;
2096}
2097
2098static struct trie_stat *trie_collect_stats(struct trie *t)
2099{
2100 struct node *n=t->trie;
2101 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0;
2103 int indent = 1;
2104 int pend = 0;
2105 int depth = 0;
2106
2107 read_lock(&fib_lock);
2108
2109 if (s) {
2110 if (n) {
2111 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits;
2114 indent += 3;
2115 s->nodesizes[tn->bits]++;
2116 depth++;
2117
2118 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) {
2120 /* Got a child */
2121
2122 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++;
2124
2125 /* stats */
2126 if (depth > s->maxdepth)
2127 s->maxdepth = depth;
2128 s->totdepth += depth;
2129 s->leaves++;
2130 }
2131
2132 else {
2133 /*
2134 * New tnode. Decend one level
2135 */
2136
2137 s->tnodes++;
2138 s->nodesizes[tn->bits]++;
2139 depth++;
2140
2141 n = tn->child[cindex];
2142 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits;
2144
2145 indent += 3;
2146 cindex = 0;
2147 }
2148 }
2149 else {
2150 cindex++;
2151 s->nullpointers++;
2152 }
2153
2154 /*
2155 * Test if we are done
2156 */
2157
2158 while (cindex >= (1 << tn->bits)) {
2159
2160 /*
2161 * Move upwards and test for root
2162 * pop off all traversed nodes
2163 */
2164
2165
2166 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL;
2168 n = NULL;
2169 break;
2170 }
2171 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn);
2174 cindex++;
2175 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits;
2177 indent -= 3;
2178 depth--;
2179 }
2180 }
2181 }
2182 }
2183 else n = NULL;
2184 }
2185 }
2186
2187 read_unlock(&fib_lock);
2188 return s;
2189}
2190
2191#ifdef CONFIG_PROC_FS
2192
2193static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2194{
2195 return NULL;
2196}
2197
2198static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2199{
2200 return NULL;
2201}
2202
2203static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204{
2205 void *v = NULL;
2206
2207 if (ip_fib_main_table)
2208 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209 return v;
2210}
2211
2212static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213{
2214 ++*pos;
2215 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216}
2217
/* seq_file stop hook: nothing was acquired in start, so nothing to release. */
static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
{
}
2222
2223/*
2224 * This outputs /proc/net/fib_triestat
2225 *
2226 * It always works in backward compatibility mode.
2227 * The format of the file is not supposed to be changed.
2228 */
2229
2230static void collect_and_show(struct trie *t, struct seq_file *seq)
2231{
2232 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233 int i, max, pointers;
2234 struct trie_stat *stat;
2235 int avdepth;
2236
2237 stat = trie_collect_stats(t);
2238
2239 bytes=0;
2240 seq_printf(seq, "trie=%p\n", t);
2241
2242 if (stat) {
2243 if (stat->leaves)
2244 avdepth=stat->totdepth*100 / stat->leaves;
2245 else
2246 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253 bytes += sizeof(struct tnode) * stat->tnodes;
2254
2255 max = MAX_CHILDS-1;
2256
2257 while (max >= 0 && stat->nodesizes[max] == 0)
2258 max--;
2259 pointers = 0;
2260
2261 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i];
2265 }
2266 seq_printf(seq, "\n");
2267 seq_printf(seq, "Pointers: %d\n", pointers);
2268 bytes += sizeof(struct node *) * pointers;
2269 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2271
2272 kfree(stat);
2273 }
2274
2275#ifdef CONFIG_IP_FIB_TRIE_STATS
2276 seq_printf(seq, "Counters:\n---------\n");
2277 seq_printf(seq,"gets = %d\n", t->stats.gets);
2278 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif
2285#endif /* CONFIG_IP_FIB_TRIE_STATS */
2286}
2287
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{
2290 char bf[128];
2291
2292 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local)
2296 collect_and_show(trie_local, seq);
2297
2298 if (trie_main)
2299 collect_and_show(trie_main, seq);
2300 }
2301 else {
2302 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400);
2304
2305 seq_printf(seq, "%-127s\n", bf);
2306 }
2307 return 0;
2308}
2309
2310static struct seq_operations fib_triestat_seq_ops = {
2311 .start = fib_triestat_seq_start,
2312 .next = fib_triestat_seq_next,
2313 .stop = fib_triestat_seq_stop,
2314 .show = fib_triestat_seq_show,
2315};
2316
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318{
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321
2322 rc = seq_open(file, &fib_triestat_seq_ops);
2323 if (rc)
2324 goto out_kfree;
2325
2326 seq = file->private_data;
2327out:
2328 return rc;
2329out_kfree:
2330 goto out;
2331}
2332
2333static struct file_operations fib_triestat_seq_fops = {
2334 .owner = THIS_MODULE,
2335 .open = fib_triestat_seq_open,
2336 .read = seq_read,
2337 .llseek = seq_lseek,
2338 .release = seq_release_private,
2339};
2340
2341int __init fib_stat_proc_init(void)
2342{
2343 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344 return -ENOMEM;
2345 return 0;
2346}
2347
2348void __init fib_stat_proc_exit(void)
2349{
2350 proc_net_remove("fib_triestat");
2351}
2352
2353static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2354{
2355 return NULL;
2356}
2357
2358static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2359{
2360 return NULL;
2361}
2362
2363static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364{
2365 void *v = NULL;
2366
2367 if (ip_fib_main_table)
2368 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369 return v;
2370}
2371
2372static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373{
2374 ++*pos;
2375 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376}
2377
/* seq_file stop hook: nothing was acquired in start, so nothing to release. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
}
2382
2383/*
2384 * This outputs /proc/net/fib_trie.
2385 *
2386 * It always works in backward compatibility mode.
2387 * The format of the file is not supposed to be changed.
2388 */
2389
2390static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{
2392 char bf[128];
2393
2394 if (v == SEQ_START_TOKEN) {
2395 if (trie_local)
2396 trie_dump_seq(seq, trie_local);
2397
2398 if (trie_main)
2399 trie_dump_seq(seq, trie_main);
2400 }
2401
2402 else {
2403 snprintf(bf, sizeof(bf),
2404 "*\t%08X\t%08X", 200, 400);
2405 seq_printf(seq, "%-127s\n", bf);
2406 }
2407
2408 return 0;
2409}
2410
2411static struct seq_operations fib_trie_seq_ops = {
2412 .start = fib_trie_seq_start,
2413 .next = fib_trie_seq_next,
2414 .stop = fib_trie_seq_stop,
2415 .show = fib_trie_seq_show,
2416};
2417
2418static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419{
2420 struct seq_file *seq;
2421 int rc = -ENOMEM;
2422
2423 rc = seq_open(file, &fib_trie_seq_ops);
2424 if (rc)
2425 goto out_kfree;
2426
2427 seq = file->private_data;
2428out:
2429 return rc;
2430out_kfree:
2431 goto out;
2432}
2433
2434static struct file_operations fib_trie_seq_fops = {
2435 .owner = THIS_MODULE,
2436 .open = fib_trie_seq_open,
2437 .read = seq_read,
2438 .llseek = seq_lseek,
2439 .release = seq_release_private,
2440};
2441
2442int __init fib_proc_init(void)
2443{
2444 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445 return -ENOMEM;
2446 return 0;
2447}
2448
2449void __init fib_proc_exit(void)
2450{
2451 proc_net_remove("fib_trie");
2452}
2453
2454#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7c..af2ec88bbb2f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
184 raw_rcv(last, skb2); 184 raw_rcv(last, skb2);
185 } 185 }
186 last = sk; 186 last = sk;
187 nf_reset(skb);
187 } 188 }
188 } 189 }
189 190
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{ 201{
201 int ihl = skb->nh.iph->ihl*4; 202 int ihl = skb->nh.iph->ihl*4;
202 203
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl); 204 __skb_pull(skb, ihl);
208 205
209 /* Free reference early: we don't need it any more, and it may 206 /* Free reference early: we don't need it any more, and it may
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d65..ee07aec215a0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 nf_reset(newskb); 110 nf_reset(newskb);
115 netif_rx(newskb); 111 netif_rx(newskb);
116 return 0; 112 return 0;
@@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
192 skb = skb2; 188 skb = skb2;
193 } 189 }
194 190
195#ifdef CONFIG_NETFILTER_DEBUG
196 nf_debug_ip_finish_output2(skb);
197#endif /*CONFIG_NETFILTER_DEBUG*/
198
199 nf_reset(skb); 191 nf_reset(skb);
200 192
201 if (hh) { 193 if (hh) {
@@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
415 to->nf_bridge = from->nf_bridge; 407 to->nf_bridge = from->nf_bridge;
416 nf_bridge_get(to->nf_bridge); 408 nf_bridge_get(to->nf_bridge);
417#endif 409#endif
418#ifdef CONFIG_NETFILTER_DEBUG
419 to->nf_debug = from->nf_debug;
420#endif
421#endif 410#endif
422} 411}
423 412
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62a..e4f809a93f47 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb)
1350 */ 1350 */
1351 read_lock(&mrt_lock); 1351 read_lock(&mrt_lock);
1352 if (mroute_socket) { 1352 if (mroute_socket) {
1353 nf_reset(skb);
1353 raw_rcv(mroute_socket, skb); 1354 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock); 1355 read_unlock(&mrt_lock);
1355 return 0; 1356 return 0;
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057f..a8512a3fd08a 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0a..fa1634256680 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
60 60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h> 63#include <linux/netfilter_ipv4/listhelp.h>
65 64
66struct arpt_table_info { 65struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd062605..a78a320eee08 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
26#include <net/checksum.h> 26#include <net/checksum.h>
27#include <net/udp.h> 27#include <net/udp.h>
28 28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 29#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 30#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32 31
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42 41
43/* This is slow, but it's simple. --RR */ 42/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536]; 43static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock); 44static DEFINE_SPINLOCK(amanda_buffer_lock);
46 45
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo, 47 enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
76 return NF_ACCEPT; 75 return NF_ACCEPT;
77 } 76 }
78 77
79 LOCK_BH(&amanda_buffer_lock); 78 spin_lock_bh(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 79 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer; 80 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff; 81 data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb,
134 } 133 }
135 134
136out: 135out:
137 UNLOCK_BH(&amanda_buffer_lock); 136 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 137 return ret;
139} 138}
140 139
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e824622977..4b78ebeb6635 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40 40
41/* This rwlock protects the main hash table, protocol/helper/expected 41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/ 42 registrations, conntrack timers*/
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 43#define ASSERT_READ_LOCK(x)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 44#define ASSERT_WRITE_LOCK(x)
45 45
46#include <linux/netfilter_ipv4/ip_conntrack.h> 46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
57#define DEBUGP(format, args...) 57#define DEBUGP(format, args...)
58#endif 58#endif
59 59
60DECLARE_RWLOCK(ip_conntrack_lock); 60DEFINE_RWLOCK(ip_conntrack_lock);
61 61
62/* ip_conntrack_standalone needs this */ 62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0); 63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp)
147 147
148static void unlink_expect(struct ip_conntrack_expect *exp) 148static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 150 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
151 list_del(&exp->list); 151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--; 153 exp->master->expecting--;
@@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect)
157{ 157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect; 158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159 159
160 WRITE_LOCK(&ip_conntrack_lock); 160 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock); 162 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 163 destroy_expect(exp);
164} 164}
165 165
@@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct)
209 unsigned int ho, hr; 209 unsigned int ho, hr;
210 210
211 DEBUGP("clean_from_lists(%p)\n", ct); 211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 212 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
213 213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
240 if (ip_conntrack_destroyed) 240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct); 241 ip_conntrack_destroyed(ct);
242 242
243 WRITE_LOCK(&ip_conntrack_lock); 243 write_lock_bh(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists, 244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet, 245 * except TFTP can create an expectation on the first packet,
246 * before connection is in the list, so we need to clean here, 246 * before connection is in the list, so we need to clean here,
@@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
254 } 254 }
255 255
256 CONNTRACK_STAT_INC(delete); 256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock); 257 write_unlock_bh(&ip_conntrack_lock);
258 258
259 if (ct->master) 259 if (ct->master)
260 ip_conntrack_put(ct->master); 260 ip_conntrack_put(ct->master);
@@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
268{ 268{
269 struct ip_conntrack *ct = (void *)ul_conntrack; 269 struct ip_conntrack *ct = (void *)ul_conntrack;
270 270
271 WRITE_LOCK(&ip_conntrack_lock); 271 write_lock_bh(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path. 272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */ 273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list); 274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct); 275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock); 276 write_unlock_bh(&ip_conntrack_lock);
277 ip_conntrack_put(ct); 277 ip_conntrack_put(ct);
278} 278}
279 279
@@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple, 282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack) 283 const struct ip_conntrack *ignored_conntrack)
284{ 284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 285 ASSERT_READ_LOCK(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack 286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple); 287 && ip_ct_tuple_equal(tuple, &i->tuple);
288} 288}
@@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
294 struct ip_conntrack_tuple_hash *h; 294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple); 295 unsigned int hash = hash_conntrack(tuple);
296 296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 297 ASSERT_READ_LOCK(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) { 298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { 299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found); 300 CONNTRACK_STAT_INC(found);
@@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
313{ 313{
314 struct ip_conntrack_tuple_hash *h; 314 struct ip_conntrack_tuple_hash *h;
315 315
316 READ_LOCK(&ip_conntrack_lock); 316 read_lock_bh(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack); 317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h) 318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock); 320 read_unlock_bh(&ip_conntrack_lock);
321 321
322 return h; 322 return h;
323} 323}
@@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
352 IP_NF_ASSERT(!is_confirmed(ct)); 352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct); 353 DEBUGP("Confirming conntrack %p\n", ct);
354 354
355 WRITE_LOCK(&ip_conntrack_lock); 355 write_lock_bh(&ip_conntrack_lock);
356 356
357 /* See if there's one in the list already, including reverse: 357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're 358 NAT could have grabbed it without realizing, since we're
@@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
380 atomic_inc(&ct->ct_general.use); 380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status); 381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert); 382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock); 383 write_unlock_bh(&ip_conntrack_lock);
384 return NF_ACCEPT; 384 return NF_ACCEPT;
385 } 385 }
386 386
387 CONNTRACK_STAT_INC(insert_failed); 387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock); 388 write_unlock_bh(&ip_conntrack_lock);
389 389
390 return NF_DROP; 390 return NF_DROP;
391} 391}
@@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
398{ 398{
399 struct ip_conntrack_tuple_hash *h; 399 struct ip_conntrack_tuple_hash *h;
400 400
401 READ_LOCK(&ip_conntrack_lock); 401 read_lock_bh(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack); 402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock); 403 read_unlock_bh(&ip_conntrack_lock);
404 404
405 return h != NULL; 405 return h != NULL;
406} 406}
@@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain)
419 struct ip_conntrack *ct = NULL; 419 struct ip_conntrack *ct = NULL;
420 int dropped = 0; 420 int dropped = 0;
421 421
422 READ_LOCK(&ip_conntrack_lock); 422 read_lock_bh(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); 423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) { 424 if (h) {
425 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use); 426 atomic_inc(&ct->ct_general.use);
427 } 427 }
428 READ_UNLOCK(&ip_conntrack_lock); 428 read_unlock_bh(&ip_conntrack_lock);
429 429
430 if (!ct) 430 if (!ct)
431 return dropped; 431 return dropped;
@@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
508 conntrack->timeout.data = (unsigned long)conntrack; 508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout; 509 conntrack->timeout.function = death_by_timeout;
510 510
511 WRITE_LOCK(&ip_conntrack_lock); 511 write_lock_bh(&ip_conntrack_lock);
512 exp = find_expectation(tuple); 512 exp = find_expectation(tuple);
513 513
514 if (exp) { 514 if (exp) {
@@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); 532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533 533
534 atomic_inc(&ip_conntrack_count); 534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock); 535 write_unlock_bh(&ip_conntrack_lock);
536 536
537 if (exp) { 537 if (exp) {
538 if (exp->expectfn) 538 if (exp->expectfn)
@@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{ 723{
724 struct ip_conntrack_expect *i; 724 struct ip_conntrack_expect *i;
725 725
726 WRITE_LOCK(&ip_conntrack_lock); 726 write_lock_bh(&ip_conntrack_lock);
727 /* choose the the oldest expectation to evict */ 727 /* choose the the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { 728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock); 731 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 732 destroy_expect(i);
733 return; 733 return;
734 } 734 }
735 } 735 }
736 WRITE_UNLOCK(&ip_conntrack_lock); 736 write_unlock_bh(&ip_conntrack_lock);
737} 737}
738 738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
@@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
760 exp->master->expecting++; 760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 761 list_add(&exp->list, &ip_conntrack_expect_list);
762 762
763 if (exp->master->helper->timeout) { 763 init_timer(&exp->timeout);
764 init_timer(&exp->timeout); 764 exp->timeout.data = (unsigned long)exp;
765 exp->timeout.data = (unsigned long)exp; 765 exp->timeout.function = expectation_timed_out;
766 exp->timeout.function = expectation_timed_out; 766 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
767 exp->timeout.expires 767 add_timer(&exp->timeout);
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772 768
773 CONNTRACK_STAT_INC(expect_create); 769 CONNTRACK_STAT_INC(expect_create);
774} 770}
@@ -808,7 +804,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); 804 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); 805 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810 806
811 WRITE_LOCK(&ip_conntrack_lock); 807 write_lock_bh(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) { 808 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) { 809 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */ 810 /* Refresh timer: if it's dying, ignore.. */
@@ -832,7 +828,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
832 ip_conntrack_expect_insert(expect); 828 ip_conntrack_expect_insert(expect);
833 ret = 0; 829 ret = 0;
834out: 830out:
835 WRITE_UNLOCK(&ip_conntrack_lock); 831 write_unlock_bh(&ip_conntrack_lock);
836 return ret; 832 return ret;
837} 833}
838 834
@@ -841,7 +837,7 @@ out:
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, 837void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply) 838 const struct ip_conntrack_tuple *newreply)
843{ 839{
844 WRITE_LOCK(&ip_conntrack_lock); 840 write_lock_bh(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */ 841 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack)); 842 IP_NF_ASSERT(!is_confirmed(conntrack));
847 843
@@ -851,15 +847,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 847 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0) 848 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply); 849 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock); 850 write_unlock_bh(&ip_conntrack_lock);
855} 851}
856 852
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me) 853int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{ 854{
859 BUG_ON(me->timeout == 0); 855 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock); 856 write_lock_bh(&ip_conntrack_lock);
861 list_prepend(&helpers, me); 857 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock); 858 write_unlock_bh(&ip_conntrack_lock);
863 859
864 return 0; 860 return 0;
865} 861}
@@ -878,7 +874,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
878 struct ip_conntrack_expect *exp, *tmp; 874 struct ip_conntrack_expect *exp, *tmp;
879 875
880 /* Need write lock here, to delete helper. */ 876 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock); 877 write_lock_bh(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me); 878 LIST_DELETE(&helpers, me);
883 879
884 /* Get rid of expectations */ 880 /* Get rid of expectations */
@@ -893,7 +889,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
893 for (i = 0; i < ip_conntrack_htable_size; i++) 889 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp, 890 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me); 891 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock); 892 write_unlock_bh(&ip_conntrack_lock);
897 893
898 /* Someone could be still looking at the helper in a bh. */ 894 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net(); 895 synchronize_net();
@@ -925,14 +921,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
925 ct->timeout.expires = extra_jiffies; 921 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb); 922 ct_add_counters(ct, ctinfo, skb);
927 } else { 923 } else {
928 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */ 925 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) { 926 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies; 927 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout); 928 add_timer(&ct->timeout);
933 } 929 }
934 ct_add_counters(ct, ctinfo, skb); 930 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
936 } 932 }
937} 933}
938 934
@@ -940,10 +936,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
940struct sk_buff * 936struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) 937ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{ 938{
943#ifdef CONFIG_NETFILTER_DEBUG
944 unsigned int olddebug = skb->nf_debug;
945#endif
946
947 skb_orphan(skb); 939 skb_orphan(skb);
948 940
949 local_bh_disable(); 941 local_bh_disable();
@@ -953,12 +945,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
953 if (skb) { 945 if (skb) {
954 ip_send_check(skb->nh.iph); 946 ip_send_check(skb->nh.iph);
955 skb->nfcache |= NFC_ALTERED; 947 skb->nfcache |= NFC_ALTERED;
956#ifdef CONFIG_NETFILTER_DEBUG
957 /* Packet path as if nothing had happened. */
958 skb->nf_debug = olddebug;
959#endif
960 } 948 }
961
962 return skb; 949 return skb;
963} 950}
964 951
@@ -997,7 +984,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
997{ 984{
998 struct ip_conntrack_tuple_hash *h = NULL; 985 struct ip_conntrack_tuple_hash *h = NULL;
999 986
1000 WRITE_LOCK(&ip_conntrack_lock); 987 write_lock_bh(&ip_conntrack_lock);
1001 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { 988 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1002 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, 989 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1003 struct ip_conntrack_tuple_hash *, iter, data); 990 struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +996,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1009 struct ip_conntrack_tuple_hash *, iter, data); 996 struct ip_conntrack_tuple_hash *, iter, data);
1010 if (h) 997 if (h)
1011 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 998 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1012 WRITE_UNLOCK(&ip_conntrack_lock); 999 write_unlock_bh(&ip_conntrack_lock);
1013 1000
1014 return h; 1001 return h;
1015} 1002}
@@ -1201,14 +1188,14 @@ int __init ip_conntrack_init(void)
1201 } 1188 }
1202 1189
1203 /* Don't NEED lock here, but good form anyway. */ 1190 /* Don't NEED lock here, but good form anyway. */
1204 WRITE_LOCK(&ip_conntrack_lock); 1191 write_lock_bh(&ip_conntrack_lock);
1205 for (i = 0; i < MAX_IP_CT_PROTO; i++) 1192 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1206 ip_ct_protos[i] = &ip_conntrack_generic_protocol; 1193 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1207 /* Sew in builtin protocols. */ 1194 /* Sew in builtin protocols. */
1208 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; 1195 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1209 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; 1196 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1210 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; 1197 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1211 WRITE_UNLOCK(&ip_conntrack_lock); 1198 write_unlock_bh(&ip_conntrack_lock);
1212 1199
1213 for (i = 0; i < ip_conntrack_htable_size; i++) 1200 for (i = 0; i < ip_conntrack_htable_size; i++)
1214 INIT_LIST_HEAD(&ip_conntrack_hash[i]); 1201 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa788..fea6dd2a00b6 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
16#include <net/checksum.h> 16#include <net/checksum.h>
17#include <net/tcp.h> 17#include <net/tcp.h>
18 18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> 20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h> 21#include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
28/* This is slow, but it's simple. --RR */ 27/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536]; 28static char ftp_buffer[65536];
30 29
31static DECLARE_LOCK(ip_ftp_lock); 30static DEFINE_SPINLOCK(ip_ftp_lock);
32 31
33#define MAX_PORTS 8 32#define MAX_PORTS 8
34static int ports[MAX_PORTS]; 33static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
319 } 318 }
320 datalen = (*pskb)->len - dataoff; 319 datalen = (*pskb)->len - dataoff;
321 320
322 LOCK_BH(&ip_ftp_lock); 321 spin_lock_bh(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff, 322 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer); 323 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL); 324 BUG_ON(fb_ptr == NULL);
@@ -442,7 +441,7 @@ out_update_nl:
442 if (ends_in_nl) 441 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir); 442 update_nl_seq(seq, ct_ftp_info,dir);
444 out: 443 out:
445 UNLOCK_BH(&ip_ftp_lock); 444 spin_unlock_bh(&ip_ftp_lock);
446 return ret; 445 return ret;
447} 446}
448 447
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6ee..cd98772cc332 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
29#include <net/checksum.h> 29#include <net/checksum.h>
30#include <net/tcp.h> 30#include <net/tcp.h>
31 31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h> 33#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536]; 42static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
45 44
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo, 46 enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
141 if (dataoff >= (*pskb)->len) 140 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT; 141 return NF_ACCEPT;
143 142
144 LOCK_BH(&irc_buffer_lock); 143 spin_lock_bh(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff, 144 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer); 145 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL); 146 BUG_ON(ib_ptr == NULL);
@@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb,
237 } /* while data < ... */ 236 } /* while data < ... */
238 237
239 out: 238 out:
240 UNLOCK_BH(&irc_buffer_lock); 239 spin_unlock_bh(&irc_buffer_lock);
241 return ret; 240 return ret;
242} 241}
243 242
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860ff..31d75390bf12 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/netfilter_ipv4/ip_conntrack.h> 27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30 29
31#if 0 30#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) 31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
35#endif 34#endif
36 35
37/* Protects conntrack->proto.sctp */ 36/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock); 37static DEFINE_RWLOCK(sctp_lock);
39 38
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR 40 closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
199 DEBUGP(__FUNCTION__); 198 DEBUGP(__FUNCTION__);
200 DEBUGP("\n"); 199 DEBUGP("\n");
201 200
202 READ_LOCK(&sctp_lock); 201 read_lock_bh(&sctp_lock);
203 state = conntrack->proto.sctp.state; 202 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock); 203 read_unlock_bh(&sctp_lock);
205 204
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]); 205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207} 206}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
343 342
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; 343 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) { 344 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock); 345 write_lock_bh(&sctp_lock);
347 346
348 /* Special cases of Verification tag check (Sec 8.5.1) */ 347 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) { 348 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */ 349 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) { 350 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock); 351 write_unlock_bh(&sctp_lock);
353 return -1; 352 return -1;
354 } 353 }
355 } else if (sch->type == SCTP_CID_ABORT) { 354 } else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) 356 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag 357 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) { 358 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock); 359 write_unlock_bh(&sctp_lock);
361 return -1; 360 return -1;
362 } 361 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { 362 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
366 && !(sh->vtag == conntrack->proto.sctp.vtag 365 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)] 366 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) { 367 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock); 368 write_unlock_bh(&sctp_lock);
370 return -1; 369 return -1;
371 } 370 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) { 371 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */ 372 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { 373 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock); 374 write_unlock_bh(&sctp_lock);
376 return -1; 375 return -1;
377 } 376 }
378 } 377 }
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
384 if (newconntrack == SCTP_CONNTRACK_MAX) { 383 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", 384 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate); 385 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock); 386 write_unlock_bh(&sctp_lock);
388 return -1; 387 return -1;
389 } 388 }
390 389
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), 395 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr); 396 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) { 397 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock); 398 write_unlock_bh(&sctp_lock);
400 return -1; 399 return -1;
401 } 400 }
402 DEBUGP("Setting vtag %x for dir %d\n", 401 DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
405 } 404 }
406 405
407 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock); 407 write_unlock_bh(&sctp_lock);
409 } 408 }
410 409
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); 410 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b4..809dfed766d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h> 37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40 39
41#if 0 40#if 0
42#define DEBUGP printk 41#define DEBUGP printk
@@ -46,7 +45,7 @@
46#endif 45#endif
47 46
48/* Protects conntrack->proto.tcp */ 47/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock); 48static DEFINE_RWLOCK(tcp_lock);
50 49
51/* "Be conservative in what you do, 50/* "Be conservative in what you do,
52 be liberal in what you accept from others." 51 be liberal in what you accept from others."
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
330{ 329{
331 enum tcp_conntrack state; 330 enum tcp_conntrack state;
332 331
333 READ_LOCK(&tcp_lock); 332 read_lock_bh(&tcp_lock);
334 state = conntrack->proto.tcp.state; 333 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock); 334 read_unlock_bh(&tcp_lock);
336 335
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338} 337}
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
738 737
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); 738 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740 739
741 WRITE_LOCK(&tcp_lock); 740 write_lock_bh(&tcp_lock);
742 /* 741 /*
743 * We have to worry for the ack in the reply packet only... 742 * We have to worry for the ack in the reply packet only...
744 */ 743 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end)) 744 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end; 745 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end; 746 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock); 747 write_unlock_bh(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " 748 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 749 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin, 750 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
857 sizeof(_tcph), &_tcph); 856 sizeof(_tcph), &_tcph);
858 BUG_ON(th == NULL); 857 BUG_ON(th == NULL);
859 858
860 WRITE_LOCK(&tcp_lock); 859 write_lock_bh(&tcp_lock);
861 old_state = conntrack->proto.tcp.state; 860 old_state = conntrack->proto.tcp.state;
862 dir = CTINFO2DIR(ctinfo); 861 dir = CTINFO2DIR(ctinfo);
863 index = get_conntrack_index(th); 862 index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
879 * that the client cannot but retransmit its SYN and 878 * that the client cannot but retransmit its SYN and
880 * thus initiate a clean new session. 879 * thus initiate a clean new session.
881 */ 880 */
882 WRITE_UNLOCK(&tcp_lock); 881 write_unlock_bh(&tcp_lock);
883 if (LOG_INVALID(IPPROTO_TCP)) 882 if (LOG_INVALID(IPPROTO_TCP))
884 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 883 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
885 "ip_ct_tcp: killing out of sync session "); 884 "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
894 conntrack->proto.tcp.last_end = 893 conntrack->proto.tcp.last_end =
895 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); 894 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
896 895
897 WRITE_UNLOCK(&tcp_lock); 896 write_unlock_bh(&tcp_lock);
898 if (LOG_INVALID(IPPROTO_TCP)) 897 if (LOG_INVALID(IPPROTO_TCP))
899 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 898 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
900 "ip_ct_tcp: invalid packet ignored "); 899 "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
904 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 903 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
905 dir, get_conntrack_index(th), 904 dir, get_conntrack_index(th),
906 old_state); 905 old_state);
907 WRITE_UNLOCK(&tcp_lock); 906 write_unlock_bh(&tcp_lock);
908 if (LOG_INVALID(IPPROTO_TCP)) 907 if (LOG_INVALID(IPPROTO_TCP))
909 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 908 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
910 "ip_ct_tcp: invalid state "); 909 "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
918 conntrack->proto.tcp.seen[dir].td_end)) { 917 conntrack->proto.tcp.seen[dir].td_end)) {
919 /* Attempt to reopen a closed connection. 918 /* Attempt to reopen a closed connection.
920 * Delete this connection and look up again. */ 919 * Delete this connection and look up again. */
921 WRITE_UNLOCK(&tcp_lock); 920 write_unlock_bh(&tcp_lock);
922 if (del_timer(&conntrack->timeout)) 921 if (del_timer(&conntrack->timeout))
923 conntrack->timeout.function((unsigned long) 922 conntrack->timeout.function((unsigned long)
924 conntrack); 923 conntrack);
925 return -NF_REPEAT; 924 return -NF_REPEAT;
926 } else { 925 } else {
927 WRITE_UNLOCK(&tcp_lock); 926 write_unlock_bh(&tcp_lock);
928 if (LOG_INVALID(IPPROTO_TCP)) 927 if (LOG_INVALID(IPPROTO_TCP))
929 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 928 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
930 "ip_ct_tcp: invalid SYN"); 929 "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
949 948
950 if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 949 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
951 skb, iph, th)) { 950 skb, iph, th)) {
952 WRITE_UNLOCK(&tcp_lock); 951 write_unlock_bh(&tcp_lock);
953 return -NF_ACCEPT; 952 return -NF_ACCEPT;
954 } 953 }
955 in_window: 954 in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
972 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans 971 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
973 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans 972 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
974 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
975 WRITE_UNLOCK(&tcp_lock); 974 write_unlock_bh(&tcp_lock);
976 975
977 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
978 /* If only reply is a RST, we can consider ourselves not to 977 /* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a224623..8c1eaba098d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * and moreover root might send raw packets. 120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */ 121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING 122 if (hooknum == NF_IP_PRE_ROUTING
123 && skb->ip_summed != CHECKSUM_UNNECESSARY
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, 124 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum 125 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index bc59f7b39805..42dc95102873 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
119 119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{ 121{
122 READ_LOCK(&ip_conntrack_lock); 122 read_lock_bh(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos); 123 return ct_get_idx(seq, *pos);
124} 124}
125 125
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
131 131
132static void ct_seq_stop(struct seq_file *s, void *v) 132static void ct_seq_stop(struct seq_file *s, void *v)
133{ 133{
134 READ_UNLOCK(&ip_conntrack_lock); 134 read_unlock_bh(&ip_conntrack_lock);
135} 135}
136 136
137static int ct_seq_show(struct seq_file *s, void *v) 137static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); 140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto; 141 struct ip_conntrack_protocol *proto;
142 142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 143 ASSERT_READ_LOCK(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack); 144 IP_NF_ASSERT(conntrack);
145 145
146 /* we only want to print DIR_ORIGINAL */ 146 /* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
239 239
240 /* strange seq_file api calls stop even if we fail, 240 /* strange seq_file api calls stop even if we fail,
241 * thus we need to grab lock since stop unlocks */ 241 * thus we need to grab lock since stop unlocks */
242 READ_LOCK(&ip_conntrack_lock); 242 read_lock_bh(&ip_conntrack_lock);
243 243
244 if (list_empty(e)) 244 if (list_empty(e))
245 return NULL; 245 return NULL;
@@ -267,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
267 267
268static void exp_seq_stop(struct seq_file *s, void *v) 268static void exp_seq_stop(struct seq_file *s, void *v)
269{ 269{
270 READ_UNLOCK(&ip_conntrack_lock); 270 read_unlock_bh(&ip_conntrack_lock);
271} 271}
272 272
273static int exp_seq_show(struct seq_file *s, void *v) 273static int exp_seq_show(struct seq_file *s, void *v)
@@ -921,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
921{ 921{
922 int ret = 0; 922 int ret = 0;
923 923
924 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { 925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
926 ret = -EBUSY; 926 ret = -EBUSY;
927 goto out; 927 goto out;
928 } 928 }
929 ip_ct_protos[proto->proto] = proto; 929 ip_ct_protos[proto->proto] = proto;
930 out: 930 out:
931 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
932 return ret; 932 return ret;
933} 933}
934 934
935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) 935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
936{ 936{
937 WRITE_LOCK(&ip_conntrack_lock); 937 write_lock_bh(&ip_conntrack_lock);
938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; 938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
939 WRITE_UNLOCK(&ip_conntrack_lock); 939 write_unlock_bh(&ip_conntrack_lock);
940 940
941 /* Somebody could be still looking at the proto in bh. */ 941 /* Somebody could be still looking at the proto in bh. */
942 synchronize_net(); 942 synchronize_net();
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0dd..739b6dde1c82 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24 24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 25#define ASSERT_READ_LOCK(x)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 26#define ASSERT_WRITE_LOCK(x)
27 27
28#include <linux/netfilter_ipv4/ip_conntrack.h> 28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h> 29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
41#define DEBUGP(format, args...) 41#define DEBUGP(format, args...)
42#endif 42#endif
43 43
44DECLARE_RWLOCK(ip_nat_lock); 44DEFINE_RWLOCK(ip_nat_lock);
45 45
46/* Calculated at init based on memory size */ 46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
65 if (!(conn->status & IPS_NAT_DONE_MASK)) 65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return; 66 return;
67 67
68 WRITE_LOCK(&ip_nat_lock); 68 write_lock_bh(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource); 69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock); 70 write_unlock_bh(&ip_nat_lock);
71} 71}
72 72
73/* We do checksum mangling, so if they were wrong before they're still 73/* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
142 unsigned int h = hash_by_src(tuple); 142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct; 143 struct ip_conntrack *ct;
144 144
145 READ_LOCK(&ip_nat_lock); 145 read_lock_bh(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) { 146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) { 147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */ 148 /* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
151 result->dst = tuple->dst; 151 result->dst = tuple->dst;
152 152
153 if (in_range(result, range)) { 153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock); 154 read_unlock_bh(&ip_nat_lock);
155 return 1; 155 return 1;
156 } 156 }
157 } 157 }
158 } 158 }
159 READ_UNLOCK(&ip_nat_lock); 159 read_unlock_bh(&ip_nat_lock);
160 return 0; 160 return 0;
161} 161}
162 162
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
297 unsigned int srchash 297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple); 299 .tuple);
300 WRITE_LOCK(&ip_nat_lock); 300 write_lock_bh(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]); 301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock); 302 write_unlock_bh(&ip_nat_lock);
303 } 303 }
304 304
305 /* It's done. */ 305 /* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 476
477 WRITE_LOCK(&ip_nat_lock); 477 write_lock_bh(&ip_nat_lock);
478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { 478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
479 ret = -EBUSY; 479 ret = -EBUSY;
480 goto out; 480 goto out;
481 } 481 }
482 ip_nat_protos[proto->protonum] = proto; 482 ip_nat_protos[proto->protonum] = proto;
483 out: 483 out:
484 WRITE_UNLOCK(&ip_nat_lock); 484 write_unlock_bh(&ip_nat_lock);
485 return ret; 485 return ret;
486} 486}
487 487
488/* Noone stores the protocol anywhere; simply delete it. */ 488/* Noone stores the protocol anywhere; simply delete it. */
489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) 489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
490{ 490{
491 WRITE_LOCK(&ip_nat_lock); 491 write_lock_bh(&ip_nat_lock);
492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; 492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
493 WRITE_UNLOCK(&ip_nat_lock); 493 write_unlock_bh(&ip_nat_lock);
494 494
495 /* Someone could be still looking at the proto in a bh. */ 495 /* Someone could be still looking at the proto in a bh. */
496 synchronize_net(); 496 synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
509 return -ENOMEM; 509 return -ENOMEM;
510 510
511 /* Sew in builtin protocols. */ 511 /* Sew in builtin protocols. */
512 WRITE_LOCK(&ip_nat_lock); 512 write_lock_bh(&ip_nat_lock);
513 for (i = 0; i < MAX_IP_NAT_PROTO; i++) 513 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
514 ip_nat_protos[i] = &ip_nat_unknown_protocol; 514 ip_nat_protos[i] = &ip_nat_unknown_protocol;
515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; 515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; 516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; 517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
518 WRITE_UNLOCK(&ip_nat_lock); 518 write_unlock_bh(&ip_nat_lock);
519 519
520 for (i = 0; i < ip_nat_htable_size; i++) { 520 for (i = 0; i < ip_nat_htable_size; i++) {
521 INIT_LIST_HEAD(&bysource[i]); 521 INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c01..158f34f32c04 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
28#include <net/tcp.h> 28#include <net/tcp.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
47#define DUMP_OFFSET(x) 47#define DUMP_OFFSET(x)
48#endif 48#endif
49 49
50static DECLARE_LOCK(ip_nat_seqofs_lock); 50static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
51 51
52/* Setup TCP sequence correction given this change at this sequence */ 52/* Setup TCP sequence correction given this change at this sequence */
53static inline void 53static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
70 DEBUGP("ip_nat_resize_packet: Seq_offset before: "); 70 DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
71 DUMP_OFFSET(this_way); 71 DUMP_OFFSET(this_way);
72 72
73 LOCK_BH(&ip_nat_seqofs_lock); 73 spin_lock_bh(&ip_nat_seqofs_lock);
74 74
75 /* SYN adjust. If it's uninitialized, or this is after last 75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one 76 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
82 this_way->offset_before = this_way->offset_after; 82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff; 83 this_way->offset_after += sizediff;
84 } 84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock); 85 spin_unlock_bh(&ip_nat_seqofs_lock);
86 86
87 DEBUGP("ip_nat_resize_packet: Seq_offset after: "); 87 DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
88 DUMP_OFFSET(this_way); 88 DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
142 /* Transfer socket to new skb. */ 142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk) 143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk); 144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb); 145 kfree_skb(*pskb);
149 *pskb = nskb; 146 *pskb = nskb;
150 return 1; 147 return 1;
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a24..60d70fa41a15 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
19#include <net/route.h> 19#include <net/route.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21 21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 22#define ASSERT_READ_LOCK(x)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 23#define ASSERT_WRITE_LOCK(x)
24 24
25#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h> 26#include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b33..bc59d0d6e89e 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33 33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 34#define ASSERT_READ_LOCK(x)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 35#define ASSERT_WRITE_LOCK(x)
36 36
37#include <linux/netfilter_ipv4/ip_nat.h> 37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h> 38#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -373,7 +373,6 @@ static int init_or_cleanup(int init)
373 cleanup_rule_init: 373 cleanup_rule_init:
374 ip_nat_rule_cleanup(); 374 ip_nat_rule_cleanup();
375 cleanup_nothing: 375 cleanup_nothing:
376 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
377 return ret; 376 return ret;
378} 377}
379 378
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b8496..c88dfcd38c56 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
67/* Must have mutex */ 67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h> 70#include <linux/netfilter_ipv4/listhelp.h>
72 71
73#if 0 72#if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc73..dc4362b57cfa 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,6 @@
29#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33 32
34#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.6"
35 34
@@ -41,6 +40,8 @@
41#define DEBUGP 40#define DEBUGP
42#endif 41#endif
43 42
43#define ASSERT_READ_LOCK(x)
44
44MODULE_LICENSE("GPL"); 45MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); 46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP"); 47MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
67 68
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable 69/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structurses (num_local_nodes, local_nodes[]) */ 70 * data within all structurses (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock); 71static DEFINE_RWLOCK(clusterip_lock);
71 72
72#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops; 74static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
82static inline void 83static inline void
83clusterip_config_put(struct clusterip_config *c) { 84clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) { 85 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock); 86 write_lock_bh(&clusterip_lock);
86 list_del(&c->list); 87 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock); 88 write_unlock_bh(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 89 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev); 90 dev_put(c->dev);
90 kfree(c); 91 kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
97{ 98{
98 struct list_head *pos; 99 struct list_head *pos;
99 100
100 MUST_BE_READ_LOCKED(&clusterip_lock); 101 ASSERT_READ_LOCK(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) { 102 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos, 103 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list); 104 struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
114{ 115{
115 struct clusterip_config *c; 116 struct clusterip_config *c;
116 117
117 READ_LOCK(&clusterip_lock); 118 read_lock_bh(&clusterip_lock);
118 c = __clusterip_config_find(clusterip); 119 c = __clusterip_config_find(clusterip);
119 if (!c) { 120 if (!c) {
120 READ_UNLOCK(&clusterip_lock); 121 read_unlock_bh(&clusterip_lock);
121 return NULL; 122 return NULL;
122 } 123 }
123 atomic_inc(&c->refcount); 124 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock); 125 read_unlock_bh(&clusterip_lock);
125 126
126 return c; 127 return c;
127} 128}
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
160 c->pde->data = c; 161 c->pde->data = c;
161#endif 162#endif
162 163
163 WRITE_LOCK(&clusterip_lock); 164 write_lock_bh(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs); 165 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock); 166 write_unlock_bh(&clusterip_lock);
166 167
167 return c; 168 return c;
168} 169}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{ 173{
173 int i; 174 int i;
174 175
175 WRITE_LOCK(&clusterip_lock); 176 write_lock_bh(&clusterip_lock);
176 177
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES 178 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) { 179 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock); 180 write_unlock_bh(&clusterip_lock);
180 return 1; 181 return 1;
181 } 182 }
182 183
183 /* check if we alrady have this number in our array */ 184 /* check if we alrady have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) { 185 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) { 186 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock); 187 write_unlock_bh(&clusterip_lock);
187 return 1; 188 return 1;
188 } 189 }
189 } 190 }
190 191
191 c->local_nodes[c->num_local_nodes++] = nodenum; 192 c->local_nodes[c->num_local_nodes++] = nodenum;
192 193
193 WRITE_UNLOCK(&clusterip_lock); 194 write_unlock_bh(&clusterip_lock);
194 return 0; 195 return 0;
195} 196}
196 197
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{ 200{
200 int i; 201 int i;
201 202
202 WRITE_LOCK(&clusterip_lock); 203 write_lock_bh(&clusterip_lock);
203 204
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { 205 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock); 206 write_unlock_bh(&clusterip_lock);
206 return 1; 207 return 1;
207 } 208 }
208 209
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); 212 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); 213 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--; 214 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock); 215 write_unlock_bh(&clusterip_lock);
215 return 0; 216 return 0;
216 } 217 }
217 } 218 }
218 219
219 WRITE_UNLOCK(&clusterip_lock); 220 write_unlock_bh(&clusterip_lock);
220 return 1; 221 return 1;
221} 222}
222 223
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{ 287{
287 int i; 288 int i;
288 289
289 READ_LOCK(&clusterip_lock); 290 read_lock_bh(&clusterip_lock);
290 291
291 if (config->num_local_nodes == 0) { 292 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock); 293 read_unlock_bh(&clusterip_lock);
293 return 0; 294 return 0;
294 } 295 }
295 296
296 for (i = 0; i < config->num_local_nodes; i++) { 297 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) { 298 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock); 299 read_unlock_bh(&clusterip_lock);
299 return 1; 300 return 1;
300 } 301 }
301 } 302 }
302 303
303 READ_UNLOCK(&clusterip_lock); 304 read_unlock_bh(&clusterip_lock);
304 305
305 return 0; 306 return 0;
306} 307}
@@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
578 struct clusterip_config *c = pde->data; 579 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx; 580 unsigned int *nodeidx;
580 581
581 READ_LOCK(&clusterip_lock); 582 read_lock_bh(&clusterip_lock);
582 if (*pos >= c->num_local_nodes) 583 if (*pos >= c->num_local_nodes)
583 return NULL; 584 return NULL;
584 585
@@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
608{ 609{
609 kfree(v); 610 kfree(v);
610 611
611 READ_UNLOCK(&clusterip_lock); 612 read_unlock_bh(&clusterip_lock);
612} 613}
613 614
614static int clusterip_seq_show(struct seq_file *s, void *v) 615static int clusterip_seq_show(struct seq_file *s, void *v)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c36..91e74502c3d3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
33#endif 33#endif
34 34
35/* Lock protects masq region inside conntrack */ 35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock); 36static DEFINE_RWLOCK(masq_lock);
37 37
38/* FIXME: Multiple targets. --RR */ 38/* FIXME: Multiple targets. --RR */
39static int 39static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
103 return NF_DROP; 103 return NF_DROP;
104 } 104 }
105 105
106 WRITE_LOCK(&masq_lock); 106 write_lock_bh(&masq_lock);
107 ct->nat.masq_index = out->ifindex; 107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock); 108 write_unlock_bh(&masq_lock);
109 109
110 /* Transfer from original range. */ 110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range) 111 newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
122{ 122{
123 int ret; 123 int ret;
124 124
125 READ_LOCK(&masq_lock); 125 read_lock_bh(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex); 126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock); 127 read_unlock_bh(&masq_lock);
128 128
129 return ret; 129 return ret;
130} 130}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d64979286..915696446020 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
104static void send_reset(struct sk_buff *oldskb, int hook) 104static void send_reset(struct sk_buff *oldskb, int hook)
105{ 105{
106 struct sk_buff *nskb; 106 struct sk_buff *nskb;
107 struct iphdr *iph = oldskb->nh.iph;
107 struct tcphdr _otcph, *oth, *tcph; 108 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt; 109 struct rtable *rt;
109 u_int16_t tmp_port; 110 u_int16_t tmp_port;
110 u_int32_t tmp_addr; 111 u_int32_t tmp_addr;
112 unsigned int tcplen;
111 int needs_ack; 113 int needs_ack;
112 int hh_len; 114 int hh_len;
113 115
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
124 if (oth->rst) 126 if (oth->rst)
125 return; 127 return;
126 128
127 /* FIXME: Check checksum --RR */ 129 /* Check checksum */
130 tcplen = oldskb->len - iph->ihl * 4;
131 if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
132 (hook == NF_IP_LOCAL_IN &&
133 oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
134 csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
135 oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
136 skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
137 return;
138
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL) 139 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return; 140 return;
130 141
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16cd..52a0076302a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
56#include <linux/netfilter.h> 56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h> 57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h> 58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h> 59#include <net/sock.h>
61#include <linux/bitops.h> 60#include <linux/bitops.h>
62 61
@@ -99,8 +98,8 @@ typedef struct {
99 98
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ 99static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101 100
102static struct sock *nflognl; /* our socket */ 101static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */ 102static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
104 103
105/* send one ulog_buff_t to userspace */ 104/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum) 105static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
135 134
136 /* lock to protect against somebody modifying our structure 135 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */ 136 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock); 137 spin_lock_bh(&ulog_lock);
139 ulog_send(data); 138 ulog_send(data);
140 UNLOCK_BH(&ulog_lock); 139 spin_unlock_bh(&ulog_lock);
141} 140}
142 141
143static struct sk_buff *ulog_alloc_skb(unsigned int size) 142static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
193 192
194 ub = &ulog_buffers[groupnum]; 193 ub = &ulog_buffers[groupnum];
195 194
196 LOCK_BH(&ulog_lock); 195 spin_lock_bh(&ulog_lock);
197 196
198 if (!ub->skb) { 197 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size))) 198 if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
278 ulog_send(groupnum); 277 ulog_send(groupnum);
279 } 278 }
280 279
281 UNLOCK_BH(&ulog_lock); 280 spin_unlock_bh(&ulog_lock);
282 281
283 return; 282 return;
284 283
@@ -288,7 +287,7 @@ nlmsg_failure:
288alloc_failure: 287alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n"); 288 PRINTR("ipt_ULOG: Error building netlink message\n");
290 289
291 UNLOCK_BH(&ulog_lock); 290 spin_unlock_bh(&ulog_lock);
292} 291}
293 292
294static unsigned int ipt_ulog_target(struct sk_buff **pskb, 293static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd77..564b49bfebcf 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/netfilter_ipv4/ip_tables.h> 38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h> 39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41 40
42/* FIXME: this is just for IP_NF_ASSERRT */ 41/* FIXME: this is just for IP_NF_ASSERRT */
43#include <linux/netfilter_ipv4/ip_conntrack.h> 42#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
92 struct hlist_head hash[0]; /* hashtable itself */ 91 struct hlist_head hash[0]; /* hashtable itself */
93}; 92};
94 93
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
233 hinfo->timer.function = htable_gc; 232 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer); 233 add_timer(&hinfo->timer);
235 234
236 LOCK_BH(&hashlimit_lock); 235 spin_lock_bh(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables); 236 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock); 237 spin_unlock_bh(&hashlimit_lock);
239 238
240 return 0; 239 return 0;
241} 240}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
301 struct ipt_hashlimit_htable *hinfo; 300 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos; 301 struct hlist_node *pos;
303 302
304 LOCK_BH(&hashlimit_lock); 303 spin_lock_bh(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { 304 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) { 305 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use); 306 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock); 307 spin_unlock_bh(&hashlimit_lock);
309 return hinfo; 308 return hinfo;
310 } 309 }
311 } 310 }
312 UNLOCK_BH(&hashlimit_lock); 311 spin_unlock_bh(&hashlimit_lock);
313 312
314 return NULL; 313 return NULL;
315} 314}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
317static void htable_put(struct ipt_hashlimit_htable *hinfo) 316static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{ 317{
319 if (atomic_dec_and_test(&hinfo->use)) { 318 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock); 319 spin_lock_bh(&hashlimit_lock);
321 hlist_del(&hinfo->node); 320 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock); 321 spin_unlock_bh(&hashlimit_lock);
323 htable_destroy(hinfo); 322 htable_destroy(hinfo);
324 } 323 }
325} 324}
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d3..3e7dd014de43 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
53 return ret; 53 return ret;
54 } 54 }
55 55
56 READ_LOCK(&ip_conntrack_lock); 56 read_lock_bh(&ip_conntrack_lock);
57 if (!ct->master->helper) { 57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n", 58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 exp->expectant); 59 exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
69 ret ^= !strncmp(ct->master->helper->name, info->name, 69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name)); 70 strlen(ct->master->helper->name));
71out_unlock: 71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock); 72 read_unlock_bh(&ip_conntrack_lock);
73 return ret; 73 return ret;
74} 74}
75 75
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47a30c3188ea..14f5c53235fe 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -695,7 +695,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
695 695
696 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 696 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
697 if (onlink == 0) { 697 if (onlink == 0) {
698 ip6_del_rt(rt, NULL, NULL); 698 ip6_del_rt(rt, NULL, NULL, NULL);
699 rt = NULL; 699 rt = NULL;
700 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { 700 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
701 rt->rt6i_expires = expires; 701 rt->rt6i_expires = expires;
@@ -1340,7 +1340,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
1340 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) 1340 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
1341 rtmsg.rtmsg_flags |= RTF_NONEXTHOP; 1341 rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
1342 1342
1343 ip6_route_add(&rtmsg, NULL, NULL); 1343 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1344} 1344}
1345 1345
1346/* Create "default" multicast route to the interface */ 1346/* Create "default" multicast route to the interface */
@@ -1357,7 +1357,7 @@ static void addrconf_add_mroute(struct net_device *dev)
1357 rtmsg.rtmsg_ifindex = dev->ifindex; 1357 rtmsg.rtmsg_ifindex = dev->ifindex;
1358 rtmsg.rtmsg_flags = RTF_UP; 1358 rtmsg.rtmsg_flags = RTF_UP;
1359 rtmsg.rtmsg_type = RTMSG_NEWROUTE; 1359 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1360 ip6_route_add(&rtmsg, NULL, NULL); 1360 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1361} 1361}
1362 1362
1363static void sit_route_add(struct net_device *dev) 1363static void sit_route_add(struct net_device *dev)
@@ -1374,7 +1374,7 @@ static void sit_route_add(struct net_device *dev)
1374 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; 1374 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
1375 rtmsg.rtmsg_ifindex = dev->ifindex; 1375 rtmsg.rtmsg_ifindex = dev->ifindex;
1376 1376
1377 ip6_route_add(&rtmsg, NULL, NULL); 1377 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1378} 1378}
1379 1379
1380static void addrconf_add_lroute(struct net_device *dev) 1380static void addrconf_add_lroute(struct net_device *dev)
@@ -1467,7 +1467,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
1467 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 1467 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
1468 if (rt->rt6i_flags&RTF_EXPIRES) { 1468 if (rt->rt6i_flags&RTF_EXPIRES) {
1469 if (valid_lft == 0) { 1469 if (valid_lft == 0) {
1470 ip6_del_rt(rt, NULL, NULL); 1470 ip6_del_rt(rt, NULL, NULL, NULL);
1471 rt = NULL; 1471 rt = NULL;
1472 } else { 1472 } else {
1473 rt->rt6i_expires = rt_expires; 1473 rt->rt6i_expires = rt_expires;
@@ -3094,7 +3094,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3094 switch (event) { 3094 switch (event) {
3095 case RTM_NEWADDR: 3095 case RTM_NEWADDR:
3096 dst_hold(&ifp->rt->u.dst); 3096 dst_hold(&ifp->rt->u.dst);
3097 if (ip6_ins_rt(ifp->rt, NULL, NULL)) 3097 if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
3098 dst_release(&ifp->rt->u.dst); 3098 dst_release(&ifp->rt->u.dst);
3099 if (ifp->idev->cnf.forwarding) 3099 if (ifp->idev->cnf.forwarding)
3100 addrconf_join_anycast(ifp); 3100 addrconf_join_anycast(ifp);
@@ -3104,7 +3104,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3104 addrconf_leave_anycast(ifp); 3104 addrconf_leave_anycast(ifp);
3105 addrconf_leave_solict(ifp->idev, &ifp->addr); 3105 addrconf_leave_solict(ifp->idev, &ifp->addr);
3106 dst_hold(&ifp->rt->u.dst); 3106 dst_hold(&ifp->rt->u.dst);
3107 if (ip6_del_rt(ifp->rt, NULL, NULL)) 3107 if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
3108 dst_free(&ifp->rt->u.dst); 3108 dst_free(&ifp->rt->u.dst);
3109 else 3109 else
3110 dst_release(&ifp->rt->u.dst); 3110 dst_release(&ifp->rt->u.dst);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3cca2e..6b7294047238 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
337 write_unlock_bh(&idev->lock); 337 write_unlock_bh(&idev->lock);
338 338
339 dst_hold(&rt->u.dst); 339 dst_hold(&rt->u.dst);
340 if (ip6_ins_rt(rt, NULL, NULL)) 340 if (ip6_ins_rt(rt, NULL, NULL, NULL))
341 dst_release(&rt->u.dst); 341 dst_release(&rt->u.dst);
342 342
343 addrconf_join_solict(dev, &aca->aca_addr); 343 addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
380 addrconf_leave_solict(idev, &aca->aca_addr); 380 addrconf_leave_solict(idev, &aca->aca_addr);
381 381
382 dst_hold(&aca->aca_rt->u.dst); 382 dst_hold(&aca->aca_rt->u.dst);
383 if (ip6_del_rt(aca->aca_rt, NULL, NULL)) 383 if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
384 dst_free(&aca->aca_rt->u.dst); 384 dst_free(&aca->aca_rt->u.dst);
385 else 385 else
386 dst_release(&aca->aca_rt->u.dst); 386 dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b75abb..1b354aa97934 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -394,7 +394,7 @@ insert_above:
394 */ 394 */
395 395
396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
397 struct nlmsghdr *nlh) 397 struct nlmsghdr *nlh, struct netlink_skb_parms *req)
398{ 398{
399 struct rt6_info *iter = NULL; 399 struct rt6_info *iter = NULL;
400 struct rt6_info **ins; 400 struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
449 *ins = rt; 449 *ins = rt;
450 rt->rt6i_node = fn; 450 rt->rt6i_node = fn;
451 atomic_inc(&rt->rt6i_ref); 451 atomic_inc(&rt->rt6i_ref);
452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh); 452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
453 rt6_stats.fib_rt_entries++; 453 rt6_stats.fib_rt_entries++;
454 454
455 if ((fn->fn_flags & RTN_RTINFO) == 0) { 455 if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
479 * with source addr info in sub-trees 479 * with source addr info in sub-trees
480 */ 480 */
481 481
482int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 482int fib6_add(struct fib6_node *root, struct rt6_info *rt,
483 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
483{ 484{
484 struct fib6_node *fn; 485 struct fib6_node *fn;
485 int err = -ENOMEM; 486 int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
552 } 553 }
553#endif 554#endif
554 555
555 err = fib6_add_rt2node(fn, rt, nlh); 556 err = fib6_add_rt2node(fn, rt, nlh, req);
556 557
557 if (err == 0) { 558 if (err == 0) {
558 fib6_start_gc(rt); 559 fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
859} 860}
860 861
861static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, 862static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
862 struct nlmsghdr *nlh, void *_rtattr) 863 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
863{ 864{
864 struct fib6_walker_t *w; 865 struct fib6_walker_t *w;
865 struct rt6_info *rt = *rtp; 866 struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
915 if (atomic_read(&rt->rt6i_ref) != 1) BUG(); 916 if (atomic_read(&rt->rt6i_ref) != 1) BUG();
916 } 917 }
917 918
918 inet6_rt_notify(RTM_DELROUTE, rt, nlh); 919 inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
919 rt6_release(rt); 920 rt6_release(rt);
920} 921}
921 922
922int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 923int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
923{ 924{
924 struct fib6_node *fn = rt->rt6i_node; 925 struct fib6_node *fn = rt->rt6i_node;
925 struct rt6_info **rtp; 926 struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
944 945
945 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { 946 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
946 if (*rtp == rt) { 947 if (*rtp == rt) {
947 fib6_del_route(fn, rtp, nlh, _rtattr); 948 fib6_del_route(fn, rtp, nlh, _rtattr, req);
948 return 0; 949 return 0;
949 } 950 }
950 } 951 }
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
1073 res = c->func(rt, c->arg); 1074 res = c->func(rt, c->arg);
1074 if (res < 0) { 1075 if (res < 0) {
1075 w->leaf = rt; 1076 w->leaf = rt;
1076 res = fib6_del(rt, NULL, NULL); 1077 res = fib6_del(rt, NULL, NULL, NULL);
1077 if (res) { 1078 if (res) {
1078#if RT6_DEBUG >= 2 1079#if RT6_DEBUG >= 2
1079 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); 1080 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a53586804..06e7cdaeedc5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -484,9 +484,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
484 to->nf_bridge = from->nf_bridge; 484 to->nf_bridge = from->nf_bridge;
485 nf_bridge_get(to->nf_bridge); 485 nf_bridge_get(to->nf_bridge);
486#endif 486#endif
487#ifdef CONFIG_NETFILTER_DEBUG
488 to->nf_debug = from->nf_debug;
489#endif
490#endif 487#endif
491} 488}
492 489
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86be662..f3ef4c38d315 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -423,11 +423,12 @@ done:
423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; 423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, 424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
425 &psin6->sin6_addr); 425 &psin6->sin6_addr);
426 if (retv) 426 /* prior join w/ different source is ok */
427 if (retv && retv != -EADDRINUSE)
427 break; 428 break;
428 omode = MCAST_INCLUDE; 429 omode = MCAST_INCLUDE;
429 add = 1; 430 add = 1;
430 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 431 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
431 omode = MCAST_INCLUDE; 432 omode = MCAST_INCLUDE;
432 add = 0; 433 add = 0;
433 } 434 }
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6f50a9..562fcd14fdea 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
188 if (!ipv6_addr_is_multicast(addr)) 188 if (!ipv6_addr_is_multicast(addr))
189 return -EINVAL; 189 return -EINVAL;
190 190
191 read_lock_bh(&ipv6_sk_mc_lock);
192 for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
193 if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
194 ipv6_addr_equal(&mc_lst->addr, addr)) {
195 read_unlock_bh(&ipv6_sk_mc_lock);
196 return -EADDRINUSE;
197 }
198 }
199 read_unlock_bh(&ipv6_sk_mc_lock);
200
191 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); 201 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
192 202
193 if (mc_lst == NULL) 203 if (mc_lst == NULL)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
349 struct ipv6_pinfo *inet6 = inet6_sk(sk); 359 struct ipv6_pinfo *inet6 = inet6_sk(sk);
350 struct ip6_sf_socklist *psl; 360 struct ip6_sf_socklist *psl;
351 int i, j, rv; 361 int i, j, rv;
362 int leavegroup = 0;
352 int err; 363 int err;
353 364
354 if (pgsr->gsr_group.ss_family != AF_INET6 || 365 if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,6 +379,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
368 379
369 err = -EADDRNOTAVAIL; 380 err = -EADDRNOTAVAIL;
370 381
382 read_lock_bh(&ipv6_sk_mc_lock);
371 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 383 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
372 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) 384 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
373 continue; 385 continue;
@@ -401,6 +413,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
401 if (rv) /* source not found */ 413 if (rv) /* source not found */
402 goto done; 414 goto done;
403 415
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
418 leavegroup = 1;
419 goto done;
420 }
421
404 /* update the interface filter */ 422 /* update the interface filter */
405 ip6_mc_del_src(idev, group, omode, 1, source, 1); 423 ip6_mc_del_src(idev, group, omode, 1, source, 1);
406 424
@@ -453,9 +471,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
453 /* update the interface list */ 471 /* update the interface list */
454 ip6_mc_add_src(idev, group, omode, 1, source, 1); 472 ip6_mc_add_src(idev, group, omode, 1, source, 1);
455done: 473done:
474 read_unlock_bh(&ipv6_sk_mc_lock);
456 read_unlock_bh(&idev->lock); 475 read_unlock_bh(&idev->lock);
457 in6_dev_put(idev); 476 in6_dev_put(idev);
458 dev_put(dev); 477 dev_put(dev);
478 if (leavegroup)
479 return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
459 return err; 480 return err;
460} 481}
461 482
@@ -1280,15 +1301,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1280 return NULL; 1301 return NULL;
1281 1302
1282 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1303 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1283 if (dev->hard_header) {
1284 unsigned char ha[MAX_ADDR_LEN];
1285
1286 ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
1287 if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
1288 kfree_skb(skb);
1289 return NULL;
1290 }
1291 }
1292 1304
1293 if (ipv6_get_lladdr(dev, &addr_buf)) { 1305 if (ipv6_get_lladdr(dev, &addr_buf)) {
1294 /* <draft-ietf-magma-mld-source-05.txt>: 1306 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1312,6 +1324,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1312 return skb; 1324 return skb;
1313} 1325}
1314 1326
1327static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
1328{
1329 struct net_device *dev = skb->dev;
1330
1331 if (dev->hard_header) {
1332 unsigned char ha[MAX_ADDR_LEN];
1333 int err;
1334
1335 ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
1336 err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
1337 if (err < 0) {
1338 kfree_skb(skb);
1339 return err;
1340 }
1341 }
1342 return dev_queue_xmit(skb);
1343}
1344
1345static inline int mld_dev_queue_xmit(struct sk_buff *skb)
1346{
1347 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
1348 mld_dev_queue_xmit2);
1349}
1350
1315static void mld_sendpack(struct sk_buff *skb) 1351static void mld_sendpack(struct sk_buff *skb)
1316{ 1352{
1317 struct ipv6hdr *pip6 = skb->nh.ipv6h; 1353 struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1329,7 +1365,7 @@ static void mld_sendpack(struct sk_buff *skb)
1329 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, 1365 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
1330 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); 1366 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
1331 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1367 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1332 dev_queue_xmit); 1368 mld_dev_queue_xmit);
1333 if (!err) { 1369 if (!err) {
1334 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); 1370 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
1335 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); 1371 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1635,12 +1671,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1635 } 1671 }
1636 1672
1637 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1673 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1638 if (dev->hard_header) {
1639 unsigned char ha[MAX_ADDR_LEN];
1640 ndisc_mc_map(snd_addr, ha, dev, 1);
1641 if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
1642 goto out;
1643 }
1644 1674
1645 if (ipv6_get_lladdr(dev, &addr_buf)) { 1675 if (ipv6_get_lladdr(dev, &addr_buf)) {
1646 /* <draft-ietf-magma-mld-source-05.txt>: 1676 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1668,7 +1698,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1668 idev = in6_dev_get(skb->dev); 1698 idev = in6_dev_get(skb->dev);
1669 1699
1670 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1700 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1671 dev_queue_xmit); 1701 mld_dev_queue_xmit);
1672 if (!err) { 1702 if (!err) {
1673 if (type == ICMPV6_MGM_REDUCTION) 1703 if (type == ICMPV6_MGM_REDUCTION)
1674 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); 1704 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1682,10 +1712,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1682 if (likely(idev != NULL)) 1712 if (likely(idev != NULL))
1683 in6_dev_put(idev); 1713 in6_dev_put(idev);
1684 return; 1714 return;
1685
1686out:
1687 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1688 kfree_skb(skb);
1689} 1715}
1690 1716
1691static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, 1717static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4e9edc..7ae72d4c9bd2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
955 struct rt6_info *rt; 955 struct rt6_info *rt;
956 rt = rt6_get_dflt_router(saddr, dev); 956 rt = rt6_get_dflt_router(saddr, dev);
957 if (rt) 957 if (rt)
958 ip6_del_rt(rt, NULL, NULL); 958 ip6_del_rt(rt, NULL, NULL, NULL);
959 } 959 }
960 960
961out: 961out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1096 1096
1097 if (rt && lifetime == 0) { 1097 if (rt && lifetime == 0) {
1098 neigh_clone(neigh); 1098 neigh_clone(neigh);
1099 ip6_del_rt(rt, NULL, NULL); 1099 ip6_del_rt(rt, NULL, NULL, NULL);
1100 rt = NULL; 1100 rt = NULL;
1101 } 1101 }
1102 1102
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276fdd5f..73034511c8db 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
71/* Must have mutex */ 71/* Must have mutex */
72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
74#include <linux/netfilter_ipv4/lockhelp.h>
75#include <linux/netfilter_ipv4/listhelp.h> 74#include <linux/netfilter_ipv4/listhelp.h>
76 75
77#if 0 76#if 0
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d0185d19..c44685e391b7 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum,
366 const char *level_string, 366 const char *level_string,
367 const char *prefix) 367 const char *prefix)
368{ 368{
369 struct ipv6hdr *ipv6h = skb->nh.ipv6h;
370
371 spin_lock_bh(&log_lock); 369 spin_lock_bh(&log_lock);
372 printk(level_string); 370 printk(level_string);
373 printk("%sIN=%s OUT=%s ", 371 printk("%sIN=%s OUT=%s ",
@@ -377,39 +375,25 @@ ip6t_log_packet(unsigned int hooknum,
377 if (in && !out) { 375 if (in && !out) {
378 /* MAC logging for input chain only. */ 376 /* MAC logging for input chain only. */
379 printk("MAC="); 377 printk("MAC=");
380 if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { 378 if (skb->dev && skb->dev->hard_header_len &&
381 if (skb->dev->type != ARPHRD_SIT){ 379 skb->mac.raw != skb->nh.raw) {
382 int i; 380 unsigned char *p = skb->mac.raw;
383 unsigned char *p = skb->mac.raw; 381 int i;
384 for (i = 0; i < skb->dev->hard_header_len; i++,p++) 382
385 printk("%02x%c", *p, 383 if (skb->dev->type == ARPHRD_SIT &&
386 i==skb->dev->hard_header_len - 1 384 (p -= ETH_HLEN) < skb->head)
387 ? ' ':':'); 385 p = NULL;
388 } else { 386
389 int i; 387 if (p != NULL)
390 unsigned char *p = skb->mac.raw; 388 for (i = 0; i < skb->dev->hard_header_len; i++)
391 if ( p - (ETH_ALEN*2+2) > skb->head ){ 389 printk("%02x", p[i]);
392 p -= (ETH_ALEN+2); 390 printk(" ");
393 for (i = 0; i < (ETH_ALEN); i++,p++) 391
394 printk("%02x%s", *p, 392 if (skb->dev->type == ARPHRD_SIT) {
395 i == ETH_ALEN-1 ? "->" : ":"); 393 struct iphdr *iph = (struct iphdr *)skb->mac.raw;
396 p -= (ETH_ALEN*2); 394 printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
397 for (i = 0; i < (ETH_ALEN); i++,p++) 395 NIPQUAD(iph->saddr),
398 printk("%02x%c", *p, 396 NIPQUAD(iph->daddr));
399 i == ETH_ALEN-1 ? ' ' : ':');
400 }
401
402 if ((skb->dev->addr_len == 4) &&
403 skb->dev->hard_header_len > 20){
404 printk("TUNNEL=");
405 p = skb->mac.raw + 12;
406 for (i = 0; i < 4; i++,p++)
407 printk("%3d%s", *p,
408 i == 3 ? "->" : ".");
409 for (i = 0; i < 4; i++,p++)
410 printk("%3d%c", *p,
411 i == 3 ? ' ' : '.');
412 }
413 } 397 }
414 } else 398 } else
415 printk(" "); 399 printk(" ");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407beaf790..c2982efd14af 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
129 .hook = ip6t_hook, 129 .hook = ip6t_hook,
130 .pf = PF_INET6, 130 .pf = PF_INET6,
131 .hooknum = NF_IP6_PRE_ROUTING, 131 .hooknum = NF_IP6_PRE_ROUTING,
132 .priority = NF_IP6_PRI_FIRST 132 .priority = NF_IP6_PRI_FIRST,
133 .owner = THIS_MODULE,
133 }, 134 },
134 { 135 {
135 .hook = ip6t_hook, 136 .hook = ip6t_hook,
136 .pf = PF_INET6, 137 .pf = PF_INET6,
137 .hooknum = NF_IP6_LOCAL_OUT, 138 .hooknum = NF_IP6_LOCAL_OUT,
138 .priority = NF_IP6_PRI_FIRST 139 .priority = NF_IP6_PRI_FIRST,
140 .owner = THIS_MODULE,
139 }, 141 },
140}; 142};
141 143
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1f5b226c3573..878789b3122d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
384 be destroyed. 384 be destroyed.
385 */ 385 */
386 386
387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
388 void *_rtattr, struct netlink_skb_parms *req)
388{ 389{
389 int err; 390 int err;
390 391
391 write_lock_bh(&rt6_lock); 392 write_lock_bh(&rt6_lock);
392 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); 393 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
393 write_unlock_bh(&rt6_lock); 394 write_unlock_bh(&rt6_lock);
394 395
395 return err; 396 return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
400 */ 401 */
401 402
402static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, 403static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
403 struct in6_addr *saddr) 404 struct in6_addr *saddr, struct netlink_skb_parms *req)
404{ 405{
405 int err; 406 int err;
406 struct rt6_info *rt; 407 struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
432 433
433 dst_hold(&rt->u.dst); 434 dst_hold(&rt->u.dst);
434 435
435 err = ip6_ins_rt(rt, NULL, NULL); 436 err = ip6_ins_rt(rt, NULL, NULL, req);
436 if (err == 0) 437 if (err == 0)
437 return rt; 438 return rt;
438 439
@@ -491,7 +492,8 @@ restart:
491 read_unlock_bh(&rt6_lock); 492 read_unlock_bh(&rt6_lock);
492 493
493 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, 494 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
494 &skb->nh.ipv6h->saddr); 495 &skb->nh.ipv6h->saddr,
496 &NETLINK_CB(skb));
495 497
496 dst_release(&rt->u.dst); 498 dst_release(&rt->u.dst);
497 rt = nrt; 499 rt = nrt;
@@ -551,7 +553,7 @@ restart:
551 dst_hold(&rt->u.dst); 553 dst_hold(&rt->u.dst);
552 read_unlock_bh(&rt6_lock); 554 read_unlock_bh(&rt6_lock);
553 555
554 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); 556 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
555 557
556 dst_release(&rt->u.dst); 558 dst_release(&rt->u.dst);
557 rt = nrt; 559 rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
598 600
599 if (rt) { 601 if (rt) {
600 if (rt->rt6i_flags & RTF_CACHE) 602 if (rt->rt6i_flags & RTF_CACHE)
601 ip6_del_rt(rt, NULL, NULL); 603 ip6_del_rt(rt, NULL, NULL, NULL);
602 else 604 else
603 dst_release(dst); 605 dst_release(dst);
604 } 606 }
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
787 * 789 *
788 */ 790 */
789 791
790int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 792int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
793 void *_rtattr, struct netlink_skb_parms *req)
791{ 794{
792 int err; 795 int err;
793 struct rtmsg *r; 796 struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
974 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); 977 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
975 rt->u.dst.dev = dev; 978 rt->u.dst.dev = dev;
976 rt->rt6i_idev = idev; 979 rt->rt6i_idev = idev;
977 return ip6_ins_rt(rt, nlh, _rtattr); 980 return ip6_ins_rt(rt, nlh, _rtattr, req);
978 981
979out: 982out:
980 if (dev) 983 if (dev)
@@ -986,7 +989,7 @@ out:
986 return err; 989 return err;
987} 990}
988 991
989int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 992int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
990{ 993{
991 int err; 994 int err;
992 995
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
994 997
995 rt6_reset_dflt_pointer(NULL); 998 rt6_reset_dflt_pointer(NULL);
996 999
997 err = fib6_del(rt, nlh, _rtattr); 1000 err = fib6_del(rt, nlh, _rtattr, req);
998 dst_release(&rt->u.dst); 1001 dst_release(&rt->u.dst);
999 1002
1000 write_unlock_bh(&rt6_lock); 1003 write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
1002 return err; 1005 return err;
1003} 1006}
1004 1007
1005static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 1008static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1006{ 1009{
1007 struct fib6_node *fn; 1010 struct fib6_node *fn;
1008 struct rt6_info *rt; 1011 struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
1029 dst_hold(&rt->u.dst); 1032 dst_hold(&rt->u.dst);
1030 read_unlock_bh(&rt6_lock); 1033 read_unlock_bh(&rt6_lock);
1031 1034
1032 return ip6_del_rt(rt, nlh, _rtattr); 1035 return ip6_del_rt(rt, nlh, _rtattr, req);
1033 } 1036 }
1034 } 1037 }
1035 read_unlock_bh(&rt6_lock); 1038 read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
1136 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); 1139 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1137 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); 1140 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1138 1141
1139 if (ip6_ins_rt(nrt, NULL, NULL)) 1142 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1140 goto out; 1143 goto out;
1141 1144
1142 if (rt->rt6i_flags&RTF_CACHE) { 1145 if (rt->rt6i_flags&RTF_CACHE) {
1143 ip6_del_rt(rt, NULL, NULL); 1146 ip6_del_rt(rt, NULL, NULL, NULL);
1144 return; 1147 return;
1145 } 1148 }
1146 1149
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1204 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 1207 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1205 */ 1208 */
1206 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { 1209 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
1207 nrt = rt6_cow(rt, daddr, saddr); 1210 nrt = rt6_cow(rt, daddr, saddr, NULL);
1208 if (!nrt->u.dst.error) { 1211 if (!nrt->u.dst.error) {
1209 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1212 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1210 if (allfrag) 1213 if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1232 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1235 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1233 if (allfrag) 1236 if (allfrag)
1234 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; 1237 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1235 ip6_ins_rt(nrt, NULL, NULL); 1238 ip6_ins_rt(nrt, NULL, NULL, NULL);
1236 } 1239 }
1237 1240
1238out: 1241out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1305 1308
1306 rtmsg.rtmsg_ifindex = dev->ifindex; 1309 rtmsg.rtmsg_ifindex = dev->ifindex;
1307 1310
1308 ip6_route_add(&rtmsg, NULL, NULL); 1311 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1309 return rt6_get_dflt_router(gwaddr, dev); 1312 return rt6_get_dflt_router(gwaddr, dev);
1310} 1313}
1311 1314
@@ -1323,7 +1326,7 @@ restart:
1323 1326
1324 read_unlock_bh(&rt6_lock); 1327 read_unlock_bh(&rt6_lock);
1325 1328
1326 ip6_del_rt(rt, NULL, NULL); 1329 ip6_del_rt(rt, NULL, NULL, NULL);
1327 1330
1328 goto restart; 1331 goto restart;
1329 } 1332 }
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1349 rtnl_lock(); 1352 rtnl_lock();
1350 switch (cmd) { 1353 switch (cmd) {
1351 case SIOCADDRT: 1354 case SIOCADDRT:
1352 err = ip6_route_add(&rtmsg, NULL, NULL); 1355 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1353 break; 1356 break;
1354 case SIOCDELRT: 1357 case SIOCDELRT:
1355 err = ip6_route_del(&rtmsg, NULL, NULL); 1358 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1356 break; 1359 break;
1357 default: 1360 default:
1358 err = -EINVAL; 1361 err = -EINVAL;
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1546 1549
1547 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1550 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1548 return -EINVAL; 1551 return -EINVAL;
1549 return ip6_route_del(&rtmsg, nlh, arg); 1552 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1550} 1553}
1551 1554
1552int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 1555int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1556 1559
1557 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1560 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1558 return -EINVAL; 1561 return -EINVAL;
1559 return ip6_route_add(&rtmsg, nlh, arg); 1562 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1560} 1563}
1561 1564
1562struct rt6_rtnl_dump_arg 1565struct rt6_rtnl_dump_arg
@@ -1566,12 +1569,9 @@ struct rt6_rtnl_dump_arg
1566}; 1569};
1567 1570
1568static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, 1571static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1569 struct in6_addr *dst, 1572 struct in6_addr *dst, struct in6_addr *src,
1570 struct in6_addr *src, 1573 int iif, int type, u32 pid, u32 seq,
1571 int iif, 1574 int prefix, unsigned int flags)
1572 int type, u32 pid, u32 seq,
1573 struct nlmsghdr *in_nlh, int prefix,
1574 unsigned int flags)
1575{ 1575{
1576 struct rtmsg *rtm; 1576 struct rtmsg *rtm;
1577 struct nlmsghdr *nlh; 1577 struct nlmsghdr *nlh;
@@ -1585,10 +1585,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 if (!pid && in_nlh) {
1589 pid = in_nlh->nlmsg_pid;
1590 }
1591
1592 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags); 1588 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1593 rtm = NLMSG_DATA(nlh); 1589 rtm = NLMSG_DATA(nlh);
1594 rtm->rtm_family = AF_INET6; 1590 rtm->rtm_family = AF_INET6;
@@ -1675,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1675 1671
1676 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 1672 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1677 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, 1673 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1678 NULL, prefix, NLM_F_MULTI); 1674 prefix, NLM_F_MULTI);
1679} 1675}
1680 1676
1681static int fib6_dump_node(struct fib6_walker_t *w) 1677static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1823,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1823 &fl.fl6_dst, &fl.fl6_src, 1819 &fl.fl6_dst, &fl.fl6_src,
1824 iif, 1820 iif,
1825 RTM_NEWROUTE, NETLINK_CB(in_skb).pid, 1821 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1826 nlh->nlmsg_seq, nlh, 0, 0); 1822 nlh->nlmsg_seq, 0, 0);
1827 if (err < 0) { 1823 if (err < 0) {
1828 err = -EMSGSIZE; 1824 err = -EMSGSIZE;
1829 goto out_free; 1825 goto out_free;
@@ -1839,17 +1835,25 @@ out_free:
1839 goto out; 1835 goto out;
1840} 1836}
1841 1837
1842void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) 1838void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1839 struct netlink_skb_parms *req)
1843{ 1840{
1844 struct sk_buff *skb; 1841 struct sk_buff *skb;
1845 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 1842 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1843 u32 pid = current->pid;
1844 u32 seq = 0;
1846 1845
1846 if (req)
1847 pid = req->pid;
1848 if (nlh)
1849 seq = nlh->nlmsg_seq;
1850
1847 skb = alloc_skb(size, gfp_any()); 1851 skb = alloc_skb(size, gfp_any());
1848 if (!skb) { 1852 if (!skb) {
1849 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); 1853 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1850 return; 1854 return;
1851 } 1855 }
1852 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0, 0) < 0) { 1856 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1853 kfree_skb(skb); 1857 kfree_skb(skb);
1854 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); 1858 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1855 return; 1859 return;