aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/appletalk/aarp.c2
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/bridge/br_forward.c3
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_netfilter.c38
-rw-r--r--net/core/netfilter.c138
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/ipv4/Kconfig26
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c12
-rw-r--r--net/ipv4/ah4.c2
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/fib_frontend.c55
-rw-r--r--net/ipv4/fib_trie.c2454
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c11
-rw-r--r--net/ipv4/ipcomp.c11
-rw-r--r--net/ipv4/ipmr.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c1
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c107
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c7
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c23
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c27
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c22
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c32
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c5
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c49
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c10
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c13
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c15
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c17
-rw-r--r--net/ipv4/netfilter/ipt_helper.c4
-rw-r--r--net/ipv4/xfrm4_output.c8
-rw-r--r--net/ipv4/xfrm4_state.c9
-rw-r--r--net/ipv4/xfrm4_tunnel.c2
-rw-r--r--net/ipv6/addrconf.c14
-rw-r--r--net/ipv6/ah6.c2
-rw-r--r--net/ipv6/anycast.c4
-rw-r--r--net/ipv6/esp6.c2
-rw-r--r--net/ipv6/ip6_fib.c19
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/ipv6/ipcomp6.c9
-rw-r--r--net/ipv6/ipv6_sockglue.c5
-rw-r--r--net/ipv6/mcast.c68
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c54
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c6
-rw-r--r--net/ipv6/route.c78
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/key/af_key.c16
-rw-r--r--net/sctp/associola.c151
-rw-r--r--net/sctp/endpointola.c1
-rw-r--r--net/sctp/input.c2
-rw-r--r--net/sctp/outqueue.c11
-rw-r--r--net/sctp/sm_make_chunk.c20
-rw-r--r--net/sctp/sm_sideeffect.c105
-rw-r--r--net/sctp/sm_statefuns.c148
-rw-r--r--net/sctp/sm_statetable.c6
-rw-r--r--net/sctp/socket.c405
-rw-r--r--net/sctp/transport.c4
-rw-r--r--net/xfrm/xfrm_policy.c1
-rw-r--r--net/xfrm/xfrm_state.c37
-rw-r--r--net/xfrm/xfrm_user.c9
71 files changed, 3551 insertions, 797 deletions
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 54640c01b50c..10d040461021 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -565,7 +565,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
565 * numbers we just happen to need. Now put the 565 * numbers we just happen to need. Now put the
566 * length in the lower two. 566 * length in the lower two.
567 */ 567 */
568 *((__u16 *)skb->data) = htons(skb->len); 568 *((__be16 *)skb->data) = htons(skb->len);
569 ft = 1; 569 ft = 1;
570 } 570 }
571 /* 571 /*
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 876dbac71060..192b529f86a4 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -401,7 +401,7 @@ out_err:
401} 401}
402 402
403/* Find a match for a specific network:node pair */ 403/* Find a match for a specific network:node pair */
404static struct atalk_iface *atalk_find_interface(int net, int node) 404static struct atalk_iface *atalk_find_interface(__be16 net, int node)
405{ 405{
406 struct atalk_iface *iface; 406 struct atalk_iface *iface;
407 407
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f2095f96e..069253f830c1 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
57static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) 57static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
58{ 58{
59 skb->dev = to->dev; 59 skb->dev = to->dev;
60#ifdef CONFIG_NETFILTER_DEBUG
61 skb->nf_debug = 0;
62#endif
63 NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, 60 NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
64 br_forward_finish); 61 br_forward_finish);
65} 62}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8f5f2e730992..9a45e6279c57 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
23 23
24static int br_pass_frame_up_finish(struct sk_buff *skb) 24static int br_pass_frame_up_finish(struct sk_buff *skb)
25{ 25{
26#ifdef CONFIG_NETFILTER_DEBUG
27 skb->nf_debug = 0;
28#endif
29 netif_receive_skb(skb); 26 netif_receive_skb(skb);
30
31 return 0; 27 return 0;
32} 28}
33 29
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3ad2648..03ae4edddac3 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
102{ 102{
103 struct nf_bridge_info *nf_bridge = skb->nf_bridge; 103 struct nf_bridge_info *nf_bridge = skb->nf_bridge;
104 104
105#ifdef CONFIG_NETFILTER_DEBUG
106 skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
107#endif
108
109 if (nf_bridge->mask & BRNF_PKT_TYPE) { 105 if (nf_bridge->mask & BRNF_PKT_TYPE) {
110 skb->pkt_type = PACKET_OTHERHOST; 106 skb->pkt_type = PACKET_OTHERHOST;
111 nf_bridge->mask ^= BRNF_PKT_TYPE; 107 nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
182 * --Bart, 20021007 (updated) */ 178 * --Bart, 20021007 (updated) */
183static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) 179static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
184{ 180{
185#ifdef CONFIG_NETFILTER_DEBUG
186 skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
187#endif
188
189 if (skb->pkt_type == PACKET_OTHERHOST) { 181 if (skb->pkt_type == PACKET_OTHERHOST) {
190 skb->pkt_type = PACKET_HOST; 182 skb->pkt_type = PACKET_HOST;
191 skb->nf_bridge->mask |= BRNF_PKT_TYPE; 183 skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
207 struct iphdr *iph = skb->nh.iph; 199 struct iphdr *iph = skb->nh.iph;
208 struct nf_bridge_info *nf_bridge = skb->nf_bridge; 200 struct nf_bridge_info *nf_bridge = skb->nf_bridge;
209 201
210#ifdef CONFIG_NETFILTER_DEBUG
211 skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
212#endif
213
214 if (nf_bridge->mask & BRNF_PKT_TYPE) { 202 if (nf_bridge->mask & BRNF_PKT_TYPE) {
215 skb->pkt_type = PACKET_OTHERHOST; 203 skb->pkt_type = PACKET_OTHERHOST;
216 nf_bridge->mask ^= BRNF_PKT_TYPE; 204 nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
382 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) 370 if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
383 goto inhdr_error; 371 goto inhdr_error;
384 372
385#ifdef CONFIG_NETFILTER_DEBUG
386 skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
387#endif
388 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) 373 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
389 return NF_DROP; 374 return NF_DROP;
390 setup_pre_routing(skb); 375 setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
468 skb->ip_summed = CHECKSUM_NONE; 453 skb->ip_summed = CHECKSUM_NONE;
469 } 454 }
470 455
471#ifdef CONFIG_NETFILTER_DEBUG
472 skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
473#endif
474 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) 456 if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
475 return NF_DROP; 457 return NF_DROP;
476 setup_pre_routing(skb); 458 setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
517 struct net_device *in; 499 struct net_device *in;
518 struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); 500 struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
519 501
520#ifdef CONFIG_NETFILTER_DEBUG
521 skb->nf_debug ^= (1 << NF_BR_FORWARD);
522#endif
523
524 if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) { 502 if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
525 in = nf_bridge->physindev; 503 in = nf_bridge->physindev;
526 if (nf_bridge->mask & BRNF_PKT_TYPE) { 504 if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
566 (*pskb)->nh.raw += VLAN_HLEN; 544 (*pskb)->nh.raw += VLAN_HLEN;
567 } 545 }
568 546
569#ifdef CONFIG_NETFILTER_DEBUG
570 skb->nf_debug ^= (1 << NF_BR_FORWARD);
571#endif
572 nf_bridge = skb->nf_bridge; 547 nf_bridge = skb->nf_bridge;
573 if (skb->pkt_type == PACKET_OTHERHOST) { 548 if (skb->pkt_type == PACKET_OTHERHOST) {
574 skb->pkt_type = PACKET_HOST; 549 skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
605 (*pskb)->nh.raw += VLAN_HLEN; 580 (*pskb)->nh.raw += VLAN_HLEN;
606 } 581 }
607 582
608#ifdef CONFIG_NETFILTER_DEBUG
609 skb->nf_debug ^= (1 << NF_BR_FORWARD);
610#endif
611
612 if (skb->nh.arph->ar_pln != 4) { 583 if (skb->nh.arph->ar_pln != 4) {
613 if (IS_VLAN_ARP) { 584 if (IS_VLAN_ARP) {
614 skb_push(*pskb, VLAN_HLEN); 585 skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
627/* PF_BRIDGE/LOCAL_OUT ***********************************************/ 598/* PF_BRIDGE/LOCAL_OUT ***********************************************/
628static int br_nf_local_out_finish(struct sk_buff *skb) 599static int br_nf_local_out_finish(struct sk_buff *skb)
629{ 600{
630#ifdef CONFIG_NETFILTER_DEBUG
631 skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
632#endif
633 if (skb->protocol == __constant_htons(ETH_P_8021Q)) { 601 if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
634 skb_push(skb, VLAN_HLEN); 602 skb_push(skb, VLAN_HLEN);
635 skb->nh.raw -= VLAN_HLEN; 603 skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
731 realoutdev, br_nf_local_out_finish, 699 realoutdev, br_nf_local_out_finish,
732 NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1); 700 NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
733 } else { 701 } else {
734#ifdef CONFIG_NETFILTER_DEBUG
735 skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
736#endif
737
738 NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev, 702 NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
739 realoutdev, br_nf_local_out_finish, 703 realoutdev, br_nf_local_out_finish,
740 NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1); 704 NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
779 printk(KERN_CRIT "br_netfilter: skb->dst == NULL."); 743 printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
780 goto print_error; 744 goto print_error;
781 } 745 }
782
783 skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
784#endif 746#endif
785 747
786 /* We assume any code from br_dev_queue_push_xmit onwards doesn't care 748 /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 22a8f127c4aa..076c156d5eda 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
141 up(&nf_sockopt_mutex); 141 up(&nf_sockopt_mutex);
142} 142}
143 143
144#ifdef CONFIG_NETFILTER_DEBUG
145#include <net/ip.h>
146#include <net/tcp.h>
147#include <linux/netfilter_ipv4.h>
148
149static void debug_print_hooks_ip(unsigned int nf_debug)
150{
151 if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
152 printk("PRE_ROUTING ");
153 nf_debug ^= (1 << NF_IP_PRE_ROUTING);
154 }
155 if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
156 printk("LOCAL_IN ");
157 nf_debug ^= (1 << NF_IP_LOCAL_IN);
158 }
159 if (nf_debug & (1 << NF_IP_FORWARD)) {
160 printk("FORWARD ");
161 nf_debug ^= (1 << NF_IP_FORWARD);
162 }
163 if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
164 printk("LOCAL_OUT ");
165 nf_debug ^= (1 << NF_IP_LOCAL_OUT);
166 }
167 if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
168 printk("POST_ROUTING ");
169 nf_debug ^= (1 << NF_IP_POST_ROUTING);
170 }
171 if (nf_debug)
172 printk("Crap bits: 0x%04X", nf_debug);
173 printk("\n");
174}
175
176static void nf_dump_skb(int pf, struct sk_buff *skb)
177{
178 printk("skb: pf=%i %s dev=%s len=%u\n",
179 pf,
180 skb->sk ? "(owned)" : "(unowned)",
181 skb->dev ? skb->dev->name : "(no dev)",
182 skb->len);
183 switch (pf) {
184 case PF_INET: {
185 const struct iphdr *ip = skb->nh.iph;
186 __u32 *opt = (__u32 *) (ip + 1);
187 int opti;
188 __u16 src_port = 0, dst_port = 0;
189
190 if (ip->protocol == IPPROTO_TCP
191 || ip->protocol == IPPROTO_UDP) {
192 struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
193 src_port = ntohs(tcp->source);
194 dst_port = ntohs(tcp->dest);
195 }
196
197 printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
198 " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
199 ip->protocol, NIPQUAD(ip->saddr),
200 src_port, NIPQUAD(ip->daddr),
201 dst_port,
202 ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
203 ntohs(ip->frag_off), ip->ttl);
204
205 for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
206 printk(" O=0x%8.8X", *opt++);
207 printk("\n");
208 }
209 }
210}
211
212void nf_debug_ip_local_deliver(struct sk_buff *skb)
213{
214 /* If it's a loopback packet, it must have come through
215 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
216 * NF_IP_LOCAL_IN. Otherwise, must have gone through
217 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */
218 if (!skb->dev) {
219 printk("ip_local_deliver: skb->dev is NULL.\n");
220 } else {
221 if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
222 | (1<<NF_IP_LOCAL_IN))) {
223 printk("ip_local_deliver: bad skb: ");
224 debug_print_hooks_ip(skb->nf_debug);
225 nf_dump_skb(PF_INET, skb);
226 }
227 }
228}
229
230void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
231{
232 if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
233 | (1 << NF_IP_POST_ROUTING))) {
234 printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
235 newskb);
236 debug_print_hooks_ip(newskb->nf_debug);
237 nf_dump_skb(PF_INET, newskb);
238 }
239}
240
241void nf_debug_ip_finish_output2(struct sk_buff *skb)
242{
243 /* If it's owned, it must have gone through the
244 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
245 * Otherwise, must have gone through
246 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
247 */
248 if (skb->sk) {
249 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
250 | (1 << NF_IP_POST_ROUTING))) {
251 printk("ip_finish_output: bad owned skb = %p: ", skb);
252 debug_print_hooks_ip(skb->nf_debug);
253 nf_dump_skb(PF_INET, skb);
254 }
255 } else {
256 if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
257 | (1 << NF_IP_FORWARD)
258 | (1 << NF_IP_POST_ROUTING))) {
259 /* Fragments, entunnelled packets, TCP RSTs
260 generated by ipt_REJECT will have no
261 owners, but still may be local */
262 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
263 | (1 << NF_IP_POST_ROUTING))){
264 printk("ip_finish_output:"
265 " bad unowned skb = %p: ",skb);
266 debug_print_hooks_ip(skb->nf_debug);
267 nf_dump_skb(PF_INET, skb);
268 }
269 }
270 }
271}
272#endif /*CONFIG_NETFILTER_DEBUG*/
273
274/* Call get/setsockopt() */ 144/* Call get/setsockopt() */
275static int nf_sockopt(struct sock *sk, int pf, int val, 145static int nf_sockopt(struct sock *sk, int pf, int val,
276 char __user *opt, int *len, int get) 146 char __user *opt, int *len, int get)
@@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
488 /* We may already have this, but read-locks nest anyway */ 358 /* We may already have this, but read-locks nest anyway */
489 rcu_read_lock(); 359 rcu_read_lock();
490 360
491#ifdef CONFIG_NETFILTER_DEBUG
492 if (unlikely((*pskb)->nf_debug & (1 << hook))) {
493 printk("nf_hook: hook %i already set.\n", hook);
494 nf_dump_skb(pf, *pskb);
495 }
496 (*pskb)->nf_debug |= (1 << hook);
497#endif
498
499 elem = &nf_hooks[pf][hook]; 361 elem = &nf_hooks[pf][hook];
500next_hook: 362next_hook:
501 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, 363 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de590a9..6d68c03bc051 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
365 C(nfct); 365 C(nfct);
366 nf_conntrack_get(skb->nfct); 366 nf_conntrack_get(skb->nfct);
367 C(nfctinfo); 367 C(nfctinfo);
368#ifdef CONFIG_NETFILTER_DEBUG
369 C(nf_debug);
370#endif
371#ifdef CONFIG_BRIDGE_NETFILTER 368#ifdef CONFIG_BRIDGE_NETFILTER
372 C(nf_bridge); 369 C(nf_bridge);
373 nf_bridge_get(skb->nf_bridge); 370 nf_bridge_get(skb->nf_bridge);
@@ -432,9 +429,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
432 new->nfct = old->nfct; 429 new->nfct = old->nfct;
433 nf_conntrack_get(old->nfct); 430 nf_conntrack_get(old->nfct);
434 new->nfctinfo = old->nfctinfo; 431 new->nfctinfo = old->nfctinfo;
435#ifdef CONFIG_NETFILTER_DEBUG
436 new->nf_debug = old->nf_debug;
437#endif
438#ifdef CONFIG_BRIDGE_NETFILTER 432#ifdef CONFIG_BRIDGE_NETFILTER
439 new->nf_bridge = old->nf_bridge; 433 new->nf_bridge = old->nf_bridge;
440 nf_bridge_get(old->nf_bridge); 434 nf_bridge_get(old->nf_bridge);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f2..05107e0dc145 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,6 +1,32 @@
1# 1#
2# IP configuration 2# IP configuration
3# 3#
4choice
5 prompt "Choose IP: FIB lookup"
6 depends on INET
7 default IP_FIB_HASH
8
9config IP_FIB_HASH
10 bool "FIB_HASH"
11 ---help---
12 Current FIB is very proven and good enough for most users.
13
14config IP_FIB_TRIE
15 bool "FIB_TRIE"
16 ---help---
17 Use new experimental LC-trie as FIB lookup algorithm.
18 This improves lookup performance
19
20 LC-trie is described in:
21
22 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 An experimental study of compression methods for dynamic tries
25 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
26 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
27
28endchoice
29
4config IP_MULTICAST 30config IP_MULTICAST
5 bool "IP: multicasting" 31 bool "IP: multicasting"
6 depends on INET 32 depends on INET
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb6..65d57d8e1add 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,8 +7,10 @@ obj-y := utils.o route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o \ 7 ip_output.o ip_sockglue.o \
8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ 8 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ 9 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o 10 sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
11 11
12obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
13obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
12obj-$(CONFIG_PROC_FS) += proc.o 14obj-$(CONFIG_PROC_FS) += proc.o
13obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 15obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
14obj-$(CONFIG_IP_MROUTE) += ipmr.o 16obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 03942f133944..658e7977924d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1119,6 +1119,10 @@ module_init(inet_init);
1119#ifdef CONFIG_PROC_FS 1119#ifdef CONFIG_PROC_FS
1120extern int fib_proc_init(void); 1120extern int fib_proc_init(void);
1121extern void fib_proc_exit(void); 1121extern void fib_proc_exit(void);
1122#ifdef CONFIG_IP_FIB_TRIE
1123extern int fib_stat_proc_init(void);
1124extern void fib_stat_proc_exit(void);
1125#endif
1122extern int ip_misc_proc_init(void); 1126extern int ip_misc_proc_init(void);
1123extern int raw_proc_init(void); 1127extern int raw_proc_init(void);
1124extern void raw_proc_exit(void); 1128extern void raw_proc_exit(void);
@@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void)
1139 goto out_udp; 1143 goto out_udp;
1140 if (fib_proc_init()) 1144 if (fib_proc_init())
1141 goto out_fib; 1145 goto out_fib;
1146#ifdef CONFIG_IP_FIB_TRIE
1147 if (fib_stat_proc_init())
1148 goto out_fib_stat;
1149 #endif
1142 if (ip_misc_proc_init()) 1150 if (ip_misc_proc_init())
1143 goto out_misc; 1151 goto out_misc;
1144out: 1152out:
1145 return rc; 1153 return rc;
1146out_misc: 1154out_misc:
1155#ifdef CONFIG_IP_FIB_TRIE
1156 fib_stat_proc_exit();
1157out_fib_stat:
1158#endif
1147 fib_proc_exit(); 1159 fib_proc_exit();
1148out_fib: 1160out_fib:
1149 udp4_proc_exit(); 1161 udp4_proc_exit();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f2235b6e..514c85b2631a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
200 xfrm_state_put(x); 200 xfrm_state_put(x);
201} 201}
202 202
203static int ah_init_state(struct xfrm_state *x, void *args) 203static int ah_init_state(struct xfrm_state *x)
204{ 204{
205 struct ah_data *ahp = NULL; 205 struct ah_data *ahp = NULL;
206 struct xfrm_algo_desc *aalg_desc; 206 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index eae84cc39d3f..ba57446d5d1f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -362,7 +362,7 @@ static void esp_destroy(struct xfrm_state *x)
362 kfree(esp); 362 kfree(esp);
363} 363}
364 364
365static int esp_init_state(struct xfrm_state *x, void *args) 365static int esp_init_state(struct xfrm_state *x)
366{ 366{
367 struct esp_data *esp = NULL; 367 struct esp_data *esp = NULL;
368 368
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d612706..cd8e45ab9580 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,60 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
516#undef BRD1_OK 516#undef BRD1_OK
517} 517}
518 518
519static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
520{
521
522 struct fib_result res;
523 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
524 .fwmark = frn->fl_fwmark,
525 .tos = frn->fl_tos,
526 .scope = frn->fl_scope } } };
527 if (tb) {
528 local_bh_disable();
529
530 frn->tb_id = tb->tb_id;
531 frn->err = tb->tb_lookup(tb, &fl, &res);
532
533 if (!frn->err) {
534 frn->prefixlen = res.prefixlen;
535 frn->nh_sel = res.nh_sel;
536 frn->type = res.type;
537 frn->scope = res.scope;
538 }
539 local_bh_enable();
540 }
541}
542
543static void nl_fib_input(struct sock *sk, int len)
544{
545 struct sk_buff *skb = NULL;
546 struct nlmsghdr *nlh = NULL;
547 struct fib_result_nl *frn;
548 int err;
549 u32 pid;
550 struct fib_table *tb;
551
552 skb = skb_recv_datagram(sk, 0, 0, &err);
553 nlh = (struct nlmsghdr *)skb->data;
554
555 frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
556 tb = fib_get_table(frn->tb_id_in);
557
558 nl_fib_lookup(frn, tb);
559
560 pid = nlh->nlmsg_pid; /*pid of sending process */
561 NETLINK_CB(skb).groups = 0; /* not in mcast group */
562 NETLINK_CB(skb).pid = 0; /* from kernel */
563 NETLINK_CB(skb).dst_pid = pid;
564 NETLINK_CB(skb).dst_groups = 0; /* unicast */
565 netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
566}
567
568static void nl_fib_lookup_init(void)
569{
570 netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
571}
572
519static void fib_disable_ip(struct net_device *dev, int force) 573static void fib_disable_ip(struct net_device *dev, int force)
520{ 574{
521 if (fib_sync_down(0, dev, force)) 575 if (fib_sync_down(0, dev, force))
@@ -604,6 +658,7 @@ void __init ip_fib_init(void)
604 658
605 register_netdevice_notifier(&fib_netdev_notifier); 659 register_netdevice_notifier(&fib_netdev_notifier);
606 register_inetaddr_notifier(&fib_inetaddr_notifier); 660 register_inetaddr_notifier(&fib_inetaddr_notifier);
661 nl_fib_lookup_init();
607} 662}
608 663
609EXPORT_SYMBOL(inet_addr_type); 664EXPORT_SYMBOL(inet_addr_type);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000000..0671569ee6f0
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2454 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46#define VERSION "0.323"
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
/*
 * Bit-field helpers for trie keys.  Bit 0 is the most significant bit.
 *
 * EXTRACT(p, n, str)        - n bits of str starting at bit p (assumes a
 *                             32-bit str).
 * KEYLENGTH                 - number of bits in a t_key.
 * MASK_PFX(k, l)            - k with everything but its first l bits cleared;
 *                             0 when l is 0.
 * TKEY_GET_MASK(offset, bits) - mask of 'bits' one-bits starting at 'offset';
 *                             0 when bits is 0.
 *
 * Every argument is parenthesised in the expansion so that compound
 * expressions such as MASK_PFX(k, a + b) expand correctly.  Arguments may
 * still be evaluated more than once; do not pass expressions with side
 * effects.
 */
#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
/*
 * Node type tagging.
 *
 * The low bit of a node's _parent word records whether the node is an
 * internal node (T_TNODE) or a leaf (T_LEAF); the remaining bits hold the
 * parent pointer.
 *
 * NODE_PARENT(node)            - parent pointer with the type bit stripped.
 * NODE_SET_PARENT(node, ptr)   - store a new parent, keeping the type bit.
 * NODE_INIT_PARENT(node, type) - set the type and clear the parent.
 * NODE_TYPE(node)              - T_TNODE or T_LEAF.
 * IS_TNODE(n) / IS_LEAF(n)     - type predicates.
 *
 * The macro argument is parenthesised everywhere so that expressions such
 * as IS_LEAF(&x) expand correctly.
 */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
#define NODE_PARENT(_node) \
((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
#define NODE_SET_PARENT(_node, _ptr) \
((_node)->_parent = (((unsigned long)(_ptr)) | \
 ((_node)->_parent & NODE_TYPE_MASK)))
#define NODE_INIT_PARENT(_node, _type) \
((_node)->_parent = (_type))
#define NODE_TYPE(_node) \
((_node)->_parent & NODE_TYPE_MASK)

#define IS_TNODE(n) (!((n)->_parent & T_LEAF))
#define IS_LEAF(n) ((n)->_parent & T_LEAF)
104
105struct node {
106 t_key key;
107 unsigned long _parent;
108};
109
110struct leaf {
111 t_key key;
112 unsigned long _parent;
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
123 t_key key;
124 unsigned long _parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130};
131
#ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * Usage counters kept per trie.  Only compiled when
 * CONFIG_IP_FIB_TRIE_STATS is defined; this file #undefs that symbol near
 * its top.
 */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
};
#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
149};
150
/*
 * One trie instance: the root pointer plus bookkeeping.
 */
struct trie {
	struct node *trie;		/* root node */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;	/* usage counters, optional */
#endif
	int size;
	unsigned int revision;
};
159
160static int trie_debug = 0;
161
162static int tnode_full(struct tnode *tn, struct node *n);
163static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
173 struct fib_info **last_resort, int *last_idx, int *dflt);
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176 struct nlmsghdr *n, struct netlink_skb_parms *req);
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
/*
 * Log an internal trie inconsistency and stop via BUG().
 *
 * @err: short text identifying where the inconsistency was detected.
 */
static void trie_bug(char *err)
{
	printk("Trie Bug: %s\n", err);
	BUG();
}
186
187static inline struct node *tnode_get_child(struct tnode *tn, int i)
188{
189 if (i >= 1<<tn->bits)
190 trie_bug("tnode_get_child");
191
192 return tn->child[i];
193}
194
195static inline int tnode_child_length(struct tnode *tn)
196{
197 return 1<<tn->bits;
198}
199
200/*
201 _________________________________________________________________
202 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203 ----------------------------------------------------------------
204 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
205
206 _________________________________________________________________
207 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208 -----------------------------------------------------------------
209 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
210
211 tp->pos = 7
212 tp->bits = 3
213 n->pos = 15
214 n->bits=4
215 KEYLENGTH=32
216*/
217
218static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219{
220 if (offset < KEYLENGTH)
221 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222 else
223 return 0;
224}
225
226static inline int tkey_equals(t_key a, t_key b)
227{
228 return a == b;
229}
230
231static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232{
233 if (bits == 0 || offset >= KEYLENGTH)
234 return 1;
235 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237}
238
239static inline int tkey_mismatch(t_key a, int offset, t_key b)
240{
241 t_key diff = a ^ b;
242 int i = offset;
243
244 if(!diff)
245 return 0;
246 while((diff << i) >> (KEYLENGTH-1) == 0)
247 i++;
248 return i;
249}
250
251/* Candidate for fib_semantics */
252
253static void fn_free_alias(struct fib_alias *fa)
254{
255 fib_release_info(fa->fa_info);
256 kmem_cache_free(fn_alias_kmem, fa);
257}
258
259/*
260 To understand this stuff, an understanding of keys and all their bits is
261 necessary. Every node in the trie has a key associated with it, but not
262 all of the bits in that key are significant.
263
264 Consider a node 'n' and its parent 'tp'.
265
266 If n is a leaf, every bit in its key is significant. Its presence is
267 necessitated by path compression, since during a tree traversal (when
268 searching for a leaf - unless we are doing an insertion) we will completely
269 ignore all skipped bits we encounter. Thus we need to verify, at the end of
270 a potentially successful search, that we have indeed been walking the
271 correct key path.
272
273 Note that we can never "miss" the correct key in the tree if present by
274 following the wrong path. Path compression ensures that segments of the key
275 that are the same for all keys with a given prefix are skipped, but the
276 skipped part *is* identical for each node in the subtrie below the skipped
277 bit! trie_insert() in this implementation takes care of that - note the
278 call to tkey_sub_equals() in trie_insert().
279
280 if n is an internal node - a 'tnode' here, the various parts of its key
281 have many different meanings.
282
283 Example:
284 _________________________________________________________________
285 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286 -----------------------------------------------------------------
287 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
288
289 _________________________________________________________________
290 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291 -----------------------------------------------------------------
292 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
293
294 tp->pos = 7
295 tp->bits = 3
296 n->pos = 15
297 n->bits=4
298
299 First, let's just ignore the bits that come before the parent tp, that is
300 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301 not use them for anything.
302
303 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304 index into the parent's child array. That is, they will be used to find
305 'n' among tp's children.
306
307 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308 for the node n.
309
310 All the bits we have seen so far are significant to the node n. The rest
311 of the bits are really not needed or indeed known in n->key.
312
313 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314 n's child array, and will of course be different for each child.
315
316 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317 at this point.
318
319*/
320
321static void check_tnode(struct tnode *tn)
322{
323 if(tn && tn->pos+tn->bits > 32) {
324 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 }
326}
327
/*
 * Fill thresholds, in percent, consulted by resize():
 *  - a node is halved while its share of non-empty slots is below
 *    halve_threshold;
 *  - a node is inflated while the projected fill of the doubled node is at
 *    least inflate_threshold.
 */
static int halve_threshold = 25;
static int inflate_threshold = 50;
330
331static struct leaf *leaf_new(void)
332{
333 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
334 if(l) {
335 NODE_INIT_PARENT(l, T_LEAF);
336 INIT_HLIST_HEAD(&l->list);
337 }
338 return l;
339}
340
341static struct leaf_info *leaf_info_new(int plen)
342{
343 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
344 li->plen = plen;
345 INIT_LIST_HEAD(&li->falh);
346 return li;
347}
348
/* Release the memory of a leaf with kfree(). */
static inline void free_leaf(struct leaf *l)
{
	kfree(l);
}
353
/* Release the memory of a leaf_info with kfree(). */
static inline void free_leaf_info(struct leaf_info *li)
{
	kfree(li);
}
358
359static struct tnode* tnode_new(t_key key, int pos, int bits)
360{
361 int nchildren = 1<<bits;
362 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 struct tnode *tn = kmalloc(sz, GFP_KERNEL);
364
365 if(tn) {
366 memset(tn, 0, sz);
367 NODE_INIT_PARENT(tn, T_TNODE);
368 tn->pos = pos;
369 tn->bits = bits;
370 tn->key = key;
371 tn->full_children = 0;
372 tn->empty_children = 1<<bits;
373 }
374 if(trie_debug > 0)
375 printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 (unsigned int) (sizeof(struct node) * 1<<bits));
377 return tn;
378}
379
380static void tnode_free(struct tnode *tn)
381{
382 if(!tn) {
383 trie_bug("tnode_free\n");
384 }
385 if(IS_LEAF(tn)) {
386 free_leaf((struct leaf *)tn);
387 if(trie_debug > 0 )
388 printk("FL %p \n", tn);
389 }
390 else if(IS_TNODE(tn)) {
391 kfree(tn);
392 if(trie_debug > 0 )
393 printk("FT %p \n", tn);
394 }
395 else {
396 trie_bug("tnode_free\n");
397 }
398}
399
400/*
401 * Check whether a tnode 'n' is "full", i.e. it is an internal node
402 * and no bits are skipped. See discussion in dyntree paper p. 6
403 */
404
405static inline int tnode_full(struct tnode *tn, struct node *n)
406{
407 if(n == NULL || IS_LEAF(n))
408 return 0;
409
410 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411}
412
/*
 * Store @n in slot @i of @tn.  Thin wrapper around tnode_put_child_reorg()
 * for callers that do not know whether the slot's previous occupant was
 * full.  @t is not used.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	/* -1 asks tnode_put_child_reorg() to work the old fullness out itself. */
	const int wasfull_unknown = -1;

	tnode_put_child_reorg(tn, i, n, wasfull_unknown);
}
417
418 /*
419 * Add a child at position i overwriting the old value.
420 * Update the value of full_children and empty_children.
421 */
422
423static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424{
425 struct node *chi;
426 int isfull;
427
428 if(i >= 1<<tn->bits) {
429 printk("bits=%d, i=%d\n", tn->bits, i);
430 trie_bug("tnode_put_child_reorg bits");
431 }
432 write_lock_bh(&fib_lock);
433 chi = tn->child[i];
434
435 /* update emptyChildren */
436 if (n == NULL && chi != NULL)
437 tn->empty_children++;
438 else if (n != NULL && chi == NULL)
439 tn->empty_children--;
440
441 /* update fullChildren */
442 if (wasfull == -1)
443 wasfull = tnode_full(tn, chi);
444
445 isfull = tnode_full(tn, n);
446 if (wasfull && !isfull)
447 tn->full_children--;
448
449 else if (!wasfull && isfull)
450 tn->full_children++;
451 if(n)
452 NODE_SET_PARENT(n, tn);
453
454 tn->child[i] = n;
455 write_unlock_bh(&fib_lock);
456}
457
458static struct node *resize(struct trie *t, struct tnode *tn)
459{
460 int i;
461
462 if (!tn)
463 return NULL;
464
465 if(trie_debug)
466 printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 tn, inflate_threshold, halve_threshold);
468
469 /* No children */
470 if (tn->empty_children == tnode_child_length(tn)) {
471 tnode_free(tn);
472 return NULL;
473 }
474 /* One child */
475 if (tn->empty_children == tnode_child_length(tn) - 1)
476 for (i = 0; i < tnode_child_length(tn); i++) {
477
478 write_lock_bh(&fib_lock);
479 if (tn->child[i] != NULL) {
480
481 /* compress one level */
482 struct node *n = tn->child[i];
483 if(n)
484 NODE_INIT_PARENT(n, NODE_TYPE(n));
485
486 write_unlock_bh(&fib_lock);
487 tnode_free(tn);
488 return n;
489 }
490 write_unlock_bh(&fib_lock);
491 }
492 /*
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
495 */
496
497 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
503 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
510 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
517 *
518 * A clearer way to write this would be:
519 *
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children;
523 *
524 * new_child_length = tnode_child_length(tn) * 2;
525 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold)
529 *
530 * ...and so on, tho it would mess up the while() loop.
531 *
532 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold
535 *
536 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
539 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children ) >= inflate_threshold * new_child_length
543 *
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children ) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
548 *
549 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children ) >= inflate_threshold *
552 * tnode_child_length(tn)
553 *
554 */
555
556 check_tnode(tn);
557
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
561
562 tn = inflate(t, tn);
563 }
564
565 check_tnode(tn);
566
567 /*
568 * Halve as long as the number of empty children in this
569 * node is above threshold.
570 */
571 while (tn->bits > 1 &&
572 100 * (tnode_child_length(tn) - tn->empty_children) <
573 halve_threshold * tnode_child_length(tn))
574
575 tn = halve(t, tn);
576
577 /* Only one child remains */
578
579 if (tn->empty_children == tnode_child_length(tn) - 1)
580 for (i = 0; i < tnode_child_length(tn); i++) {
581
582 write_lock_bh(&fib_lock);
583 if (tn->child[i] != NULL) {
584 /* compress one level */
585 struct node *n = tn->child[i];
586
587 if(n)
588 NODE_INIT_PARENT(n, NODE_TYPE(n));
589
590 write_unlock_bh(&fib_lock);
591 tnode_free(tn);
592 return n;
593 }
594 write_unlock_bh(&fib_lock);
595 }
596
597 return (struct node *) tn;
598}
599
600static struct tnode *inflate(struct trie *t, struct tnode *tn)
601{
602 struct tnode *inode;
603 struct tnode *oldtnode = tn;
604 int olen = tnode_child_length(tn);
605 int i;
606
607 if(trie_debug)
608 printk("In inflate\n");
609
610 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611
612 if (!tn)
613 trie_bug("tnode_new failed");
614
615 for(i = 0; i < olen; i++) {
616 struct node *node = tnode_get_child(oldtnode, i);
617
618 /* An empty child */
619 if (node == NULL)
620 continue;
621
622 /* A leaf or an internal node with skipped bits */
623
624 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 tn->pos + tn->bits - 1) {
626 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
627 1) == 0)
628 put_child(t, tn, 2*i, node);
629 else
630 put_child(t, tn, 2*i+1, node);
631 continue;
632 }
633
634 /* An internal node with two children */
635 inode = (struct tnode *) node;
636
637 if (inode->bits == 1) {
638 put_child(t, tn, 2*i, inode->child[0]);
639 put_child(t, tn, 2*i+1, inode->child[1]);
640
641 tnode_free(inode);
642 }
643
644 /* An internal node with more than two children */
645 else {
646 struct tnode *left, *right;
647 int size, j;
648
649 /* We will replace this node 'inode' with two new
650 * ones, 'left' and 'right', each with half of the
651 * original children. The two new nodes will have
652 * a position one bit further down the key and this
653 * means that the "significant" part of their keys
654 * (see the discussion near the top of this file)
655 * will differ by one bit, which will be "0" in
656 * left's key and "1" in right's key. Since we are
657 * moving the key position by one step, the bit that
658 * we are moving away from - the bit at position
659 * (inode->pos) - is the one that will differ between
660 * left and right. So... we synthesize that bit in the
661 * two new keys.
662 * The mask 'm' below will be a single "one" bit at
663 * the position (inode->pos)
664 */
665
666 t_key m = TKEY_GET_MASK(inode->pos, 1);
667
668 /* Use the old key, but set the new significant
669 * bit to zero.
670 */
671 left = tnode_new(inode->key&(~m), inode->pos + 1,
672 inode->bits - 1);
673
674 if(!left)
675 trie_bug("tnode_new failed");
676
677
678 /* Use the old key, but set the new significant
679 * bit to one.
680 */
681 right = tnode_new(inode->key|m, inode->pos + 1,
682 inode->bits - 1);
683
684 if(!right)
685 trie_bug("tnode_new failed");
686
687 size = tnode_child_length(left);
688 for(j = 0; j < size; j++) {
689 put_child(t, left, j, inode->child[j]);
690 put_child(t, right, j, inode->child[j + size]);
691 }
692 put_child(t, tn, 2*i, resize(t, left));
693 put_child(t, tn, 2*i+1, resize(t, right));
694
695 tnode_free(inode);
696 }
697 }
698 tnode_free(oldtnode);
699 return tn;
700}
701
702static struct tnode *halve(struct trie *t, struct tnode *tn)
703{
704 struct tnode *oldtnode = tn;
705 struct node *left, *right;
706 int i;
707 int olen = tnode_child_length(tn);
708
709 if(trie_debug) printk("In halve\n");
710
711 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712
713 if(!tn)
714 trie_bug("tnode_new failed");
715
716 for(i = 0; i < olen; i += 2) {
717 left = tnode_get_child(oldtnode, i);
718 right = tnode_get_child(oldtnode, i+1);
719
720 /* At least one of the children is empty */
721 if (left == NULL) {
722 if (right == NULL) /* Both are empty */
723 continue;
724 put_child(t, tn, i/2, right);
725 } else if (right == NULL)
726 put_child(t, tn, i/2, left);
727
728 /* Two nonempty children */
729 else {
730 struct tnode *newBinNode =
731 tnode_new(left->key, tn->pos + tn->bits, 1);
732
733 if(!newBinNode)
734 trie_bug("tnode_new failed");
735
736 put_child(t, newBinNode, 0, left);
737 put_child(t, newBinNode, 1, right);
738 put_child(t, tn, i/2, resize(t, newBinNode));
739 }
740 }
741 tnode_free(oldtnode);
742 return tn;
743}
744
745static void *trie_init(struct trie *t)
746{
747 if(t) {
748 t->size = 0;
749 t->trie = NULL;
750 t->revision = 0;
751#ifdef CONFIG_IP_FIB_TRIE_STATS
752 memset(&t->stats, 0, sizeof(struct trie_use_stats));
753#endif
754 }
755 return t;
756}
757
758static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
759{
760 struct hlist_node *node;
761 struct leaf_info *li;
762
763 hlist_for_each_entry(li, node, head, hlist) {
764
765 if ( li->plen == plen )
766 return li;
767 }
768 return NULL;
769}
770
771static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772{
773 struct list_head *fa_head=NULL;
774 struct leaf_info *li = find_leaf_info(&l->list, plen);
775
776 if(li)
777 fa_head = &li->falh;
778
779 return fa_head;
780}
781
782static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783{
784 struct leaf_info *li=NULL, *last=NULL;
785 struct hlist_node *node, *tmp;
786
787 write_lock_bh(&fib_lock);
788
789 if(hlist_empty(head))
790 hlist_add_head(&new->hlist, head);
791 else {
792 hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793
794 if (new->plen > li->plen)
795 break;
796
797 last = li;
798 }
799 if(last)
800 hlist_add_after(&last->hlist, &new->hlist);
801 else
802 hlist_add_before(&new->hlist, &li->hlist);
803 }
804 write_unlock_bh(&fib_lock);
805}
806
807static struct leaf *
808fib_find_node(struct trie *t, u32 key)
809{
810 int pos;
811 struct tnode *tn;
812 struct node *n;
813
814 pos = 0;
815 n=t->trie;
816
817 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
818 tn = (struct tnode *) n;
819
820 check_tnode(tn);
821
822 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 pos=tn->pos + tn->bits;
824 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 }
826 else
827 break;
828 }
829 /* Case we have found a leaf. Compare prefixes */
830
831 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
832 struct leaf *l = (struct leaf *) n;
833 return l;
834 }
835 return NULL;
836}
837
838static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
839{
840 int i = 0;
841 int wasfull;
842 t_key cindex, key;
843 struct tnode *tp = NULL;
844
845 if(!tn)
846 BUG();
847
848 key = tn->key;
849 i = 0;
850
851 while (tn != NULL && NODE_PARENT(tn) != NULL) {
852
853 if( i > 10 ) {
854 printk("Rebalance tn=%p \n", tn);
855 if(tn) printk("tn->parent=%p \n", NODE_PARENT(tn));
856
857 printk("Rebalance tp=%p \n", tp);
858 if(tp) printk("tp->parent=%p \n", NODE_PARENT(tp));
859 }
860
861 if( i > 12 ) BUG();
862 i++;
863
864 tp = NODE_PARENT(tn);
865 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
866 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869
870 if(!NODE_PARENT(tn))
871 break;
872
873 tn = NODE_PARENT(tn);
874 }
875 /* Handle last (top) tnode */
876 if (IS_TNODE(tn))
877 tn = (struct tnode*) resize(t, (struct tnode *)tn);
878
879 return (struct node*) tn;
880}
881
882static struct list_head *
883fib_insert_node(struct trie *t, u32 key, int plen)
884{
885 int pos, newpos;
886 struct tnode *tp = NULL, *tn = NULL;
887 struct node *n;
888 struct leaf *l;
889 int missbit;
890 struct list_head *fa_head=NULL;
891 struct leaf_info *li;
892 t_key cindex;
893
894 pos = 0;
895 n=t->trie;
896
897 /* If we point to NULL, stop. Either the tree is empty and we should
898 * just put a new leaf in if, or we have reached an empty child slot,
899 * and we should just put our new leaf in that.
900 * If we point to a T_TNODE, check if it matches our key. Note that
901 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 * not be the parent's 'pos'+'bits'!
903 *
904 * If it does match the current key, get pos/bits from it, extract
905 * the index from our key, push the T_TNODE and walk the tree.
906 *
907 * If it doesn't, we have to replace it with a new T_TNODE.
908 *
909 * If we point to a T_LEAF, it might or might not have the same key
910 * as we do. If it does, just change the value, update the T_LEAF's
911 * value, and return it.
912 * If it doesn't, we need to replace it with a T_TNODE.
913 */
914
915 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
916 tn = (struct tnode *) n;
917
918 check_tnode(tn);
919
920 if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 tp = tn;
922 pos=tn->pos + tn->bits;
923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924
925 if(n && NODE_PARENT(n) != tn) {
926 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 BUG();
928 }
929 }
930 else
931 break;
932 }
933
934 /*
935 * n ----> NULL, LEAF or TNODE
936 *
937 * tp is n's (parent) ----> NULL or TNODE
938 */
939
940 if(tp && IS_LEAF(tp))
941 BUG();
942
943 t->revision++;
944
945 /* Case 1: n is a leaf. Compare prefixes */
946
947 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 struct leaf *l = ( struct leaf *) n;
949
950 li = leaf_info_new(plen);
951
952 if(! li)
953 BUG();
954
955 fa_head = &li->falh;
956 insert_leaf_info(&l->list, li);
957 goto done;
958 }
959 t->size++;
960 l = leaf_new();
961
962 if(! l)
963 BUG();
964
965 l->key = key;
966 li = leaf_info_new(plen);
967
968 if(! li)
969 BUG();
970
971 fa_head = &li->falh;
972 insert_leaf_info(&l->list, li);
973
974 /* Case 2: n is NULL, and will just insert a new leaf */
975 if (t->trie && n == NULL) {
976
977 NODE_SET_PARENT(l, tp);
978
979 if (!tp)
980 BUG();
981
982 else {
983 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
984 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
985 }
986 }
987 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 else {
989 /*
990 * Add a new tnode here
991 * first tnode need some special handling
992 */
993
994 if (tp)
995 pos=tp->pos+tp->bits;
996 else
997 pos=0;
998 if(n) {
999 newpos = tkey_mismatch(key, pos, n->key);
1000 tn = tnode_new(n->key, newpos, 1);
1001 }
1002 else {
1003 newpos = 0;
1004 tn = tnode_new(key, newpos, 1); /* First tnode */
1005 }
1006 if(!tn)
1007 trie_bug("tnode_pfx_new failed");
1008
1009 NODE_SET_PARENT(tn, tp);
1010
1011 missbit=tkey_extract_bits(key, newpos, 1);
1012 put_child(t, tn, missbit, (struct node *)l);
1013 put_child(t, tn, 1-missbit, n);
1014
1015 if(tp) {
1016 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 }
1019 else {
1020 t->trie = (struct node*) tn; /* First tnode */
1021 tp = tn;
1022 }
1023 }
1024 if(tp && tp->pos+tp->bits > 32) {
1025 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 tp, tp->pos, tp->bits, key, plen);
1027 }
1028 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp);
1030done:;
1031 return fa_head;
1032}
1033
1034static int
1035fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037{
1038 struct trie *t = (struct trie *) tb->tb_data;
1039 struct fib_alias *fa, *new_fa;
1040 struct list_head *fa_head=NULL;
1041 struct fib_info *fi;
1042 int plen = r->rtm_dst_len;
1043 int type = r->rtm_type;
1044 u8 tos = r->rtm_tos;
1045 u32 key, mask;
1046 int err;
1047 struct leaf *l;
1048
1049 if (plen > 32)
1050 return -EINVAL;
1051
1052 key = 0;
1053 if (rta->rta_dst)
1054 memcpy(&key, rta->rta_dst, 4);
1055
1056 key = ntohl(key);
1057
1058 if(trie_debug)
1059 printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060
1061 mask = ntohl( inet_make_mask(plen) );
1062
1063 if(key & ~mask)
1064 return -EINVAL;
1065
1066 key = key & mask;
1067
1068 if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069 goto err;
1070
1071 l = fib_find_node(t, key);
1072 fa = NULL;
1073
1074 if(l) {
1075 fa_head = get_fa_head(l, plen);
1076 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 }
1078
1079 /* Now fa, if non-NULL, points to the first fib alias
1080 * with the same keys [prefix,tos,priority], if such key already
1081 * exists or to the node before which we will insert new one.
1082 *
1083 * If fa is NULL, we will need to allocate a new one and
1084 * insert to the head of f.
1085 *
1086 * If f is NULL, no fib node matched the destination key
1087 * and we need to allocate a new one of those as well.
1088 */
1089
1090 if (fa &&
1091 fa->fa_info->fib_priority == fi->fib_priority) {
1092 struct fib_alias *fa_orig;
1093
1094 err = -EEXIST;
1095 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096 goto out;
1097
1098 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099 struct fib_info *fi_drop;
1100 u8 state;
1101
1102 write_lock_bh(&fib_lock);
1103
1104 fi_drop = fa->fa_info;
1105 fa->fa_info = fi;
1106 fa->fa_type = type;
1107 fa->fa_scope = r->rtm_scope;
1108 state = fa->fa_state;
1109 fa->fa_state &= ~FA_S_ACCESSED;
1110
1111 write_unlock_bh(&fib_lock);
1112
1113 fib_release_info(fi_drop);
1114 if (state & FA_S_ACCESSED)
1115 rt_cache_flush(-1);
1116
1117 goto succeeded;
1118 }
1119 /* Error if we find a perfect match which
1120 * uses the same scope, type, and nexthop
1121 * information.
1122 */
1123 fa_orig = fa;
1124 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125 if (fa->fa_tos != tos)
1126 break;
1127 if (fa->fa_info->fib_priority != fi->fib_priority)
1128 break;
1129 if (fa->fa_type == type &&
1130 fa->fa_scope == r->rtm_scope &&
1131 fa->fa_info == fi) {
1132 goto out;
1133 }
1134 }
1135 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136 fa = fa_orig;
1137 }
1138 err = -ENOENT;
1139 if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140 goto out;
1141
1142 err = -ENOBUFS;
1143 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144 if (new_fa == NULL)
1145 goto out;
1146
1147 new_fa->fa_info = fi;
1148 new_fa->fa_tos = tos;
1149 new_fa->fa_type = type;
1150 new_fa->fa_scope = r->rtm_scope;
1151 new_fa->fa_state = 0;
1152#if 0
1153 new_fa->dst = NULL;
1154#endif
1155 /*
1156 * Insert new entry to the list.
1157 */
1158
1159 if(!fa_head)
1160 fa_head = fib_insert_node(t, key, plen);
1161
1162 write_lock_bh(&fib_lock);
1163
1164 list_add_tail(&new_fa->fa_list,
1165 (fa ? &fa->fa_list : fa_head));
1166
1167 write_unlock_bh(&fib_lock);
1168
1169 rt_cache_flush(-1);
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded:
1172 return 0;
1173out:
1174 fib_release_info(fi);
1175err:;
1176 return err;
1177}
1178
1179static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1180 struct fib_result *res, int *err)
1181{
1182 int i;
1183 t_key mask;
1184 struct leaf_info *li;
1185 struct hlist_head *hhead = &l->list;
1186 struct hlist_node *node;
1187
1188 hlist_for_each_entry(li, node, hhead, hlist) {
1189
1190 i = li->plen;
1191 mask = ntohl(inet_make_mask(i));
1192 if (l->key != (key & mask))
1193 continue;
1194
1195 if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196 *plen = i;
1197#ifdef CONFIG_IP_FIB_TRIE_STATS
1198 t->stats.semantic_match_passed++;
1199#endif
1200 return 1;
1201 }
1202#ifdef CONFIG_IP_FIB_TRIE_STATS
1203 t->stats.semantic_match_miss++;
1204#endif
1205 }
1206 return 0;
1207}
1208
1209static int
1210fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211{
1212 struct trie *t = (struct trie *) tb->tb_data;
1213 int plen, ret = 0;
1214 struct node *n;
1215 struct tnode *pn;
1216 int pos, bits;
1217 t_key key=ntohl(flp->fl4_dst);
1218 int chopped_off;
1219 t_key cindex = 0;
1220 int current_prefix_length = KEYLENGTH;
1221 n = t->trie;
1222
1223 read_lock(&fib_lock);
1224 if(!n)
1225 goto failed;
1226
1227#ifdef CONFIG_IP_FIB_TRIE_STATS
1228 t->stats.gets++;
1229#endif
1230
1231 /* Just a leaf? */
1232 if (IS_LEAF(n)) {
1233 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234 goto found;
1235 goto failed;
1236 }
1237 pn = (struct tnode *) n;
1238 chopped_off = 0;
1239
1240 while (pn) {
1241
1242 pos = pn->pos;
1243 bits = pn->bits;
1244
1245 if(!chopped_off)
1246 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247
1248 n = tnode_get_child(pn, cindex);
1249
1250 if (n == NULL) {
1251#ifdef CONFIG_IP_FIB_TRIE_STATS
1252 t->stats.null_node_hit++;
1253#endif
1254 goto backtrace;
1255 }
1256
1257 if (IS_TNODE(n)) {
1258#define HL_OPTIMIZE
1259#ifdef HL_OPTIMIZE
1260 struct tnode *cn = (struct tnode *)n;
1261 t_key node_prefix, key_prefix, pref_mismatch;
1262 int mp;
1263
1264 /*
1265 * It's a tnode, and we can do some extra checks here if we
1266 * like, to avoid descending into a dead-end branch.
1267 * This tnode is in the parent's child array at index
1268 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 * chopped off, so in reality the index may be just a
1270 * subprefix, padded with zero at the end.
1271 * We can also take a look at any skipped bits in this
1272 * tnode - everything up to p_pos is supposed to be ok,
1273 * and the non-chopped bits of the index (se previous
1274 * paragraph) are also guaranteed ok, but the rest is
1275 * considered unknown.
1276 *
1277 * The skipped bits are key[pos+bits..cn->pos].
1278 */
1279
1280 /* If current_prefix_length < pos+bits, we are already doing
1281 * actual prefix matching, which means everything from
1282 * pos+(bits-chopped_off) onward must be zero along some
1283 * branch of this subtree - otherwise there is *no* valid
1284 * prefix present. Here we can only check the skipped
1285 * bits. Remember, since we have already indexed into the
1286 * parent's child array, we know that the bits we chopped of
1287 * *are* zero.
1288 */
1289
1290 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291
1292 if (current_prefix_length < pos+bits) {
1293 if (tkey_extract_bits(cn->key, current_prefix_length,
1294 cn->pos - current_prefix_length) != 0 ||
1295 !(cn->child[0]))
1296 goto backtrace;
1297 }
1298
1299 /*
1300 * If chopped_off=0, the index is fully validated and we
1301 * only need to look at the skipped bits for this, the new,
1302 * tnode. What we actually want to do is to find out if
1303 * these skipped bits match our key perfectly, or if we will
1304 * have to count on finding a matching prefix further down,
1305 * because if we do, we would like to have some way of
1306 * verifying the existence of such a prefix at this point.
1307 */
1308
1309 /* The only thing we can do at this point is to verify that
1310 * any such matching prefix can indeed be a prefix to our
1311 * key, and if the bits in the node we are inspecting that
1312 * do not match our key are not ZERO, this cannot be true.
1313 * Thus, find out where there is a mismatch (before cn->pos)
1314 * and verify that all the mismatching bits are zero in the
1315 * new tnode's key.
1316 */
1317
1318 /* Note: We aren't very concerned about the piece of the key
1319 * that precede pn->pos+pn->bits, since these have already been
1320 * checked. The bits after cn->pos aren't checked since these are
1321 * by definition "unknown" at this point. Thus, what we want to
1322 * see is if we are about to enter the "prefix matching" state,
1323 * and in that case verify that the skipped bits that will prevail
1324 * throughout this subtree are zero, as they have to be if we are
1325 * to find a matching prefix.
1326 */
1327
1328 node_prefix = MASK_PFX(cn->key, cn->pos);
1329 key_prefix = MASK_PFX(key, cn->pos);
1330 pref_mismatch = key_prefix^node_prefix;
1331 mp = 0;
1332
1333 /* In short: If skipped bits in this node do not match the search
1334 * key, enter the "prefix matching" state.directly.
1335 */
1336 if (pref_mismatch) {
1337 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338 mp++;
1339 pref_mismatch = pref_mismatch <<1;
1340 }
1341 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342
1343 if (key_prefix != 0)
1344 goto backtrace;
1345
1346 if (current_prefix_length >= cn->pos)
1347 current_prefix_length=mp;
1348 }
1349#endif
1350 pn = (struct tnode *)n; /* Descend */
1351 chopped_off = 0;
1352 continue;
1353 }
1354 if (IS_LEAF(n)) {
1355 if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356 goto found;
1357 }
1358backtrace:
1359 chopped_off++;
1360
1361 /* As zero don't change the child key (cindex) */
1362 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363 chopped_off++;
1364 }
1365
1366 /* Decrease current_... with bits chopped off */
1367 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 current_prefix_length = pn->pos + pn->bits - chopped_off;
1369
1370 /*
1371 * Either we do the actual chop off according or if we have
1372 * chopped off all bits in this tnode walk up to our parent.
1373 */
1374
1375 if(chopped_off <= pn->bits)
1376 cindex &= ~(1 << (chopped_off-1));
1377 else {
1378 if( NODE_PARENT(pn) == NULL)
1379 goto failed;
1380
1381 /* Get Child's index */
1382 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 pn = NODE_PARENT(pn);
1384 chopped_off = 0;
1385
1386#ifdef CONFIG_IP_FIB_TRIE_STATS
1387 t->stats.backtrack++;
1388#endif
1389 goto backtrace;
1390 }
1391 }
1392failed:
1393 ret = 1;
1394found:
1395 read_unlock(&fib_lock);
1396 return ret;
1397}
1398
1399static int trie_leaf_remove(struct trie *t, t_key key)
1400{
1401 t_key cindex;
1402 struct tnode *tp = NULL;
1403 struct node *n = t->trie;
1404 struct leaf *l;
1405
1406 if(trie_debug)
1407 printk("entering trie_leaf_remove(%p)\n", n);
1408
1409 /* Note that in the case skipped bits, those bits are *not* checked!
1410 * When we finish this, we will have NULL or a T_LEAF, and the
1411 * T_LEAF may or may not match our key.
1412 */
1413
1414 while (n != NULL && IS_TNODE(n)) {
1415 struct tnode *tn = (struct tnode *) n;
1416 check_tnode(tn);
1417 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418
1419 if(n && NODE_PARENT(n) != tn) {
1420 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 BUG();
1422 }
1423 }
1424 l = (struct leaf *) n;
1425
1426 if(!n || !tkey_equals(l->key, key))
1427 return 0;
1428
1429 /*
1430 * Key found.
1431 * Remove the leaf and rebalance the tree
1432 */
1433
1434 t->revision++;
1435 t->size--;
1436
1437 tp = NODE_PARENT(n);
1438 tnode_free((struct tnode *) n);
1439
1440 if(tp) {
1441 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 put_child(t, (struct tnode *)tp, cindex, NULL);
1443 t->trie = trie_rebalance(t, tp);
1444 }
1445 else
1446 t->trie = NULL;
1447
1448 return 1;
1449}
1450
1451static int
1452fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454{
1455 struct trie *t = (struct trie *) tb->tb_data;
1456 u32 key, mask;
1457 int plen = r->rtm_dst_len;
1458 u8 tos = r->rtm_tos;
1459 struct fib_alias *fa, *fa_to_delete;
1460 struct list_head *fa_head;
1461 struct leaf *l;
1462
1463 if (plen > 32)
1464 return -EINVAL;
1465
1466 key = 0;
1467 if (rta->rta_dst)
1468 memcpy(&key, rta->rta_dst, 4);
1469
1470 key = ntohl(key);
1471 mask = ntohl( inet_make_mask(plen) );
1472
1473 if(key & ~mask)
1474 return -EINVAL;
1475
1476 key = key & mask;
1477 l = fib_find_node(t, key);
1478
1479 if(!l)
1480 return -ESRCH;
1481
1482 fa_head = get_fa_head(l, plen);
1483 fa = fib_find_alias(fa_head, tos, 0);
1484
1485 if (!fa)
1486 return -ESRCH;
1487
1488 if (trie_debug)
1489 printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490
1491 fa_to_delete = NULL;
1492 fa_head = fa->fa_list.prev;
1493 list_for_each_entry(fa, fa_head, fa_list) {
1494 struct fib_info *fi = fa->fa_info;
1495
1496 if (fa->fa_tos != tos)
1497 break;
1498
1499 if ((!r->rtm_type ||
1500 fa->fa_type == r->rtm_type) &&
1501 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502 fa->fa_scope == r->rtm_scope) &&
1503 (!r->rtm_protocol ||
1504 fi->fib_protocol == r->rtm_protocol) &&
1505 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506 fa_to_delete = fa;
1507 break;
1508 }
1509 }
1510
1511 if (fa_to_delete) {
1512 int kill_li = 0;
1513 struct leaf_info *li;
1514
1515 fa = fa_to_delete;
1516 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517
1518 l = fib_find_node(t, key);
1519 li = find_leaf_info(&l->list, plen);
1520
1521 write_lock_bh(&fib_lock);
1522
1523 list_del(&fa->fa_list);
1524
1525 if(list_empty(fa_head)) {
1526 hlist_del(&li->hlist);
1527 kill_li = 1;
1528 }
1529 write_unlock_bh(&fib_lock);
1530
1531 if(kill_li)
1532 free_leaf_info(li);
1533
1534 if(hlist_empty(&l->list))
1535 trie_leaf_remove(t, key);
1536
1537 if (fa->fa_state & FA_S_ACCESSED)
1538 rt_cache_flush(-1);
1539
1540 fn_free_alias(fa);
1541 return 0;
1542 }
1543 return -ESRCH;
1544}
1545
1546static int trie_flush_list(struct trie *t, struct list_head *head)
1547{
1548 struct fib_alias *fa, *fa_node;
1549 int found = 0;
1550
1551 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 struct fib_info *fi = fa->fa_info;
1553
1554 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555
1556 write_lock_bh(&fib_lock);
1557 list_del(&fa->fa_list);
1558 write_unlock_bh(&fib_lock);
1559
1560 fn_free_alias(fa);
1561 found++;
1562 }
1563 }
1564 return found;
1565}
1566
1567static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568{
1569 int found = 0;
1570 struct hlist_head *lih = &l->list;
1571 struct hlist_node *node, *tmp;
1572 struct leaf_info *li = NULL;
1573
1574 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575
1576 found += trie_flush_list(t, &li->falh);
1577
1578 if (list_empty(&li->falh)) {
1579
1580 write_lock_bh(&fib_lock);
1581 hlist_del(&li->hlist);
1582 write_unlock_bh(&fib_lock);
1583
1584 free_leaf_info(li);
1585 }
1586 }
1587 return found;
1588}
1589
1590static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591{
1592 struct node *c = (struct node *) thisleaf;
1593 struct tnode *p;
1594 int idx;
1595
1596 if(c == NULL) {
1597 if(t->trie == NULL)
1598 return NULL;
1599
1600 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1601 return (struct leaf *) t->trie;
1602
1603 p = (struct tnode*) t->trie; /* Start */
1604 }
1605 else
1606 p = (struct tnode *) NODE_PARENT(c);
1607 while (p) {
1608 int pos, last;
1609
1610 /* Find the next child of the parent */
1611 if(c)
1612 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 else
1614 pos = 0;
1615
1616 last = 1 << p->bits;
1617 for(idx = pos; idx < last ; idx++) {
1618 if( p->child[idx]) {
1619
1620 /* Decend if tnode */
1621
1622 while (IS_TNODE(p->child[idx])) {
1623 p = (struct tnode*) p->child[idx];
1624 idx = 0;
1625
1626 /* Rightmost non-NULL branch */
1627 if( p && IS_TNODE(p) )
1628 while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629
1630 /* Done with this tnode? */
1631 if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 goto up;
1633 }
1634 return (struct leaf*) p->child[idx];
1635 }
1636 }
1637up:
1638 /* No more children go up one step */
1639 c = (struct node*) p;
1640 p = (struct tnode *) NODE_PARENT(p);
1641 }
1642 return NULL; /* Ready. Root of trie */
1643}
1644
1645static int fn_trie_flush(struct fib_table *tb)
1646{
1647 struct trie *t = (struct trie *) tb->tb_data;
1648 struct leaf *ll = NULL, *l = NULL;
1649 int found = 0, h;
1650
1651 t->revision++;
1652
1653 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654 found += trie_flush_leaf(t, l);
1655
1656 if (ll && hlist_empty(&ll->list))
1657 trie_leaf_remove(t, ll->key);
1658 ll = l;
1659 }
1660
1661 if (ll && hlist_empty(&ll->list))
1662 trie_leaf_remove(t, ll->key);
1663
1664 if(trie_debug)
1665 printk("trie_flush found=%d\n", found);
1666 return found;
1667}
1668
/* Value of "order" recorded by the last fn_trie_select_default() run; -1 when none. */
static int trie_last_dflt=-1;

/*
 * Pick a default route for @res from the aliases stored under key 0,
 * prefix length 0 of table @tb. Runs entirely under read_lock(&fib_lock).
 *
 * When a candidate is chosen, the reference held in res->fi is dropped with
 * fib_info_put(), res->fi is replaced, and the new fib_info's fib_clntref is
 * incremented. trie_last_dflt is updated on every path past the list walk.
 */
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	read_lock(&fib_lock);

	/* Nothing to do without a non-empty alias list for 0/0. */
	l = fib_find_node(t, 0);
	if(!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if(!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		/* Only unicast aliases in the scope of the current result qualify. */
		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		/* Stop once the candidate's priority value exceeds the current result's. */
		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		/* The first nexthop must have a gateway and link scope. */
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			/* The first qualifying alias must be the current result. */
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, &trie_last_dflt)) {
			/* NOTE(review): presumably a zero return means "fi is usable" - confirm against fib_detect_death(). */
			if (res->fi)
				fib_info_put(res->fi);
			res->fi = fi;
			atomic_inc(&fi->fib_clntref);
			trie_last_dflt = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		trie_last_dflt = -1;
		goto out;
	}

	/* The last candidate seen in the walk has not been tested yet. */
	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = fi;
		atomic_inc(&fi->fib_clntref);
		trie_last_dflt = order;
		goto out;
	}
	/* Fall back to whatever fib_detect_death() left in last_resort/last_idx. */
	if (last_idx >= 0) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = last_resort;
		if (last_resort)
			atomic_inc(&last_resort->fib_clntref);
	}
	trie_last_dflt = last_idx;
 out:;
	read_unlock(&fib_lock);
}
1752
1753static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 struct sk_buff *skb, struct netlink_callback *cb)
1755{
1756 int i, s_i;
1757 struct fib_alias *fa;
1758
1759 u32 xkey=htonl(key);
1760
1761 s_i=cb->args[3];
1762 i = 0;
1763
1764 list_for_each_entry(fa, fah, fa_list) {
1765 if (i < s_i) {
1766 i++;
1767 continue;
1768 }
1769 if (fa->fa_info->fib_nh == NULL) {
1770 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771 i++;
1772 continue;
1773 }
1774 if (fa->fa_info == NULL) {
1775 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776 i++;
1777 continue;
1778 }
1779
1780 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781 cb->nlh->nlmsg_seq,
1782 RTM_NEWROUTE,
1783 tb->tb_id,
1784 fa->fa_type,
1785 fa->fa_scope,
1786 &xkey,
1787 plen,
1788 fa->fa_tos,
1789 fa->fa_info, 0) < 0) {
1790 cb->args[3] = i;
1791 return -1;
1792 }
1793 i++;
1794 }
1795 cb->args[3]=i;
1796 return skb->len;
1797}
1798
1799static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 struct netlink_callback *cb)
1801{
1802 int h, s_h;
1803 struct list_head *fa_head;
1804 struct leaf *l = NULL;
1805 s_h=cb->args[2];
1806
1807 for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808
1809 if (h < s_h)
1810 continue;
1811 if (h > s_h)
1812 memset(&cb->args[3], 0,
1813 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814
1815 fa_head = get_fa_head(l, plen);
1816
1817 if(!fa_head)
1818 continue;
1819
1820 if(list_empty(fa_head))
1821 continue;
1822
1823 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824 cb->args[2]=h;
1825 return -1;
1826 }
1827 }
1828 cb->args[2]=h;
1829 return skb->len;
1830}
1831
1832static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833{
1834 int m, s_m;
1835 struct trie *t = (struct trie *) tb->tb_data;
1836
1837 s_m = cb->args[1];
1838
1839 read_lock(&fib_lock);
1840 for (m=0; m<=32; m++) {
1841
1842 if (m < s_m)
1843 continue;
1844 if (m > s_m)
1845 memset(&cb->args[2], 0,
1846 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847
1848 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849 cb->args[1] = m;
1850 goto out;
1851 }
1852 }
1853 read_unlock(&fib_lock);
1854 cb->args[1] = m;
1855 return skb->len;
1856 out:
1857 read_unlock(&fib_lock);
1858 return -1;
1859}
1860
1861/* Fix more generic FIB names for init later */
1862
1863#ifdef CONFIG_IP_MULTIPLE_TABLES
1864struct fib_table * fib_hash_init(int id)
1865#else
1866struct fib_table * __init fib_hash_init(int id)
1867#endif
1868{
1869 struct fib_table *tb;
1870 struct trie *t;
1871
1872 if (fn_alias_kmem == NULL)
1873 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874 sizeof(struct fib_alias),
1875 0, SLAB_HWCACHE_ALIGN,
1876 NULL, NULL);
1877
1878 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879 GFP_KERNEL);
1880 if (tb == NULL)
1881 return NULL;
1882
1883 tb->tb_id = id;
1884 tb->tb_lookup = fn_trie_lookup;
1885 tb->tb_insert = fn_trie_insert;
1886 tb->tb_delete = fn_trie_delete;
1887 tb->tb_flush = fn_trie_flush;
1888 tb->tb_select_default = fn_trie_select_default;
1889 tb->tb_dump = fn_trie_dump;
1890 memset(tb->tb_data, 0, sizeof(struct trie));
1891
1892 t = (struct trie *) tb->tb_data;
1893
1894 trie_init(t);
1895
1896 if (id == RT_TABLE_LOCAL)
1897 trie_local=t;
1898 else if (id == RT_TABLE_MAIN)
1899 trie_main=t;
1900
1901 if (id == RT_TABLE_LOCAL)
1902 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903
1904 return tb;
1905}
1906
1907/* Trie dump functions */
1908
/* Emit @n copies of the indentation string to @seq. */
static void putspace_seq(struct seq_file *seq, int n)
{
	for (; n != 0; n--)
		seq_printf(seq, " ");
}
1913
/*
 * Print the low @bits bits of @v to @seq as a string of '0'/'1',
 * most significant bit first.
 */
static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
{
	/* Unsigned mask: a signed 1 << 31 would overflow int. */
	while (bits--)
		seq_printf(seq, "%s", (v & (1U << bits)) ? "1" : "0");
}
1919
1920static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 int pend, int cindex, int bits)
1922{
1923 putspace_seq(seq, indent);
1924 if (IS_LEAF(n))
1925 seq_printf(seq, "|");
1926 else
1927 seq_printf(seq, "+");
1928 if (bits) {
1929 seq_printf(seq, "%d/", cindex);
1930 printbin_seq(seq, cindex, bits);
1931 seq_printf(seq, ": ");
1932 }
1933 else
1934 seq_printf(seq, "<root>: ");
1935 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936
1937 if (IS_LEAF(n))
1938 seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 else {
1941 int plen=((struct tnode *)n)->pos;
1942 t_key prf=MASK_PFX(n->key, plen);
1943 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 }
1946 if (IS_LEAF(n)) {
1947 struct leaf *l=(struct leaf *)n;
1948 struct fib_alias *fa;
1949 int i;
1950 for (i=32; i>=0; i--)
1951 if(find_leaf_info(&l->list, i)) {
1952
1953 struct list_head *fa_head = get_fa_head(l, i);
1954
1955 if(!fa_head)
1956 continue;
1957
1958 if(list_empty(fa_head))
1959 continue;
1960
1961 putspace_seq(seq, indent+2);
1962 seq_printf(seq, "{/%d...dumping}\n", i);
1963
1964
1965 list_for_each_entry(fa, fa_head, fa_list) {
1966 putspace_seq(seq, indent+2);
1967 if (fa->fa_info->fib_nh == NULL) {
1968 seq_printf(seq, "Error _fib_nh=NULL\n");
1969 continue;
1970 }
1971 if (fa->fa_info == NULL) {
1972 seq_printf(seq, "Error fa_info=NULL\n");
1973 continue;
1974 }
1975
1976 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977 fa->fa_type,
1978 fa->fa_scope,
1979 fa->fa_tos);
1980 }
1981 }
1982 }
1983 else if (IS_TNODE(n)) {
1984 struct tnode *tn=(struct tnode *)n;
1985 putspace_seq(seq, indent); seq_printf(seq, "| ");
1986 seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988 seq_printf(seq, "}\n");
1989 putspace_seq(seq, indent); seq_printf(seq, "| ");
1990 seq_printf(seq, "{pos=%d", tn->pos);
1991 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993 putspace_seq(seq, indent); seq_printf(seq, "| ");
1994 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995 }
1996}
1997
1998static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999{
2000 struct node *n=t->trie;
2001 int cindex=0;
2002 int indent=1;
2003 int pend=0;
2004 int depth = 0;
2005
2006 read_lock(&fib_lock);
2007
2008 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009 if (n) {
2010 printnode_seq(seq, indent, n, pend, cindex, 0);
2011 if (IS_TNODE(n)) {
2012 struct tnode *tn=(struct tnode *)n;
2013 pend = tn->pos+tn->bits;
2014 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 indent += 3;
2016 depth++;
2017
2018 while (tn && cindex < (1 << tn->bits)) {
2019 if (tn->child[cindex]) {
2020
2021 /* Got a child */
2022
2023 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 if (IS_LEAF(tn->child[cindex])) {
2025 cindex++;
2026
2027 }
2028 else {
2029 /*
2030 * New tnode. Decend one level
2031 */
2032
2033 depth++;
2034 n=tn->child[cindex];
2035 tn=(struct tnode *)n;
2036 pend=tn->pos+tn->bits;
2037 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 indent+=3;
2039 cindex=0;
2040 }
2041 }
2042 else
2043 cindex++;
2044
2045 /*
2046 * Test if we are done
2047 */
2048
2049 while (cindex >= (1 << tn->bits)) {
2050
2051 /*
2052 * Move upwards and test for root
2053 * pop off all traversed nodes
2054 */
2055
2056 if (NODE_PARENT(tn) == NULL) {
2057 tn = NULL;
2058 n = NULL;
2059 break;
2060 }
2061 else {
2062 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 tn = NODE_PARENT(tn);
2064 cindex++;
2065 n=(struct node *)tn;
2066 pend=tn->pos+tn->bits;
2067 indent-=3;
2068 depth--;
2069 }
2070 }
2071 }
2072 }
2073 else n = NULL;
2074 }
2075 else seq_printf(seq, "------ trie is empty\n");
2076
2077 read_unlock(&fib_lock);
2078}
2079
2080static struct trie_stat *trie_stat_new(void)
2081{
2082 struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 int i;
2084
2085 if(s) {
2086 s->totdepth = 0;
2087 s->maxdepth = 0;
2088 s->tnodes = 0;
2089 s->leaves = 0;
2090 s->nullpointers = 0;
2091
2092 for(i=0; i< MAX_CHILDS; i++)
2093 s->nodesizes[i] = 0;
2094 }
2095 return s;
2096}
2097
2098static struct trie_stat *trie_collect_stats(struct trie *t)
2099{
2100 struct node *n=t->trie;
2101 struct trie_stat *s = trie_stat_new();
2102 int cindex = 0;
2103 int indent = 1;
2104 int pend = 0;
2105 int depth = 0;
2106
2107 read_lock(&fib_lock);
2108
2109 if (s) {
2110 if (n) {
2111 if (IS_TNODE(n)) {
2112 struct tnode *tn = (struct tnode *)n;
2113 pend=tn->pos+tn->bits;
2114 indent += 3;
2115 s->nodesizes[tn->bits]++;
2116 depth++;
2117
2118 while (tn && cindex < (1 << tn->bits)) {
2119 if (tn->child[cindex]) {
2120 /* Got a child */
2121
2122 if (IS_LEAF(tn->child[cindex])) {
2123 cindex++;
2124
2125 /* stats */
2126 if (depth > s->maxdepth)
2127 s->maxdepth = depth;
2128 s->totdepth += depth;
2129 s->leaves++;
2130 }
2131
2132 else {
2133 /*
2134 * New tnode. Decend one level
2135 */
2136
2137 s->tnodes++;
2138 s->nodesizes[tn->bits]++;
2139 depth++;
2140
2141 n = tn->child[cindex];
2142 tn = (struct tnode *)n;
2143 pend = tn->pos+tn->bits;
2144
2145 indent += 3;
2146 cindex = 0;
2147 }
2148 }
2149 else {
2150 cindex++;
2151 s->nullpointers++;
2152 }
2153
2154 /*
2155 * Test if we are done
2156 */
2157
2158 while (cindex >= (1 << tn->bits)) {
2159
2160 /*
2161 * Move upwards and test for root
2162 * pop off all traversed nodes
2163 */
2164
2165
2166 if (NODE_PARENT(tn) == NULL) {
2167 tn = NULL;
2168 n = NULL;
2169 break;
2170 }
2171 else {
2172 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 tn = NODE_PARENT(tn);
2174 cindex++;
2175 n = (struct node *)tn;
2176 pend=tn->pos+tn->bits;
2177 indent -= 3;
2178 depth--;
2179 }
2180 }
2181 }
2182 }
2183 else n = NULL;
2184 }
2185 }
2186
2187 read_unlock(&fib_lock);
2188 return s;
2189}
2190
2191#ifdef CONFIG_PROC_FS
2192
/* Stub: always returns NULL, so the sequence has no record after the start token. */
static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
{
	return NULL;
}
2197
/* Stub: always returns NULL, which ends the sequence. */
static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
{
	return NULL;
}
2202
2203static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204{
2205 void *v = NULL;
2206
2207 if (ip_fib_main_table)
2208 v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209 return v;
2210}
2211
2212static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213{
2214 ++*pos;
2215 return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216}
2217
/* seq_file stop hook: intentionally empty, start/next acquire nothing to release. */
static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
{

}
2222
2223/*
2224 * This outputs /proc/net/fib_triestats
2225 *
2226 * It always works in backward compatibility mode.
2227 * The format of the file is not supposed to be changed.
2228 */
2229
2230static void collect_and_show(struct trie *t, struct seq_file *seq)
2231{
2232 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233 int i, max, pointers;
2234 struct trie_stat *stat;
2235 int avdepth;
2236
2237 stat = trie_collect_stats(t);
2238
2239 bytes=0;
2240 seq_printf(seq, "trie=%p\n", t);
2241
2242 if (stat) {
2243 if (stat->leaves)
2244 avdepth=stat->totdepth*100 / stat->leaves;
2245 else
2246 avdepth=0;
2247 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249
2250 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 bytes += sizeof(struct leaf) * stat->leaves;
2252 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253 bytes += sizeof(struct tnode) * stat->tnodes;
2254
2255 max = MAX_CHILDS-1;
2256
2257 while (max >= 0 && stat->nodesizes[max] == 0)
2258 max--;
2259 pointers = 0;
2260
2261 for (i = 1; i <= max; i++)
2262 if (stat->nodesizes[i] != 0) {
2263 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2264 pointers += (1<<i) * stat->nodesizes[i];
2265 }
2266 seq_printf(seq, "\n");
2267 seq_printf(seq, "Pointers: %d\n", pointers);
2268 bytes += sizeof(struct node *) * pointers;
2269 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2271
2272 kfree(stat);
2273 }
2274
2275#ifdef CONFIG_IP_FIB_TRIE_STATS
2276 seq_printf(seq, "Counters:\n---------\n");
2277 seq_printf(seq,"gets = %d\n", t->stats.gets);
2278 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif
2285#endif /* CONFIG_IP_FIB_TRIE_STATS */
2286}
2287
2288static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289{
2290 char bf[128];
2291
2292 if (v == SEQ_START_TOKEN) {
2293 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 sizeof(struct leaf), sizeof(struct tnode));
2295 if (trie_local)
2296 collect_and_show(trie_local, seq);
2297
2298 if (trie_main)
2299 collect_and_show(trie_main, seq);
2300 }
2301 else {
2302 snprintf(bf, sizeof(bf),
2303 "*\t%08X\t%08X", 200, 400);
2304
2305 seq_printf(seq, "%-127s\n", bf);
2306 }
2307 return 0;
2308}
2309
/* seq_file iterator hooks for the "fib_triestat" proc entry. */
static struct seq_operations fib_triestat_seq_ops = {
	.start = fib_triestat_seq_start,
	.next = fib_triestat_seq_next,
	.stop = fib_triestat_seq_stop,
	.show = fib_triestat_seq_show,
};
2316
2317static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318{
2319 struct seq_file *seq;
2320 int rc = -ENOMEM;
2321
2322 rc = seq_open(file, &fib_triestat_seq_ops);
2323 if (rc)
2324 goto out_kfree;
2325
2326 seq = file->private_data;
2327out:
2328 return rc;
2329out_kfree:
2330 goto out;
2331}
2332
/* File operations for the "fib_triestat" proc entry; reading goes through seq_file. */
static struct file_operations fib_triestat_seq_fops = {
	.owner = THIS_MODULE,
	.open = fib_triestat_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
2340
2341int __init fib_stat_proc_init(void)
2342{
2343 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344 return -ENOMEM;
2345 return 0;
2346}
2347
/* Remove the "fib_triestat" proc entry created by fib_stat_proc_init(). */
void __init fib_stat_proc_exit(void)
{
	proc_net_remove("fib_triestat");
}
2352
/* Stub: always returns NULL, so the sequence has no record after the start token. */
static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
{
	return NULL;
}
2357
/* Stub: always returns NULL, which ends the sequence. */
static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
{
	return NULL;
}
2362
2363static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364{
2365 void *v = NULL;
2366
2367 if (ip_fib_main_table)
2368 v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369 return v;
2370}
2371
2372static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373{
2374 ++*pos;
2375 return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376}
2377
/* seq_file stop hook: intentionally empty, start/next acquire nothing to release. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{

}
2382
2383/*
2384 * This outputs /proc/net/fib_trie.
2385 *
2386 * It always works in backward compatibility mode.
2387 * The format of the file is not supposed to be changed.
2388 */
2389
2390static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{
2392 char bf[128];
2393
2394 if (v == SEQ_START_TOKEN) {
2395 if (trie_local)
2396 trie_dump_seq(seq, trie_local);
2397
2398 if (trie_main)
2399 trie_dump_seq(seq, trie_main);
2400 }
2401
2402 else {
2403 snprintf(bf, sizeof(bf),
2404 "*\t%08X\t%08X", 200, 400);
2405 seq_printf(seq, "%-127s\n", bf);
2406 }
2407
2408 return 0;
2409}
2410
/* seq_file iterator hooks for the "fib_trie" proc entry. */
static struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next = fib_trie_seq_next,
	.stop = fib_trie_seq_stop,
	.show = fib_trie_seq_show,
};
2417
2418static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419{
2420 struct seq_file *seq;
2421 int rc = -ENOMEM;
2422
2423 rc = seq_open(file, &fib_trie_seq_ops);
2424 if (rc)
2425 goto out_kfree;
2426
2427 seq = file->private_data;
2428out:
2429 return rc;
2430out_kfree:
2431 goto out;
2432}
2433
/* File operations for the "fib_trie" proc entry; reading goes through seq_file. */
static struct file_operations fib_trie_seq_fops = {
	.owner = THIS_MODULE,
	.open = fib_trie_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
2441
2442int __init fib_proc_init(void)
2443{
2444 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445 return -ENOMEM;
2446 return 0;
2447}
2448
/* Remove the "fib_trie" proc entry created by fib_proc_init(). */
void __init fib_proc_exit(void)
{
	proc_net_remove("fib_trie");
}
2453
2454#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7c..af2ec88bbb2f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
184 raw_rcv(last, skb2); 184 raw_rcv(last, skb2);
185 } 185 }
186 last = sk; 186 last = sk;
187 nf_reset(skb);
187 } 188 }
188 } 189 }
189 190
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
200{ 201{
201 int ihl = skb->nh.iph->ihl*4; 202 int ihl = skb->nh.iph->ihl*4;
202 203
203#ifdef CONFIG_NETFILTER_DEBUG
204 nf_debug_ip_local_deliver(skb);
205#endif /*CONFIG_NETFILTER_DEBUG*/
206
207 __skb_pull(skb, ihl); 204 __skb_pull(skb, ihl);
208 205
209 /* Free reference early: we don't need it any more, and it may 206 /* Free reference early: we don't need it any more, and it may
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d65..ee07aec215a0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110
111#ifdef CONFIG_NETFILTER_DEBUG
112 nf_debug_ip_loopback_xmit(newskb);
113#endif
114 nf_reset(newskb); 110 nf_reset(newskb);
115 netif_rx(newskb); 111 netif_rx(newskb);
116 return 0; 112 return 0;
@@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
192 skb = skb2; 188 skb = skb2;
193 } 189 }
194 190
195#ifdef CONFIG_NETFILTER_DEBUG
196 nf_debug_ip_finish_output2(skb);
197#endif /*CONFIG_NETFILTER_DEBUG*/
198
199 nf_reset(skb); 191 nf_reset(skb);
200 192
201 if (hh) { 193 if (hh) {
@@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
415 to->nf_bridge = from->nf_bridge; 407 to->nf_bridge = from->nf_bridge;
416 nf_bridge_get(to->nf_bridge); 408 nf_bridge_get(to->nf_bridge);
417#endif 409#endif
418#ifdef CONFIG_NETFILTER_DEBUG
419 to->nf_debug = from->nf_debug;
420#endif
421#endif 410#endif
422} 411}
423 412
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c5263b99..2065944fd9e5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
236 t->props.mode = 1; 236 t->props.mode = 1;
237 t->props.saddr.a4 = x->props.saddr.a4; 237 t->props.saddr.a4 = x->props.saddr.a4;
238 t->props.flags = x->props.flags; 238 t->props.flags = x->props.flags;
239 239
240 t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); 240 if (xfrm_init_state(t))
241 if (t->type == NULL)
242 goto error;
243
244 if (t->type->init_state(t, NULL))
245 goto error; 241 goto error;
246 242
247 t->km.state = XFRM_STATE_VALID;
248 atomic_set(&t->tunnel_users, 1); 243 atomic_set(&t->tunnel_users, 1);
249out: 244out:
250 return t; 245 return t;
@@ -422,7 +417,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
422 kfree(ipcd); 417 kfree(ipcd);
423} 418}
424 419
425static int ipcomp_init_state(struct xfrm_state *x, void *args) 420static int ipcomp_init_state(struct xfrm_state *x)
426{ 421{
427 int err; 422 int err;
428 struct ipcomp_data *ipcd; 423 struct ipcomp_data *ipcd;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62a..e4f809a93f47 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb)
1350 */ 1350 */
1351 read_lock(&mrt_lock); 1351 read_lock(&mrt_lock);
1352 if (mroute_socket) { 1352 if (mroute_socket) {
1353 nf_reset(skb);
1353 raw_rcv(mroute_socket, skb); 1354 raw_rcv(mroute_socket, skb);
1354 read_unlock(&mrt_lock); 1355 read_unlock(&mrt_lock);
1355 return 0; 1356 return 0;
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057f..a8512a3fd08a 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
127 127
128#define IP_VS_XMIT(skb, rt) \ 128#define IP_VS_XMIT(skb, rt) \
129do { \ 129do { \
130 nf_reset_debug(skb); \
131 (skb)->nfcache |= NFC_IPVS_PROPERTY; \ 130 (skb)->nfcache |= NFC_IPVS_PROPERTY; \
132 (skb)->ip_summed = CHECKSUM_NONE; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \
133 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0a..fa1634256680 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
60 60
61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 61#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) 62#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
63#include <linux/netfilter_ipv4/lockhelp.h>
64#include <linux/netfilter_ipv4/listhelp.h> 63#include <linux/netfilter_ipv4/listhelp.h>
65 64
66struct arpt_table_info { 65struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd062605..a78a320eee08 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
26#include <net/checksum.h> 26#include <net/checksum.h>
27#include <net/udp.h> 27#include <net/udp.h>
28 28
29#include <linux/netfilter_ipv4/lockhelp.h>
30#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 29#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
31#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> 30#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
32 31
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
42 41
43/* This is slow, but it's simple. --RR */ 42/* This is slow, but it's simple. --RR */
44static char amanda_buffer[65536]; 43static char amanda_buffer[65536];
45static DECLARE_LOCK(amanda_buffer_lock); 44static DEFINE_SPINLOCK(amanda_buffer_lock);
46 45
47unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, 46unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
48 enum ip_conntrack_info ctinfo, 47 enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
76 return NF_ACCEPT; 75 return NF_ACCEPT;
77 } 76 }
78 77
79 LOCK_BH(&amanda_buffer_lock); 78 spin_lock_bh(&amanda_buffer_lock);
80 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); 79 skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
81 data = amanda_buffer; 80 data = amanda_buffer;
82 data_limit = amanda_buffer + (*pskb)->len - dataoff; 81 data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb,
134 } 133 }
135 134
136out: 135out:
137 UNLOCK_BH(&amanda_buffer_lock); 136 spin_unlock_bh(&amanda_buffer_lock);
138 return ret; 137 return ret;
139} 138}
140 139
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e824622977..4b78ebeb6635 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40 40
41/* This rwlock protects the main hash table, protocol/helper/expected 41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/ 42 registrations, conntrack timers*/
43#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 43#define ASSERT_READ_LOCK(x)
44#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 44#define ASSERT_WRITE_LOCK(x)
45 45
46#include <linux/netfilter_ipv4/ip_conntrack.h> 46#include <linux/netfilter_ipv4/ip_conntrack.h>
47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 47#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
57#define DEBUGP(format, args...) 57#define DEBUGP(format, args...)
58#endif 58#endif
59 59
60DECLARE_RWLOCK(ip_conntrack_lock); 60DEFINE_RWLOCK(ip_conntrack_lock);
61 61
62/* ip_conntrack_standalone needs this */ 62/* ip_conntrack_standalone needs this */
63atomic_t ip_conntrack_count = ATOMIC_INIT(0); 63atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp)
147 147
148static void unlink_expect(struct ip_conntrack_expect *exp) 148static void unlink_expect(struct ip_conntrack_expect *exp)
149{ 149{
150 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 150 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
151 list_del(&exp->list); 151 list_del(&exp->list);
152 /* Logically in destroy_expect, but we hold the lock here. */ 152 /* Logically in destroy_expect, but we hold the lock here. */
153 exp->master->expecting--; 153 exp->master->expecting--;
@@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect)
157{ 157{
158 struct ip_conntrack_expect *exp = (void *)ul_expect; 158 struct ip_conntrack_expect *exp = (void *)ul_expect;
159 159
160 WRITE_LOCK(&ip_conntrack_lock); 160 write_lock_bh(&ip_conntrack_lock);
161 unlink_expect(exp); 161 unlink_expect(exp);
162 WRITE_UNLOCK(&ip_conntrack_lock); 162 write_unlock_bh(&ip_conntrack_lock);
163 destroy_expect(exp); 163 destroy_expect(exp);
164} 164}
165 165
@@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct)
209 unsigned int ho, hr; 209 unsigned int ho, hr;
210 210
211 DEBUGP("clean_from_lists(%p)\n", ct); 211 DEBUGP("clean_from_lists(%p)\n", ct);
212 MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); 212 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
213 213
214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 214 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); 215 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
240 if (ip_conntrack_destroyed) 240 if (ip_conntrack_destroyed)
241 ip_conntrack_destroyed(ct); 241 ip_conntrack_destroyed(ct);
242 242
243 WRITE_LOCK(&ip_conntrack_lock); 243 write_lock_bh(&ip_conntrack_lock);
244 /* Expectations will have been removed in clean_from_lists, 244 /* Expectations will have been removed in clean_from_lists,
245 * except TFTP can create an expectation on the first packet, 245 * except TFTP can create an expectation on the first packet,
246 * before connection is in the list, so we need to clean here, 246 * before connection is in the list, so we need to clean here,
@@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
254 } 254 }
255 255
256 CONNTRACK_STAT_INC(delete); 256 CONNTRACK_STAT_INC(delete);
257 WRITE_UNLOCK(&ip_conntrack_lock); 257 write_unlock_bh(&ip_conntrack_lock);
258 258
259 if (ct->master) 259 if (ct->master)
260 ip_conntrack_put(ct->master); 260 ip_conntrack_put(ct->master);
@@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
268{ 268{
269 struct ip_conntrack *ct = (void *)ul_conntrack; 269 struct ip_conntrack *ct = (void *)ul_conntrack;
270 270
271 WRITE_LOCK(&ip_conntrack_lock); 271 write_lock_bh(&ip_conntrack_lock);
272 /* Inside lock so preempt is disabled on module removal path. 272 /* Inside lock so preempt is disabled on module removal path.
273 * Otherwise we can get spurious warnings. */ 273 * Otherwise we can get spurious warnings. */
274 CONNTRACK_STAT_INC(delete_list); 274 CONNTRACK_STAT_INC(delete_list);
275 clean_from_lists(ct); 275 clean_from_lists(ct);
276 WRITE_UNLOCK(&ip_conntrack_lock); 276 write_unlock_bh(&ip_conntrack_lock);
277 ip_conntrack_put(ct); 277 ip_conntrack_put(ct);
278} 278}
279 279
@@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
282 const struct ip_conntrack_tuple *tuple, 282 const struct ip_conntrack_tuple *tuple,
283 const struct ip_conntrack *ignored_conntrack) 283 const struct ip_conntrack *ignored_conntrack)
284{ 284{
285 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 285 ASSERT_READ_LOCK(&ip_conntrack_lock);
286 return tuplehash_to_ctrack(i) != ignored_conntrack 286 return tuplehash_to_ctrack(i) != ignored_conntrack
287 && ip_ct_tuple_equal(tuple, &i->tuple); 287 && ip_ct_tuple_equal(tuple, &i->tuple);
288} 288}
@@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
294 struct ip_conntrack_tuple_hash *h; 294 struct ip_conntrack_tuple_hash *h;
295 unsigned int hash = hash_conntrack(tuple); 295 unsigned int hash = hash_conntrack(tuple);
296 296
297 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 297 ASSERT_READ_LOCK(&ip_conntrack_lock);
298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) { 298 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { 299 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
300 CONNTRACK_STAT_INC(found); 300 CONNTRACK_STAT_INC(found);
@@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
313{ 313{
314 struct ip_conntrack_tuple_hash *h; 314 struct ip_conntrack_tuple_hash *h;
315 315
316 READ_LOCK(&ip_conntrack_lock); 316 read_lock_bh(&ip_conntrack_lock);
317 h = __ip_conntrack_find(tuple, ignored_conntrack); 317 h = __ip_conntrack_find(tuple, ignored_conntrack);
318 if (h) 318 if (h)
319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 319 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
320 READ_UNLOCK(&ip_conntrack_lock); 320 read_unlock_bh(&ip_conntrack_lock);
321 321
322 return h; 322 return h;
323} 323}
@@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
352 IP_NF_ASSERT(!is_confirmed(ct)); 352 IP_NF_ASSERT(!is_confirmed(ct));
353 DEBUGP("Confirming conntrack %p\n", ct); 353 DEBUGP("Confirming conntrack %p\n", ct);
354 354
355 WRITE_LOCK(&ip_conntrack_lock); 355 write_lock_bh(&ip_conntrack_lock);
356 356
357 /* See if there's one in the list already, including reverse: 357 /* See if there's one in the list already, including reverse:
358 NAT could have grabbed it without realizing, since we're 358 NAT could have grabbed it without realizing, since we're
@@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
380 atomic_inc(&ct->ct_general.use); 380 atomic_inc(&ct->ct_general.use);
381 set_bit(IPS_CONFIRMED_BIT, &ct->status); 381 set_bit(IPS_CONFIRMED_BIT, &ct->status);
382 CONNTRACK_STAT_INC(insert); 382 CONNTRACK_STAT_INC(insert);
383 WRITE_UNLOCK(&ip_conntrack_lock); 383 write_unlock_bh(&ip_conntrack_lock);
384 return NF_ACCEPT; 384 return NF_ACCEPT;
385 } 385 }
386 386
387 CONNTRACK_STAT_INC(insert_failed); 387 CONNTRACK_STAT_INC(insert_failed);
388 WRITE_UNLOCK(&ip_conntrack_lock); 388 write_unlock_bh(&ip_conntrack_lock);
389 389
390 return NF_DROP; 390 return NF_DROP;
391} 391}
@@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
398{ 398{
399 struct ip_conntrack_tuple_hash *h; 399 struct ip_conntrack_tuple_hash *h;
400 400
401 READ_LOCK(&ip_conntrack_lock); 401 read_lock_bh(&ip_conntrack_lock);
402 h = __ip_conntrack_find(tuple, ignored_conntrack); 402 h = __ip_conntrack_find(tuple, ignored_conntrack);
403 READ_UNLOCK(&ip_conntrack_lock); 403 read_unlock_bh(&ip_conntrack_lock);
404 404
405 return h != NULL; 405 return h != NULL;
406} 406}
@@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain)
419 struct ip_conntrack *ct = NULL; 419 struct ip_conntrack *ct = NULL;
420 int dropped = 0; 420 int dropped = 0;
421 421
422 READ_LOCK(&ip_conntrack_lock); 422 read_lock_bh(&ip_conntrack_lock);
423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); 423 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
424 if (h) { 424 if (h) {
425 ct = tuplehash_to_ctrack(h); 425 ct = tuplehash_to_ctrack(h);
426 atomic_inc(&ct->ct_general.use); 426 atomic_inc(&ct->ct_general.use);
427 } 427 }
428 READ_UNLOCK(&ip_conntrack_lock); 428 read_unlock_bh(&ip_conntrack_lock);
429 429
430 if (!ct) 430 if (!ct)
431 return dropped; 431 return dropped;
@@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
508 conntrack->timeout.data = (unsigned long)conntrack; 508 conntrack->timeout.data = (unsigned long)conntrack;
509 conntrack->timeout.function = death_by_timeout; 509 conntrack->timeout.function = death_by_timeout;
510 510
511 WRITE_LOCK(&ip_conntrack_lock); 511 write_lock_bh(&ip_conntrack_lock);
512 exp = find_expectation(tuple); 512 exp = find_expectation(tuple);
513 513
514 if (exp) { 514 if (exp) {
@@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); 532 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
533 533
534 atomic_inc(&ip_conntrack_count); 534 atomic_inc(&ip_conntrack_count);
535 WRITE_UNLOCK(&ip_conntrack_lock); 535 write_unlock_bh(&ip_conntrack_lock);
536 536
537 if (exp) { 537 if (exp) {
538 if (exp->expectfn) 538 if (exp->expectfn)
@@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
723{ 723{
724 struct ip_conntrack_expect *i; 724 struct ip_conntrack_expect *i;
725 725
726 WRITE_LOCK(&ip_conntrack_lock); 726 write_lock_bh(&ip_conntrack_lock);
727 /* choose the the oldest expectation to evict */ 727 /* choose the the oldest expectation to evict */
728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { 728 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
729 if (expect_matches(i, exp) && del_timer(&i->timeout)) { 729 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
730 unlink_expect(i); 730 unlink_expect(i);
731 WRITE_UNLOCK(&ip_conntrack_lock); 731 write_unlock_bh(&ip_conntrack_lock);
732 destroy_expect(i); 732 destroy_expect(i);
733 return; 733 return;
734 } 734 }
735 } 735 }
736 WRITE_UNLOCK(&ip_conntrack_lock); 736 write_unlock_bh(&ip_conntrack_lock);
737} 737}
738 738
739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) 739struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
@@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
760 exp->master->expecting++; 760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list); 761 list_add(&exp->list, &ip_conntrack_expect_list);
762 762
763 if (exp->master->helper->timeout) { 763 init_timer(&exp->timeout);
764 init_timer(&exp->timeout); 764 exp->timeout.data = (unsigned long)exp;
765 exp->timeout.data = (unsigned long)exp; 765 exp->timeout.function = expectation_timed_out;
766 exp->timeout.function = expectation_timed_out; 766 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
767 exp->timeout.expires 767 add_timer(&exp->timeout);
768 = jiffies + exp->master->helper->timeout * HZ;
769 add_timer(&exp->timeout);
770 } else
771 exp->timeout.function = NULL;
772 768
773 CONNTRACK_STAT_INC(expect_create); 769 CONNTRACK_STAT_INC(expect_create);
774} 770}
@@ -808,7 +804,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
808 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); 804 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
809 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); 805 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
810 806
811 WRITE_LOCK(&ip_conntrack_lock); 807 write_lock_bh(&ip_conntrack_lock);
812 list_for_each_entry(i, &ip_conntrack_expect_list, list) { 808 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
813 if (expect_matches(i, expect)) { 809 if (expect_matches(i, expect)) {
814 /* Refresh timer: if it's dying, ignore.. */ 810 /* Refresh timer: if it's dying, ignore.. */
@@ -832,7 +828,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
832 ip_conntrack_expect_insert(expect); 828 ip_conntrack_expect_insert(expect);
833 ret = 0; 829 ret = 0;
834out: 830out:
835 WRITE_UNLOCK(&ip_conntrack_lock); 831 write_unlock_bh(&ip_conntrack_lock);
836 return ret; 832 return ret;
837} 833}
838 834
@@ -841,7 +837,7 @@ out:
841void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, 837void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
842 const struct ip_conntrack_tuple *newreply) 838 const struct ip_conntrack_tuple *newreply)
843{ 839{
844 WRITE_LOCK(&ip_conntrack_lock); 840 write_lock_bh(&ip_conntrack_lock);
845 /* Should be unconfirmed, so not in hash table yet */ 841 /* Should be unconfirmed, so not in hash table yet */
846 IP_NF_ASSERT(!is_confirmed(conntrack)); 842 IP_NF_ASSERT(!is_confirmed(conntrack));
847 843
@@ -851,15 +847,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
851 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 847 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
852 if (!conntrack->master && conntrack->expecting == 0) 848 if (!conntrack->master && conntrack->expecting == 0)
853 conntrack->helper = ip_ct_find_helper(newreply); 849 conntrack->helper = ip_ct_find_helper(newreply);
854 WRITE_UNLOCK(&ip_conntrack_lock); 850 write_unlock_bh(&ip_conntrack_lock);
855} 851}
856 852
857int ip_conntrack_helper_register(struct ip_conntrack_helper *me) 853int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
858{ 854{
859 BUG_ON(me->timeout == 0); 855 BUG_ON(me->timeout == 0);
860 WRITE_LOCK(&ip_conntrack_lock); 856 write_lock_bh(&ip_conntrack_lock);
861 list_prepend(&helpers, me); 857 list_prepend(&helpers, me);
862 WRITE_UNLOCK(&ip_conntrack_lock); 858 write_unlock_bh(&ip_conntrack_lock);
863 859
864 return 0; 860 return 0;
865} 861}
@@ -878,7 +874,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
878 struct ip_conntrack_expect *exp, *tmp; 874 struct ip_conntrack_expect *exp, *tmp;
879 875
880 /* Need write lock here, to delete helper. */ 876 /* Need write lock here, to delete helper. */
881 WRITE_LOCK(&ip_conntrack_lock); 877 write_lock_bh(&ip_conntrack_lock);
882 LIST_DELETE(&helpers, me); 878 LIST_DELETE(&helpers, me);
883 879
884 /* Get rid of expectations */ 880 /* Get rid of expectations */
@@ -893,7 +889,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
893 for (i = 0; i < ip_conntrack_htable_size; i++) 889 for (i = 0; i < ip_conntrack_htable_size; i++)
894 LIST_FIND_W(&ip_conntrack_hash[i], unhelp, 890 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
895 struct ip_conntrack_tuple_hash *, me); 891 struct ip_conntrack_tuple_hash *, me);
896 WRITE_UNLOCK(&ip_conntrack_lock); 892 write_unlock_bh(&ip_conntrack_lock);
897 893
898 /* Someone could be still looking at the helper in a bh. */ 894 /* Someone could be still looking at the helper in a bh. */
899 synchronize_net(); 895 synchronize_net();
@@ -925,14 +921,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
925 ct->timeout.expires = extra_jiffies; 921 ct->timeout.expires = extra_jiffies;
926 ct_add_counters(ct, ctinfo, skb); 922 ct_add_counters(ct, ctinfo, skb);
927 } else { 923 } else {
928 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
929 /* Need del_timer for race avoidance (may already be dying). */ 925 /* Need del_timer for race avoidance (may already be dying). */
930 if (del_timer(&ct->timeout)) { 926 if (del_timer(&ct->timeout)) {
931 ct->timeout.expires = jiffies + extra_jiffies; 927 ct->timeout.expires = jiffies + extra_jiffies;
932 add_timer(&ct->timeout); 928 add_timer(&ct->timeout);
933 } 929 }
934 ct_add_counters(ct, ctinfo, skb); 930 ct_add_counters(ct, ctinfo, skb);
935 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
936 } 932 }
937} 933}
938 934
@@ -940,10 +936,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
940struct sk_buff * 936struct sk_buff *
941ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) 937ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
942{ 938{
943#ifdef CONFIG_NETFILTER_DEBUG
944 unsigned int olddebug = skb->nf_debug;
945#endif
946
947 skb_orphan(skb); 939 skb_orphan(skb);
948 940
949 local_bh_disable(); 941 local_bh_disable();
@@ -953,12 +945,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
953 if (skb) { 945 if (skb) {
954 ip_send_check(skb->nh.iph); 946 ip_send_check(skb->nh.iph);
955 skb->nfcache |= NFC_ALTERED; 947 skb->nfcache |= NFC_ALTERED;
956#ifdef CONFIG_NETFILTER_DEBUG
957 /* Packet path as if nothing had happened. */
958 skb->nf_debug = olddebug;
959#endif
960 } 948 }
961
962 return skb; 949 return skb;
963} 950}
964 951
@@ -997,7 +984,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
997{ 984{
998 struct ip_conntrack_tuple_hash *h = NULL; 985 struct ip_conntrack_tuple_hash *h = NULL;
999 986
1000 WRITE_LOCK(&ip_conntrack_lock); 987 write_lock_bh(&ip_conntrack_lock);
1001 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { 988 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1002 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, 989 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1003 struct ip_conntrack_tuple_hash *, iter, data); 990 struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +996,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1009 struct ip_conntrack_tuple_hash *, iter, data); 996 struct ip_conntrack_tuple_hash *, iter, data);
1010 if (h) 997 if (h)
1011 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); 998 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1012 WRITE_UNLOCK(&ip_conntrack_lock); 999 write_unlock_bh(&ip_conntrack_lock);
1013 1000
1014 return h; 1001 return h;
1015} 1002}
@@ -1201,14 +1188,14 @@ int __init ip_conntrack_init(void)
1201 } 1188 }
1202 1189
1203 /* Don't NEED lock here, but good form anyway. */ 1190 /* Don't NEED lock here, but good form anyway. */
1204 WRITE_LOCK(&ip_conntrack_lock); 1191 write_lock_bh(&ip_conntrack_lock);
1205 for (i = 0; i < MAX_IP_CT_PROTO; i++) 1192 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1206 ip_ct_protos[i] = &ip_conntrack_generic_protocol; 1193 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1207 /* Sew in builtin protocols. */ 1194 /* Sew in builtin protocols. */
1208 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; 1195 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1209 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; 1196 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1210 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; 1197 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1211 WRITE_UNLOCK(&ip_conntrack_lock); 1198 write_unlock_bh(&ip_conntrack_lock);
1212 1199
1213 for (i = 0; i < ip_conntrack_htable_size; i++) 1200 for (i = 0; i < ip_conntrack_htable_size; i++)
1214 INIT_LIST_HEAD(&ip_conntrack_hash[i]); 1201 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa788..fea6dd2a00b6 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
16#include <net/checksum.h> 16#include <net/checksum.h>
17#include <net/tcp.h> 17#include <net/tcp.h>
18 18
19#include <linux/netfilter_ipv4/lockhelp.h>
20#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 19#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
21#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> 20#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
22#include <linux/moduleparam.h> 21#include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
28/* This is slow, but it's simple. --RR */ 27/* This is slow, but it's simple. --RR */
29static char ftp_buffer[65536]; 28static char ftp_buffer[65536];
30 29
31static DECLARE_LOCK(ip_ftp_lock); 30static DEFINE_SPINLOCK(ip_ftp_lock);
32 31
33#define MAX_PORTS 8 32#define MAX_PORTS 8
34static int ports[MAX_PORTS]; 33static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
319 } 318 }
320 datalen = (*pskb)->len - dataoff; 319 datalen = (*pskb)->len - dataoff;
321 320
322 LOCK_BH(&ip_ftp_lock); 321 spin_lock_bh(&ip_ftp_lock);
323 fb_ptr = skb_header_pointer(*pskb, dataoff, 322 fb_ptr = skb_header_pointer(*pskb, dataoff,
324 (*pskb)->len - dataoff, ftp_buffer); 323 (*pskb)->len - dataoff, ftp_buffer);
325 BUG_ON(fb_ptr == NULL); 324 BUG_ON(fb_ptr == NULL);
@@ -442,7 +441,7 @@ out_update_nl:
442 if (ends_in_nl) 441 if (ends_in_nl)
443 update_nl_seq(seq, ct_ftp_info,dir); 442 update_nl_seq(seq, ct_ftp_info,dir);
444 out: 443 out:
445 UNLOCK_BH(&ip_ftp_lock); 444 spin_unlock_bh(&ip_ftp_lock);
446 return ret; 445 return ret;
447} 446}
448 447
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6ee..cd98772cc332 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
29#include <net/checksum.h> 29#include <net/checksum.h>
30#include <net/tcp.h> 30#include <net/tcp.h>
31 31
32#include <linux/netfilter_ipv4/lockhelp.h>
33#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 32#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
34#include <linux/netfilter_ipv4/ip_conntrack_irc.h> 33#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
35#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
41static unsigned int dcc_timeout = 300; 40static unsigned int dcc_timeout = 300;
42/* This is slow, but it's simple. --RR */ 41/* This is slow, but it's simple. --RR */
43static char irc_buffer[65536]; 42static char irc_buffer[65536];
44static DECLARE_LOCK(irc_buffer_lock); 43static DEFINE_SPINLOCK(irc_buffer_lock);
45 44
46unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, 45unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
47 enum ip_conntrack_info ctinfo, 46 enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
141 if (dataoff >= (*pskb)->len) 140 if (dataoff >= (*pskb)->len)
142 return NF_ACCEPT; 141 return NF_ACCEPT;
143 142
144 LOCK_BH(&irc_buffer_lock); 143 spin_lock_bh(&irc_buffer_lock);
145 ib_ptr = skb_header_pointer(*pskb, dataoff, 144 ib_ptr = skb_header_pointer(*pskb, dataoff,
146 (*pskb)->len - dataoff, irc_buffer); 145 (*pskb)->len - dataoff, irc_buffer);
147 BUG_ON(ib_ptr == NULL); 146 BUG_ON(ib_ptr == NULL);
@@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb,
237 } /* while data < ... */ 236 } /* while data < ... */
238 237
239 out: 238 out:
240 UNLOCK_BH(&irc_buffer_lock); 239 spin_unlock_bh(&irc_buffer_lock);
241 return ret; 240 return ret;
242} 241}
243 242
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860ff..31d75390bf12 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/netfilter_ipv4/ip_conntrack.h> 27#include <linux/netfilter_ipv4/ip_conntrack.h>
28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 28#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
29#include <linux/netfilter_ipv4/lockhelp.h>
30 29
31#if 0 30#if 0
32#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) 31#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
35#endif 34#endif
36 35
37/* Protects conntrack->proto.sctp */ 36/* Protects conntrack->proto.sctp */
38static DECLARE_RWLOCK(sctp_lock); 37static DEFINE_RWLOCK(sctp_lock);
39 38
40/* FIXME: Examine ipfilter's timeouts and conntrack transitions more 39/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
41 closely. They're more complex. --RR 40 closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
199 DEBUGP(__FUNCTION__); 198 DEBUGP(__FUNCTION__);
200 DEBUGP("\n"); 199 DEBUGP("\n");
201 200
202 READ_LOCK(&sctp_lock); 201 read_lock_bh(&sctp_lock);
203 state = conntrack->proto.sctp.state; 202 state = conntrack->proto.sctp.state;
204 READ_UNLOCK(&sctp_lock); 203 read_unlock_bh(&sctp_lock);
205 204
206 return seq_printf(s, "%s ", sctp_conntrack_names[state]); 205 return seq_printf(s, "%s ", sctp_conntrack_names[state]);
207} 206}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
343 342
344 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; 343 oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
345 for_each_sctp_chunk (skb, sch, _sch, offset, count) { 344 for_each_sctp_chunk (skb, sch, _sch, offset, count) {
346 WRITE_LOCK(&sctp_lock); 345 write_lock_bh(&sctp_lock);
347 346
348 /* Special cases of Verification tag check (Sec 8.5.1) */ 347 /* Special cases of Verification tag check (Sec 8.5.1) */
349 if (sch->type == SCTP_CID_INIT) { 348 if (sch->type == SCTP_CID_INIT) {
350 /* Sec 8.5.1 (A) */ 349 /* Sec 8.5.1 (A) */
351 if (sh->vtag != 0) { 350 if (sh->vtag != 0) {
352 WRITE_UNLOCK(&sctp_lock); 351 write_unlock_bh(&sctp_lock);
353 return -1; 352 return -1;
354 } 353 }
355 } else if (sch->type == SCTP_CID_ABORT) { 354 } else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
357 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) 356 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
358 && !(sh->vtag == conntrack->proto.sctp.vtag 357 && !(sh->vtag == conntrack->proto.sctp.vtag
359 [1 - CTINFO2DIR(ctinfo)])) { 358 [1 - CTINFO2DIR(ctinfo)])) {
360 WRITE_UNLOCK(&sctp_lock); 359 write_unlock_bh(&sctp_lock);
361 return -1; 360 return -1;
362 } 361 }
363 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { 362 } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
366 && !(sh->vtag == conntrack->proto.sctp.vtag 365 && !(sh->vtag == conntrack->proto.sctp.vtag
367 [1 - CTINFO2DIR(ctinfo)] 366 [1 - CTINFO2DIR(ctinfo)]
368 && (sch->flags & 1))) { 367 && (sch->flags & 1))) {
369 WRITE_UNLOCK(&sctp_lock); 368 write_unlock_bh(&sctp_lock);
370 return -1; 369 return -1;
371 } 370 }
372 } else if (sch->type == SCTP_CID_COOKIE_ECHO) { 371 } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
373 /* Sec 8.5.1 (D) */ 372 /* Sec 8.5.1 (D) */
374 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { 373 if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
375 WRITE_UNLOCK(&sctp_lock); 374 write_unlock_bh(&sctp_lock);
376 return -1; 375 return -1;
377 } 376 }
378 } 377 }
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
384 if (newconntrack == SCTP_CONNTRACK_MAX) { 383 if (newconntrack == SCTP_CONNTRACK_MAX) {
385 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", 384 DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
386 CTINFO2DIR(ctinfo), sch->type, oldsctpstate); 385 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
387 WRITE_UNLOCK(&sctp_lock); 386 write_unlock_bh(&sctp_lock);
388 return -1; 387 return -1;
389 } 388 }
390 389
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
396 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), 395 ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
397 sizeof(_inithdr), &_inithdr); 396 sizeof(_inithdr), &_inithdr);
398 if (ih == NULL) { 397 if (ih == NULL) {
399 WRITE_UNLOCK(&sctp_lock); 398 write_unlock_bh(&sctp_lock);
400 return -1; 399 return -1;
401 } 400 }
402 DEBUGP("Setting vtag %x for dir %d\n", 401 DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
405 } 404 }
406 405
407 conntrack->proto.sctp.state = newconntrack; 406 conntrack->proto.sctp.state = newconntrack;
408 WRITE_UNLOCK(&sctp_lock); 407 write_unlock_bh(&sctp_lock);
409 } 408 }
410 409
411 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); 410 ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b4..809dfed766d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
36#include <linux/netfilter_ipv4.h> 36#include <linux/netfilter_ipv4.h>
37#include <linux/netfilter_ipv4/ip_conntrack.h> 37#include <linux/netfilter_ipv4/ip_conntrack.h>
38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 38#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
39#include <linux/netfilter_ipv4/lockhelp.h>
40 39
41#if 0 40#if 0
42#define DEBUGP printk 41#define DEBUGP printk
@@ -46,7 +45,7 @@
46#endif 45#endif
47 46
48/* Protects conntrack->proto.tcp */ 47/* Protects conntrack->proto.tcp */
49static DECLARE_RWLOCK(tcp_lock); 48static DEFINE_RWLOCK(tcp_lock);
50 49
51/* "Be conservative in what you do, 50/* "Be conservative in what you do,
52 be liberal in what you accept from others." 51 be liberal in what you accept from others."
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
330{ 329{
331 enum tcp_conntrack state; 330 enum tcp_conntrack state;
332 331
333 READ_LOCK(&tcp_lock); 332 read_lock_bh(&tcp_lock);
334 state = conntrack->proto.tcp.state; 333 state = conntrack->proto.tcp.state;
335 READ_UNLOCK(&tcp_lock); 334 read_unlock_bh(&tcp_lock);
336 335
337 return seq_printf(s, "%s ", tcp_conntrack_names[state]); 336 return seq_printf(s, "%s ", tcp_conntrack_names[state]);
338} 337}
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
738 737
739 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); 738 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
740 739
741 WRITE_LOCK(&tcp_lock); 740 write_lock_bh(&tcp_lock);
742 /* 741 /*
743 * We have to worry for the ack in the reply packet only... 742 * We have to worry for the ack in the reply packet only...
744 */ 743 */
745 if (after(end, conntrack->proto.tcp.seen[dir].td_end)) 744 if (after(end, conntrack->proto.tcp.seen[dir].td_end))
746 conntrack->proto.tcp.seen[dir].td_end = end; 745 conntrack->proto.tcp.seen[dir].td_end = end;
747 conntrack->proto.tcp.last_end = end; 746 conntrack->proto.tcp.last_end = end;
748 WRITE_UNLOCK(&tcp_lock); 747 write_unlock_bh(&tcp_lock);
749 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " 748 DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
750 "receiver end=%u maxend=%u maxwin=%u scale=%i\n", 749 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
751 sender->td_end, sender->td_maxend, sender->td_maxwin, 750 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
857 sizeof(_tcph), &_tcph); 856 sizeof(_tcph), &_tcph);
858 BUG_ON(th == NULL); 857 BUG_ON(th == NULL);
859 858
860 WRITE_LOCK(&tcp_lock); 859 write_lock_bh(&tcp_lock);
861 old_state = conntrack->proto.tcp.state; 860 old_state = conntrack->proto.tcp.state;
862 dir = CTINFO2DIR(ctinfo); 861 dir = CTINFO2DIR(ctinfo);
863 index = get_conntrack_index(th); 862 index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
879 * that the client cannot but retransmit its SYN and 878 * that the client cannot but retransmit its SYN and
880 * thus initiate a clean new session. 879 * thus initiate a clean new session.
881 */ 880 */
882 WRITE_UNLOCK(&tcp_lock); 881 write_unlock_bh(&tcp_lock);
883 if (LOG_INVALID(IPPROTO_TCP)) 882 if (LOG_INVALID(IPPROTO_TCP))
884 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 883 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
885 "ip_ct_tcp: killing out of sync session "); 884 "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
894 conntrack->proto.tcp.last_end = 893 conntrack->proto.tcp.last_end =
895 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); 894 segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
896 895
897 WRITE_UNLOCK(&tcp_lock); 896 write_unlock_bh(&tcp_lock);
898 if (LOG_INVALID(IPPROTO_TCP)) 897 if (LOG_INVALID(IPPROTO_TCP))
899 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 898 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
900 "ip_ct_tcp: invalid packet ignored "); 899 "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
904 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", 903 DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
905 dir, get_conntrack_index(th), 904 dir, get_conntrack_index(th),
906 old_state); 905 old_state);
907 WRITE_UNLOCK(&tcp_lock); 906 write_unlock_bh(&tcp_lock);
908 if (LOG_INVALID(IPPROTO_TCP)) 907 if (LOG_INVALID(IPPROTO_TCP))
909 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 908 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
910 "ip_ct_tcp: invalid state "); 909 "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
918 conntrack->proto.tcp.seen[dir].td_end)) { 917 conntrack->proto.tcp.seen[dir].td_end)) {
919 /* Attempt to reopen a closed connection. 918 /* Attempt to reopen a closed connection.
920 * Delete this connection and look up again. */ 919 * Delete this connection and look up again. */
921 WRITE_UNLOCK(&tcp_lock); 920 write_unlock_bh(&tcp_lock);
922 if (del_timer(&conntrack->timeout)) 921 if (del_timer(&conntrack->timeout))
923 conntrack->timeout.function((unsigned long) 922 conntrack->timeout.function((unsigned long)
924 conntrack); 923 conntrack);
925 return -NF_REPEAT; 924 return -NF_REPEAT;
926 } else { 925 } else {
927 WRITE_UNLOCK(&tcp_lock); 926 write_unlock_bh(&tcp_lock);
928 if (LOG_INVALID(IPPROTO_TCP)) 927 if (LOG_INVALID(IPPROTO_TCP))
929 nf_log_packet(PF_INET, 0, skb, NULL, NULL, 928 nf_log_packet(PF_INET, 0, skb, NULL, NULL,
930 "ip_ct_tcp: invalid SYN"); 929 "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
949 948
950 if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 949 if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
951 skb, iph, th)) { 950 skb, iph, th)) {
952 WRITE_UNLOCK(&tcp_lock); 951 write_unlock_bh(&tcp_lock);
953 return -NF_ACCEPT; 952 return -NF_ACCEPT;
954 } 953 }
955 in_window: 954 in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
972 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans 971 timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
973 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans 972 && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
974 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; 973 ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
975 WRITE_UNLOCK(&tcp_lock); 974 write_unlock_bh(&tcp_lock);
976 975
977 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { 976 if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
978 /* If only reply is a RST, we can consider ourselves not to 977 /* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a224623..8c1eaba098d4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
120 * and moreover root might send raw packets. 120 * and moreover root might send raw packets.
121 * FIXME: Source route IP option packets --RR */ 121 * FIXME: Source route IP option packets --RR */
122 if (hooknum == NF_IP_PRE_ROUTING 122 if (hooknum == NF_IP_PRE_ROUTING
123 && skb->ip_summed != CHECKSUM_UNNECESSARY
123 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, 124 && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
124 skb->ip_summed == CHECKSUM_HW ? skb->csum 125 skb->ip_summed == CHECKSUM_HW ? skb->csum
125 : skb_checksum(skb, iph->ihl*4, udplen, 0))) { 126 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index bc59f7b39805..42dc95102873 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
28#include <net/checksum.h> 28#include <net/checksum.h>
29#include <net/ip.h> 29#include <net/ip.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> 35#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
119 119
120static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 120static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
121{ 121{
122 READ_LOCK(&ip_conntrack_lock); 122 read_lock_bh(&ip_conntrack_lock);
123 return ct_get_idx(seq, *pos); 123 return ct_get_idx(seq, *pos);
124} 124}
125 125
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
131 131
132static void ct_seq_stop(struct seq_file *s, void *v) 132static void ct_seq_stop(struct seq_file *s, void *v)
133{ 133{
134 READ_UNLOCK(&ip_conntrack_lock); 134 read_unlock_bh(&ip_conntrack_lock);
135} 135}
136 136
137static int ct_seq_show(struct seq_file *s, void *v) 137static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); 140 const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
141 struct ip_conntrack_protocol *proto; 141 struct ip_conntrack_protocol *proto;
142 142
143 MUST_BE_READ_LOCKED(&ip_conntrack_lock); 143 ASSERT_READ_LOCK(&ip_conntrack_lock);
144 IP_NF_ASSERT(conntrack); 144 IP_NF_ASSERT(conntrack);
145 145
146 /* we only want to print DIR_ORIGINAL */ 146 /* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
239 239
240 /* strange seq_file api calls stop even if we fail, 240 /* strange seq_file api calls stop even if we fail,
241 * thus we need to grab lock since stop unlocks */ 241 * thus we need to grab lock since stop unlocks */
242 READ_LOCK(&ip_conntrack_lock); 242 read_lock_bh(&ip_conntrack_lock);
243 243
244 if (list_empty(e)) 244 if (list_empty(e))
245 return NULL; 245 return NULL;
@@ -267,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
267 267
268static void exp_seq_stop(struct seq_file *s, void *v) 268static void exp_seq_stop(struct seq_file *s, void *v)
269{ 269{
270 READ_UNLOCK(&ip_conntrack_lock); 270 read_unlock_bh(&ip_conntrack_lock);
271} 271}
272 272
273static int exp_seq_show(struct seq_file *s, void *v) 273static int exp_seq_show(struct seq_file *s, void *v)
@@ -921,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
921{ 921{
922 int ret = 0; 922 int ret = 0;
923 923
924 WRITE_LOCK(&ip_conntrack_lock); 924 write_lock_bh(&ip_conntrack_lock);
925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { 925 if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
926 ret = -EBUSY; 926 ret = -EBUSY;
927 goto out; 927 goto out;
928 } 928 }
929 ip_ct_protos[proto->proto] = proto; 929 ip_ct_protos[proto->proto] = proto;
930 out: 930 out:
931 WRITE_UNLOCK(&ip_conntrack_lock); 931 write_unlock_bh(&ip_conntrack_lock);
932 return ret; 932 return ret;
933} 933}
934 934
935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) 935void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
936{ 936{
937 WRITE_LOCK(&ip_conntrack_lock); 937 write_lock_bh(&ip_conntrack_lock);
938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; 938 ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
939 WRITE_UNLOCK(&ip_conntrack_lock); 939 write_unlock_bh(&ip_conntrack_lock);
940 940
941 /* Somebody could be still looking at the proto in bh. */ 941 /* Somebody could be still looking at the proto in bh. */
942 synchronize_net(); 942 synchronize_net();
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0dd..739b6dde1c82 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
22#include <linux/udp.h> 22#include <linux/udp.h>
23#include <linux/jhash.h> 23#include <linux/jhash.h>
24 24
25#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 25#define ASSERT_READ_LOCK(x)
26#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 26#define ASSERT_WRITE_LOCK(x)
27 27
28#include <linux/netfilter_ipv4/ip_conntrack.h> 28#include <linux/netfilter_ipv4/ip_conntrack.h>
29#include <linux/netfilter_ipv4/ip_conntrack_core.h> 29#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
41#define DEBUGP(format, args...) 41#define DEBUGP(format, args...)
42#endif 42#endif
43 43
44DECLARE_RWLOCK(ip_nat_lock); 44DEFINE_RWLOCK(ip_nat_lock);
45 45
46/* Calculated at init based on memory size */ 46/* Calculated at init based on memory size */
47static unsigned int ip_nat_htable_size; 47static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
65 if (!(conn->status & IPS_NAT_DONE_MASK)) 65 if (!(conn->status & IPS_NAT_DONE_MASK))
66 return; 66 return;
67 67
68 WRITE_LOCK(&ip_nat_lock); 68 write_lock_bh(&ip_nat_lock);
69 list_del(&conn->nat.info.bysource); 69 list_del(&conn->nat.info.bysource);
70 WRITE_UNLOCK(&ip_nat_lock); 70 write_unlock_bh(&ip_nat_lock);
71} 71}
72 72
73/* We do checksum mangling, so if they were wrong before they're still 73/* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
142 unsigned int h = hash_by_src(tuple); 142 unsigned int h = hash_by_src(tuple);
143 struct ip_conntrack *ct; 143 struct ip_conntrack *ct;
144 144
145 READ_LOCK(&ip_nat_lock); 145 read_lock_bh(&ip_nat_lock);
146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) { 146 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
147 if (same_src(ct, tuple)) { 147 if (same_src(ct, tuple)) {
148 /* Copy source part from reply tuple. */ 148 /* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
151 result->dst = tuple->dst; 151 result->dst = tuple->dst;
152 152
153 if (in_range(result, range)) { 153 if (in_range(result, range)) {
154 READ_UNLOCK(&ip_nat_lock); 154 read_unlock_bh(&ip_nat_lock);
155 return 1; 155 return 1;
156 } 156 }
157 } 157 }
158 } 158 }
159 READ_UNLOCK(&ip_nat_lock); 159 read_unlock_bh(&ip_nat_lock);
160 return 0; 160 return 0;
161} 161}
162 162
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
297 unsigned int srchash 297 unsigned int srchash
298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] 298 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
299 .tuple); 299 .tuple);
300 WRITE_LOCK(&ip_nat_lock); 300 write_lock_bh(&ip_nat_lock);
301 list_add(&info->bysource, &bysource[srchash]); 301 list_add(&info->bysource, &bysource[srchash]);
302 WRITE_UNLOCK(&ip_nat_lock); 302 write_unlock_bh(&ip_nat_lock);
303 } 303 }
304 304
305 /* It's done. */ 305 /* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 476
477 WRITE_LOCK(&ip_nat_lock); 477 write_lock_bh(&ip_nat_lock);
478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { 478 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
479 ret = -EBUSY; 479 ret = -EBUSY;
480 goto out; 480 goto out;
481 } 481 }
482 ip_nat_protos[proto->protonum] = proto; 482 ip_nat_protos[proto->protonum] = proto;
483 out: 483 out:
484 WRITE_UNLOCK(&ip_nat_lock); 484 write_unlock_bh(&ip_nat_lock);
485 return ret; 485 return ret;
486} 486}
487 487
488/* Noone stores the protocol anywhere; simply delete it. */ 488/* Noone stores the protocol anywhere; simply delete it. */
489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) 489void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
490{ 490{
491 WRITE_LOCK(&ip_nat_lock); 491 write_lock_bh(&ip_nat_lock);
492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; 492 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
493 WRITE_UNLOCK(&ip_nat_lock); 493 write_unlock_bh(&ip_nat_lock);
494 494
495 /* Someone could be still looking at the proto in a bh. */ 495 /* Someone could be still looking at the proto in a bh. */
496 synchronize_net(); 496 synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
509 return -ENOMEM; 509 return -ENOMEM;
510 510
511 /* Sew in builtin protocols. */ 511 /* Sew in builtin protocols. */
512 WRITE_LOCK(&ip_nat_lock); 512 write_lock_bh(&ip_nat_lock);
513 for (i = 0; i < MAX_IP_NAT_PROTO; i++) 513 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
514 ip_nat_protos[i] = &ip_nat_unknown_protocol; 514 ip_nat_protos[i] = &ip_nat_unknown_protocol;
515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; 515 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; 516 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; 517 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
518 WRITE_UNLOCK(&ip_nat_lock); 518 write_unlock_bh(&ip_nat_lock);
519 519
520 for (i = 0; i < ip_nat_htable_size; i++) { 520 for (i = 0; i < ip_nat_htable_size; i++) {
521 INIT_LIST_HEAD(&bysource[i]); 521 INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c01..158f34f32c04 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
28#include <net/tcp.h> 28#include <net/tcp.h>
29#include <net/udp.h> 29#include <net/udp.h>
30 30
31#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 31#define ASSERT_READ_LOCK(x)
32#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 32#define ASSERT_WRITE_LOCK(x)
33 33
34#include <linux/netfilter_ipv4/ip_conntrack.h> 34#include <linux/netfilter_ipv4/ip_conntrack.h>
35#include <linux/netfilter_ipv4/ip_conntrack_helper.h> 35#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
47#define DUMP_OFFSET(x) 47#define DUMP_OFFSET(x)
48#endif 48#endif
49 49
50static DECLARE_LOCK(ip_nat_seqofs_lock); 50static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
51 51
52/* Setup TCP sequence correction given this change at this sequence */ 52/* Setup TCP sequence correction given this change at this sequence */
53static inline void 53static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
70 DEBUGP("ip_nat_resize_packet: Seq_offset before: "); 70 DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
71 DUMP_OFFSET(this_way); 71 DUMP_OFFSET(this_way);
72 72
73 LOCK_BH(&ip_nat_seqofs_lock); 73 spin_lock_bh(&ip_nat_seqofs_lock);
74 74
75 /* SYN adjust. If it's uninitialized, or this is after last 75 /* SYN adjust. If it's uninitialized, or this is after last
76 * correction, record it: we don't handle more than one 76 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
82 this_way->offset_before = this_way->offset_after; 82 this_way->offset_before = this_way->offset_after;
83 this_way->offset_after += sizediff; 83 this_way->offset_after += sizediff;
84 } 84 }
85 UNLOCK_BH(&ip_nat_seqofs_lock); 85 spin_unlock_bh(&ip_nat_seqofs_lock);
86 86
87 DEBUGP("ip_nat_resize_packet: Seq_offset after: "); 87 DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
88 DUMP_OFFSET(this_way); 88 DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
142 /* Transfer socket to new skb. */ 142 /* Transfer socket to new skb. */
143 if ((*pskb)->sk) 143 if ((*pskb)->sk)
144 skb_set_owner_w(nskb, (*pskb)->sk); 144 skb_set_owner_w(nskb, (*pskb)->sk);
145#ifdef CONFIG_NETFILTER_DEBUG
146 nskb->nf_debug = (*pskb)->nf_debug;
147#endif
148 kfree_skb(*pskb); 145 kfree_skb(*pskb);
149 *pskb = nskb; 146 *pskb = nskb;
150 return 1; 147 return 1;
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a24..60d70fa41a15 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
19#include <net/route.h> 19#include <net/route.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21 21
22#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 22#define ASSERT_READ_LOCK(x)
23#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 23#define ASSERT_WRITE_LOCK(x)
24 24
25#include <linux/netfilter_ipv4/ip_tables.h> 25#include <linux/netfilter_ipv4/ip_tables.h>
26#include <linux/netfilter_ipv4/ip_nat.h> 26#include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b33..bc59d0d6e89e 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33 33
34#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) 34#define ASSERT_READ_LOCK(x)
35#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) 35#define ASSERT_WRITE_LOCK(x)
36 36
37#include <linux/netfilter_ipv4/ip_nat.h> 37#include <linux/netfilter_ipv4/ip_nat.h>
38#include <linux/netfilter_ipv4/ip_nat_rule.h> 38#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -373,7 +373,6 @@ static int init_or_cleanup(int init)
373 cleanup_rule_init: 373 cleanup_rule_init:
374 ip_nat_rule_cleanup(); 374 ip_nat_rule_cleanup();
375 cleanup_nothing: 375 cleanup_nothing:
376 MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
377 return ret; 376 return ret;
378} 377}
379 378
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b8496..c88dfcd38c56 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
67/* Must have mutex */ 67/* Must have mutex */
68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 68#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) 69#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
70#include <linux/netfilter_ipv4/lockhelp.h>
71#include <linux/netfilter_ipv4/listhelp.h> 70#include <linux/netfilter_ipv4/listhelp.h>
72 71
73#if 0 72#if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc73..dc4362b57cfa 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,6 @@
29#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32#include <linux/netfilter_ipv4/lockhelp.h>
33 32
34#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.6"
35 34
@@ -41,6 +40,8 @@
41#define DEBUGP 40#define DEBUGP
42#endif 41#endif
43 42
43#define ASSERT_READ_LOCK(x)
44
44MODULE_LICENSE("GPL"); 45MODULE_LICENSE("GPL");
45MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); 46MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
46MODULE_DESCRIPTION("iptables target for CLUSTERIP"); 47MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
67 68
68/* clusterip_lock protects the clusterip_configs list _AND_ the configurable 69/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
69 * data within all structurses (num_local_nodes, local_nodes[]) */ 70 * data within all structurses (num_local_nodes, local_nodes[]) */
70static DECLARE_RWLOCK(clusterip_lock); 71static DEFINE_RWLOCK(clusterip_lock);
71 72
72#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
73static struct file_operations clusterip_proc_fops; 74static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
82static inline void 83static inline void
83clusterip_config_put(struct clusterip_config *c) { 84clusterip_config_put(struct clusterip_config *c) {
84 if (atomic_dec_and_test(&c->refcount)) { 85 if (atomic_dec_and_test(&c->refcount)) {
85 WRITE_LOCK(&clusterip_lock); 86 write_lock_bh(&clusterip_lock);
86 list_del(&c->list); 87 list_del(&c->list);
87 WRITE_UNLOCK(&clusterip_lock); 88 write_unlock_bh(&clusterip_lock);
88 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 89 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
89 dev_put(c->dev); 90 dev_put(c->dev);
90 kfree(c); 91 kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
97{ 98{
98 struct list_head *pos; 99 struct list_head *pos;
99 100
100 MUST_BE_READ_LOCKED(&clusterip_lock); 101 ASSERT_READ_LOCK(&clusterip_lock);
101 list_for_each(pos, &clusterip_configs) { 102 list_for_each(pos, &clusterip_configs) {
102 struct clusterip_config *c = list_entry(pos, 103 struct clusterip_config *c = list_entry(pos,
103 struct clusterip_config, list); 104 struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
114{ 115{
115 struct clusterip_config *c; 116 struct clusterip_config *c;
116 117
117 READ_LOCK(&clusterip_lock); 118 read_lock_bh(&clusterip_lock);
118 c = __clusterip_config_find(clusterip); 119 c = __clusterip_config_find(clusterip);
119 if (!c) { 120 if (!c) {
120 READ_UNLOCK(&clusterip_lock); 121 read_unlock_bh(&clusterip_lock);
121 return NULL; 122 return NULL;
122 } 123 }
123 atomic_inc(&c->refcount); 124 atomic_inc(&c->refcount);
124 READ_UNLOCK(&clusterip_lock); 125 read_unlock_bh(&clusterip_lock);
125 126
126 return c; 127 return c;
127} 128}
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
160 c->pde->data = c; 161 c->pde->data = c;
161#endif 162#endif
162 163
163 WRITE_LOCK(&clusterip_lock); 164 write_lock_bh(&clusterip_lock);
164 list_add(&c->list, &clusterip_configs); 165 list_add(&c->list, &clusterip_configs);
165 WRITE_UNLOCK(&clusterip_lock); 166 write_unlock_bh(&clusterip_lock);
166 167
167 return c; 168 return c;
168} 169}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
172{ 173{
173 int i; 174 int i;
174 175
175 WRITE_LOCK(&clusterip_lock); 176 write_lock_bh(&clusterip_lock);
176 177
177 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES 178 if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
178 || nodenum > CLUSTERIP_MAX_NODES) { 179 || nodenum > CLUSTERIP_MAX_NODES) {
179 WRITE_UNLOCK(&clusterip_lock); 180 write_unlock_bh(&clusterip_lock);
180 return 1; 181 return 1;
181 } 182 }
182 183
183 /* check if we alrady have this number in our array */ 184 /* check if we alrady have this number in our array */
184 for (i = 0; i < c->num_local_nodes; i++) { 185 for (i = 0; i < c->num_local_nodes; i++) {
185 if (c->local_nodes[i] == nodenum) { 186 if (c->local_nodes[i] == nodenum) {
186 WRITE_UNLOCK(&clusterip_lock); 187 write_unlock_bh(&clusterip_lock);
187 return 1; 188 return 1;
188 } 189 }
189 } 190 }
190 191
191 c->local_nodes[c->num_local_nodes++] = nodenum; 192 c->local_nodes[c->num_local_nodes++] = nodenum;
192 193
193 WRITE_UNLOCK(&clusterip_lock); 194 write_unlock_bh(&clusterip_lock);
194 return 0; 195 return 0;
195} 196}
196 197
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
199{ 200{
200 int i; 201 int i;
201 202
202 WRITE_LOCK(&clusterip_lock); 203 write_lock_bh(&clusterip_lock);
203 204
204 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { 205 if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
205 WRITE_UNLOCK(&clusterip_lock); 206 write_unlock_bh(&clusterip_lock);
206 return 1; 207 return 1;
207 } 208 }
208 209
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
211 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); 212 int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
212 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); 213 memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
213 c->num_local_nodes--; 214 c->num_local_nodes--;
214 WRITE_UNLOCK(&clusterip_lock); 215 write_unlock_bh(&clusterip_lock);
215 return 0; 216 return 0;
216 } 217 }
217 } 218 }
218 219
219 WRITE_UNLOCK(&clusterip_lock); 220 write_unlock_bh(&clusterip_lock);
220 return 1; 221 return 1;
221} 222}
222 223
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
286{ 287{
287 int i; 288 int i;
288 289
289 READ_LOCK(&clusterip_lock); 290 read_lock_bh(&clusterip_lock);
290 291
291 if (config->num_local_nodes == 0) { 292 if (config->num_local_nodes == 0) {
292 READ_UNLOCK(&clusterip_lock); 293 read_unlock_bh(&clusterip_lock);
293 return 0; 294 return 0;
294 } 295 }
295 296
296 for (i = 0; i < config->num_local_nodes; i++) { 297 for (i = 0; i < config->num_local_nodes; i++) {
297 if (config->local_nodes[i] == hash) { 298 if (config->local_nodes[i] == hash) {
298 READ_UNLOCK(&clusterip_lock); 299 read_unlock_bh(&clusterip_lock);
299 return 1; 300 return 1;
300 } 301 }
301 } 302 }
302 303
303 READ_UNLOCK(&clusterip_lock); 304 read_unlock_bh(&clusterip_lock);
304 305
305 return 0; 306 return 0;
306} 307}
@@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
578 struct clusterip_config *c = pde->data; 579 struct clusterip_config *c = pde->data;
579 unsigned int *nodeidx; 580 unsigned int *nodeidx;
580 581
581 READ_LOCK(&clusterip_lock); 582 read_lock_bh(&clusterip_lock);
582 if (*pos >= c->num_local_nodes) 583 if (*pos >= c->num_local_nodes)
583 return NULL; 584 return NULL;
584 585
@@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
608{ 609{
609 kfree(v); 610 kfree(v);
610 611
611 READ_UNLOCK(&clusterip_lock); 612 read_unlock_bh(&clusterip_lock);
612} 613}
613 614
614static int clusterip_seq_show(struct seq_file *s, void *v) 615static int clusterip_seq_show(struct seq_file *s, void *v)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c36..91e74502c3d3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
33#endif 33#endif
34 34
35/* Lock protects masq region inside conntrack */ 35/* Lock protects masq region inside conntrack */
36static DECLARE_RWLOCK(masq_lock); 36static DEFINE_RWLOCK(masq_lock);
37 37
38/* FIXME: Multiple targets. --RR */ 38/* FIXME: Multiple targets. --RR */
39static int 39static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
103 return NF_DROP; 103 return NF_DROP;
104 } 104 }
105 105
106 WRITE_LOCK(&masq_lock); 106 write_lock_bh(&masq_lock);
107 ct->nat.masq_index = out->ifindex; 107 ct->nat.masq_index = out->ifindex;
108 WRITE_UNLOCK(&masq_lock); 108 write_unlock_bh(&masq_lock);
109 109
110 /* Transfer from original range. */ 110 /* Transfer from original range. */
111 newrange = ((struct ip_nat_range) 111 newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
122{ 122{
123 int ret; 123 int ret;
124 124
125 READ_LOCK(&masq_lock); 125 read_lock_bh(&masq_lock);
126 ret = (i->nat.masq_index == (int)(long)ifindex); 126 ret = (i->nat.masq_index == (int)(long)ifindex);
127 READ_UNLOCK(&masq_lock); 127 read_unlock_bh(&masq_lock);
128 128
129 return ret; 129 return ret;
130} 130}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d64979286..915696446020 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
104static void send_reset(struct sk_buff *oldskb, int hook) 104static void send_reset(struct sk_buff *oldskb, int hook)
105{ 105{
106 struct sk_buff *nskb; 106 struct sk_buff *nskb;
107 struct iphdr *iph = oldskb->nh.iph;
107 struct tcphdr _otcph, *oth, *tcph; 108 struct tcphdr _otcph, *oth, *tcph;
108 struct rtable *rt; 109 struct rtable *rt;
109 u_int16_t tmp_port; 110 u_int16_t tmp_port;
110 u_int32_t tmp_addr; 111 u_int32_t tmp_addr;
112 unsigned int tcplen;
111 int needs_ack; 113 int needs_ack;
112 int hh_len; 114 int hh_len;
113 115
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
124 if (oth->rst) 126 if (oth->rst)
125 return; 127 return;
126 128
127 /* FIXME: Check checksum --RR */ 129 /* Check checksum */
130 tcplen = oldskb->len - iph->ihl * 4;
131 if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
132 (hook == NF_IP_LOCAL_IN &&
133 oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
134 csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
135 oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
136 skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
137 return;
138
128 if ((rt = route_reverse(oldskb, oth, hook)) == NULL) 139 if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
129 return; 140 return;
130 141
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16cd..52a0076302a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
56#include <linux/netfilter.h> 56#include <linux/netfilter.h>
57#include <linux/netfilter_ipv4/ip_tables.h> 57#include <linux/netfilter_ipv4/ip_tables.h>
58#include <linux/netfilter_ipv4/ipt_ULOG.h> 58#include <linux/netfilter_ipv4/ipt_ULOG.h>
59#include <linux/netfilter_ipv4/lockhelp.h>
60#include <net/sock.h> 59#include <net/sock.h>
61#include <linux/bitops.h> 60#include <linux/bitops.h>
62 61
@@ -99,8 +98,8 @@ typedef struct {
99 98
100static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ 99static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
101 100
102static struct sock *nflognl; /* our socket */ 101static struct sock *nflognl; /* our socket */
103static DECLARE_LOCK(ulog_lock); /* spinlock */ 102static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
104 103
105/* send one ulog_buff_t to userspace */ 104/* send one ulog_buff_t to userspace */
106static void ulog_send(unsigned int nlgroupnum) 105static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
135 134
136 /* lock to protect against somebody modifying our structure 135 /* lock to protect against somebody modifying our structure
137 * from ipt_ulog_target at the same time */ 136 * from ipt_ulog_target at the same time */
138 LOCK_BH(&ulog_lock); 137 spin_lock_bh(&ulog_lock);
139 ulog_send(data); 138 ulog_send(data);
140 UNLOCK_BH(&ulog_lock); 139 spin_unlock_bh(&ulog_lock);
141} 140}
142 141
143static struct sk_buff *ulog_alloc_skb(unsigned int size) 142static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
193 192
194 ub = &ulog_buffers[groupnum]; 193 ub = &ulog_buffers[groupnum];
195 194
196 LOCK_BH(&ulog_lock); 195 spin_lock_bh(&ulog_lock);
197 196
198 if (!ub->skb) { 197 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size))) 198 if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
278 ulog_send(groupnum); 277 ulog_send(groupnum);
279 } 278 }
280 279
281 UNLOCK_BH(&ulog_lock); 280 spin_unlock_bh(&ulog_lock);
282 281
283 return; 282 return;
284 283
@@ -288,7 +287,7 @@ nlmsg_failure:
288alloc_failure: 287alloc_failure:
289 PRINTR("ipt_ULOG: Error building netlink message\n"); 288 PRINTR("ipt_ULOG: Error building netlink message\n");
290 289
291 UNLOCK_BH(&ulog_lock); 290 spin_unlock_bh(&ulog_lock);
292} 291}
293 292
294static unsigned int ipt_ulog_target(struct sk_buff **pskb, 293static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd77..564b49bfebcf 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/netfilter_ipv4/ip_tables.h> 38#include <linux/netfilter_ipv4/ip_tables.h>
39#include <linux/netfilter_ipv4/ipt_hashlimit.h> 39#include <linux/netfilter_ipv4/ipt_hashlimit.h>
40#include <linux/netfilter_ipv4/lockhelp.h>
41 40
42/* FIXME: this is just for IP_NF_ASSERRT */ 41/* FIXME: this is just for IP_NF_ASSERRT */
43#include <linux/netfilter_ipv4/ip_conntrack.h> 42#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
92 struct hlist_head hash[0]; /* hashtable itself */ 91 struct hlist_head hash[0]; /* hashtable itself */
93}; 92};
94 93
95static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ 94static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
96static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ 95static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
97static HLIST_HEAD(hashlimit_htables); 96static HLIST_HEAD(hashlimit_htables);
98static kmem_cache_t *hashlimit_cachep; 97static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
233 hinfo->timer.function = htable_gc; 232 hinfo->timer.function = htable_gc;
234 add_timer(&hinfo->timer); 233 add_timer(&hinfo->timer);
235 234
236 LOCK_BH(&hashlimit_lock); 235 spin_lock_bh(&hashlimit_lock);
237 hlist_add_head(&hinfo->node, &hashlimit_htables); 236 hlist_add_head(&hinfo->node, &hashlimit_htables);
238 UNLOCK_BH(&hashlimit_lock); 237 spin_unlock_bh(&hashlimit_lock);
239 238
240 return 0; 239 return 0;
241} 240}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
301 struct ipt_hashlimit_htable *hinfo; 300 struct ipt_hashlimit_htable *hinfo;
302 struct hlist_node *pos; 301 struct hlist_node *pos;
303 302
304 LOCK_BH(&hashlimit_lock); 303 spin_lock_bh(&hashlimit_lock);
305 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { 304 hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
306 if (!strcmp(name, hinfo->pde->name)) { 305 if (!strcmp(name, hinfo->pde->name)) {
307 atomic_inc(&hinfo->use); 306 atomic_inc(&hinfo->use);
308 UNLOCK_BH(&hashlimit_lock); 307 spin_unlock_bh(&hashlimit_lock);
309 return hinfo; 308 return hinfo;
310 } 309 }
311 } 310 }
312 UNLOCK_BH(&hashlimit_lock); 311 spin_unlock_bh(&hashlimit_lock);
313 312
314 return NULL; 313 return NULL;
315} 314}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
317static void htable_put(struct ipt_hashlimit_htable *hinfo) 316static void htable_put(struct ipt_hashlimit_htable *hinfo)
318{ 317{
319 if (atomic_dec_and_test(&hinfo->use)) { 318 if (atomic_dec_and_test(&hinfo->use)) {
320 LOCK_BH(&hashlimit_lock); 319 spin_lock_bh(&hashlimit_lock);
321 hlist_del(&hinfo->node); 320 hlist_del(&hinfo->node);
322 UNLOCK_BH(&hashlimit_lock); 321 spin_unlock_bh(&hashlimit_lock);
323 htable_destroy(hinfo); 322 htable_destroy(hinfo);
324 } 323 }
325} 324}
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d3..3e7dd014de43 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
53 return ret; 53 return ret;
54 } 54 }
55 55
56 READ_LOCK(&ip_conntrack_lock); 56 read_lock_bh(&ip_conntrack_lock);
57 if (!ct->master->helper) { 57 if (!ct->master->helper) {
58 DEBUGP("ipt_helper: master ct %p has no helper\n", 58 DEBUGP("ipt_helper: master ct %p has no helper\n",
59 exp->expectant); 59 exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
69 ret ^= !strncmp(ct->master->helper->name, info->name, 69 ret ^= !strncmp(ct->master->helper->name, info->name,
70 strlen(ct->master->helper->name)); 70 strlen(ct->master->helper->name));
71out_unlock: 71out_unlock:
72 READ_UNLOCK(&ip_conntrack_lock); 72 read_unlock_bh(&ip_conntrack_lock);
73 return ret; 73 return ret;
74} 74}
75 75
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392ae5769..66620a95942a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
33 struct dst_entry *dst = skb->dst; 33 struct dst_entry *dst = skb->dst;
34 struct xfrm_state *x = dst->xfrm; 34 struct xfrm_state *x = dst->xfrm;
35 struct iphdr *iph, *top_iph; 35 struct iphdr *iph, *top_iph;
36 int flags;
36 37
37 iph = skb->nh.iph; 38 iph = skb->nh.iph;
38 skb->h.ipiph = iph; 39 skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
51 52
52 /* DS disclosed */ 53 /* DS disclosed */
53 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); 54 top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
54 if (x->props.flags & XFRM_STATE_NOECN) 55
56 flags = x->props.flags;
57 if (flags & XFRM_STATE_NOECN)
55 IP_ECN_clear(top_iph); 58 IP_ECN_clear(top_iph);
56 59
57 top_iph->frag_off = iph->frag_off & htons(IP_DF); 60 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
61 0 : (iph->frag_off & htons(IP_DF));
58 if (!top_iph->frag_off) 62 if (!top_iph->frag_off)
59 __ip_select_ident(top_iph, dst, 0); 63 __ip_select_ident(top_iph, dst, 0);
60 64
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e83853f..050611d7a967 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
7 * 7 *
8 */ 8 */
9 9
10#include <net/ip.h>
10#include <net/xfrm.h> 11#include <net/xfrm.h>
11#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
12#include <linux/ipsec.h> 13#include <linux/ipsec.h>
13 14
14static struct xfrm_state_afinfo xfrm4_state_afinfo; 15static struct xfrm_state_afinfo xfrm4_state_afinfo;
15 16
17static int xfrm4_init_flags(struct xfrm_state *x)
18{
19 if (ipv4_config.no_pmtu_disc)
20 x->props.flags |= XFRM_STATE_NOPMTUDISC;
21 return 0;
22}
23
16static void 24static void
17__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 25__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
18 struct xfrm_tmpl *tmpl, 26 struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
109static struct xfrm_state_afinfo xfrm4_state_afinfo = { 117static struct xfrm_state_afinfo xfrm4_state_afinfo = {
110 .family = AF_INET, 118 .family = AF_INET,
111 .lock = RW_LOCK_UNLOCKED, 119 .lock = RW_LOCK_UNLOCKED,
120 .init_flags = xfrm4_init_flags,
112 .init_tempsel = __xfrm4_init_tempsel, 121 .init_tempsel = __xfrm4_init_tempsel,
113 .state_lookup = __xfrm4_state_lookup, 122 .state_lookup = __xfrm4_state_lookup,
114 .find_acq = __xfrm4_find_acq, 123 .find_acq = __xfrm4_find_acq,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f585f6..e1fe360ed27a 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -84,7 +84,7 @@ static void ipip_err(struct sk_buff *skb, u32 info)
84 handler->err_handler(skb, &arg); 84 handler->err_handler(skb, &arg);
85} 85}
86 86
87static int ipip_init_state(struct xfrm_state *x, void *args) 87static int ipip_init_state(struct xfrm_state *x)
88{ 88{
89 if (!x->props.mode) 89 if (!x->props.mode)
90 return -EINVAL; 90 return -EINVAL;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47a30c3188ea..14f5c53235fe 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -695,7 +695,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
695 695
696 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 696 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
697 if (onlink == 0) { 697 if (onlink == 0) {
698 ip6_del_rt(rt, NULL, NULL); 698 ip6_del_rt(rt, NULL, NULL, NULL);
699 rt = NULL; 699 rt = NULL;
700 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { 700 } else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
701 rt->rt6i_expires = expires; 701 rt->rt6i_expires = expires;
@@ -1340,7 +1340,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
1340 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) 1340 if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
1341 rtmsg.rtmsg_flags |= RTF_NONEXTHOP; 1341 rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
1342 1342
1343 ip6_route_add(&rtmsg, NULL, NULL); 1343 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1344} 1344}
1345 1345
1346/* Create "default" multicast route to the interface */ 1346/* Create "default" multicast route to the interface */
@@ -1357,7 +1357,7 @@ static void addrconf_add_mroute(struct net_device *dev)
1357 rtmsg.rtmsg_ifindex = dev->ifindex; 1357 rtmsg.rtmsg_ifindex = dev->ifindex;
1358 rtmsg.rtmsg_flags = RTF_UP; 1358 rtmsg.rtmsg_flags = RTF_UP;
1359 rtmsg.rtmsg_type = RTMSG_NEWROUTE; 1359 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1360 ip6_route_add(&rtmsg, NULL, NULL); 1360 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1361} 1361}
1362 1362
1363static void sit_route_add(struct net_device *dev) 1363static void sit_route_add(struct net_device *dev)
@@ -1374,7 +1374,7 @@ static void sit_route_add(struct net_device *dev)
1374 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; 1374 rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
1375 rtmsg.rtmsg_ifindex = dev->ifindex; 1375 rtmsg.rtmsg_ifindex = dev->ifindex;
1376 1376
1377 ip6_route_add(&rtmsg, NULL, NULL); 1377 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1378} 1378}
1379 1379
1380static void addrconf_add_lroute(struct net_device *dev) 1380static void addrconf_add_lroute(struct net_device *dev)
@@ -1467,7 +1467,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
1467 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { 1467 if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
1468 if (rt->rt6i_flags&RTF_EXPIRES) { 1468 if (rt->rt6i_flags&RTF_EXPIRES) {
1469 if (valid_lft == 0) { 1469 if (valid_lft == 0) {
1470 ip6_del_rt(rt, NULL, NULL); 1470 ip6_del_rt(rt, NULL, NULL, NULL);
1471 rt = NULL; 1471 rt = NULL;
1472 } else { 1472 } else {
1473 rt->rt6i_expires = rt_expires; 1473 rt->rt6i_expires = rt_expires;
@@ -3094,7 +3094,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3094 switch (event) { 3094 switch (event) {
3095 case RTM_NEWADDR: 3095 case RTM_NEWADDR:
3096 dst_hold(&ifp->rt->u.dst); 3096 dst_hold(&ifp->rt->u.dst);
3097 if (ip6_ins_rt(ifp->rt, NULL, NULL)) 3097 if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
3098 dst_release(&ifp->rt->u.dst); 3098 dst_release(&ifp->rt->u.dst);
3099 if (ifp->idev->cnf.forwarding) 3099 if (ifp->idev->cnf.forwarding)
3100 addrconf_join_anycast(ifp); 3100 addrconf_join_anycast(ifp);
@@ -3104,7 +3104,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3104 addrconf_leave_anycast(ifp); 3104 addrconf_leave_anycast(ifp);
3105 addrconf_leave_solict(ifp->idev, &ifp->addr); 3105 addrconf_leave_solict(ifp->idev, &ifp->addr);
3106 dst_hold(&ifp->rt->u.dst); 3106 dst_hold(&ifp->rt->u.dst);
3107 if (ip6_del_rt(ifp->rt, NULL, NULL)) 3107 if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
3108 dst_free(&ifp->rt->u.dst); 3108 dst_free(&ifp->rt->u.dst);
3109 else 3109 else
3110 dst_release(&ifp->rt->u.dst); 3110 dst_release(&ifp->rt->u.dst);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index e3ecf626cbf7..986fdfdccbcd 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -339,7 +339,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
339 xfrm_state_put(x); 339 xfrm_state_put(x);
340} 340}
341 341
342static int ah6_init_state(struct xfrm_state *x, void *args) 342static int ah6_init_state(struct xfrm_state *x)
343{ 343{
344 struct ah_data *ahp = NULL; 344 struct ah_data *ahp = NULL;
345 struct xfrm_algo_desc *aalg_desc; 345 struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3cca2e..6b7294047238 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
337 write_unlock_bh(&idev->lock); 337 write_unlock_bh(&idev->lock);
338 338
339 dst_hold(&rt->u.dst); 339 dst_hold(&rt->u.dst);
340 if (ip6_ins_rt(rt, NULL, NULL)) 340 if (ip6_ins_rt(rt, NULL, NULL, NULL))
341 dst_release(&rt->u.dst); 341 dst_release(&rt->u.dst);
342 342
343 addrconf_join_solict(dev, &aca->aca_addr); 343 addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
380 addrconf_leave_solict(idev, &aca->aca_addr); 380 addrconf_leave_solict(idev, &aca->aca_addr);
381 381
382 dst_hold(&aca->aca_rt->u.dst); 382 dst_hold(&aca->aca_rt->u.dst);
383 if (ip6_del_rt(aca->aca_rt, NULL, NULL)) 383 if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
384 dst_free(&aca->aca_rt->u.dst); 384 dst_free(&aca->aca_rt->u.dst);
385 else 385 else
386 dst_release(&aca->aca_rt->u.dst); 386 dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index be7095d6babe..324db62515a2 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -296,7 +296,7 @@ static void esp6_destroy(struct xfrm_state *x)
296 kfree(esp); 296 kfree(esp);
297} 297}
298 298
299static int esp6_init_state(struct xfrm_state *x, void *args) 299static int esp6_init_state(struct xfrm_state *x)
300{ 300{
301 struct esp_data *esp = NULL; 301 struct esp_data *esp = NULL;
302 302
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b75abb..1b354aa97934 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -394,7 +394,7 @@ insert_above:
394 */ 394 */
395 395
396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 396static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
397 struct nlmsghdr *nlh) 397 struct nlmsghdr *nlh, struct netlink_skb_parms *req)
398{ 398{
399 struct rt6_info *iter = NULL; 399 struct rt6_info *iter = NULL;
400 struct rt6_info **ins; 400 struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
449 *ins = rt; 449 *ins = rt;
450 rt->rt6i_node = fn; 450 rt->rt6i_node = fn;
451 atomic_inc(&rt->rt6i_ref); 451 atomic_inc(&rt->rt6i_ref);
452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh); 452 inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
453 rt6_stats.fib_rt_entries++; 453 rt6_stats.fib_rt_entries++;
454 454
455 if ((fn->fn_flags & RTN_RTINFO) == 0) { 455 if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
479 * with source addr info in sub-trees 479 * with source addr info in sub-trees
480 */ 480 */
481 481
482int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 482int fib6_add(struct fib6_node *root, struct rt6_info *rt,
483 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
483{ 484{
484 struct fib6_node *fn; 485 struct fib6_node *fn;
485 int err = -ENOMEM; 486 int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
552 } 553 }
553#endif 554#endif
554 555
555 err = fib6_add_rt2node(fn, rt, nlh); 556 err = fib6_add_rt2node(fn, rt, nlh, req);
556 557
557 if (err == 0) { 558 if (err == 0) {
558 fib6_start_gc(rt); 559 fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
859} 860}
860 861
861static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, 862static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
862 struct nlmsghdr *nlh, void *_rtattr) 863 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
863{ 864{
864 struct fib6_walker_t *w; 865 struct fib6_walker_t *w;
865 struct rt6_info *rt = *rtp; 866 struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
915 if (atomic_read(&rt->rt6i_ref) != 1) BUG(); 916 if (atomic_read(&rt->rt6i_ref) != 1) BUG();
916 } 917 }
917 918
918 inet6_rt_notify(RTM_DELROUTE, rt, nlh); 919 inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
919 rt6_release(rt); 920 rt6_release(rt);
920} 921}
921 922
922int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 923int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
923{ 924{
924 struct fib6_node *fn = rt->rt6i_node; 925 struct fib6_node *fn = rt->rt6i_node;
925 struct rt6_info **rtp; 926 struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
944 945
945 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { 946 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
946 if (*rtp == rt) { 947 if (*rtp == rt) {
947 fib6_del_route(fn, rtp, nlh, _rtattr); 948 fib6_del_route(fn, rtp, nlh, _rtattr, req);
948 return 0; 949 return 0;
949 } 950 }
950 } 951 }
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
1073 res = c->func(rt, c->arg); 1074 res = c->func(rt, c->arg);
1074 if (res < 0) { 1075 if (res < 0) {
1075 w->leaf = rt; 1076 w->leaf = rt;
1076 res = fib6_del(rt, NULL, NULL); 1077 res = fib6_del(rt, NULL, NULL, NULL);
1077 if (res) { 1078 if (res) {
1078#if RT6_DEBUG >= 2 1079#if RT6_DEBUG >= 2
1079 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); 1080 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a53586804..06e7cdaeedc5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -484,9 +484,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
484 to->nf_bridge = from->nf_bridge; 484 to->nf_bridge = from->nf_bridge;
485 nf_bridge_get(to->nf_bridge); 485 nf_bridge_get(to->nf_bridge);
486#endif 486#endif
487#ifdef CONFIG_NETFILTER_DEBUG
488 to->nf_debug = from->nf_debug;
489#endif
490#endif 487#endif
491} 488}
492 489
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 6cde5310cd76..423feb46ccc0 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -234,14 +234,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
234 t->props.mode = 1; 234 t->props.mode = 1;
235 memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); 235 memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
236 236
237 t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family); 237 if (xfrm_init_state(t))
238 if (t->type == NULL)
239 goto error; 238 goto error;
240 239
241 if (t->type->init_state(t, NULL))
242 goto error;
243
244 t->km.state = XFRM_STATE_VALID;
245 atomic_set(&t->tunnel_users, 1); 240 atomic_set(&t->tunnel_users, 1);
246 241
247out: 242out:
@@ -420,7 +415,7 @@ static void ipcomp6_destroy(struct xfrm_state *x)
420 xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); 415 xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
421} 416}
422 417
423static int ipcomp6_init_state(struct xfrm_state *x, void *args) 418static int ipcomp6_init_state(struct xfrm_state *x)
424{ 419{
425 int err; 420 int err;
426 struct ipcomp_data *ipcd; 421 struct ipcomp_data *ipcd;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86be662..f3ef4c38d315 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -423,11 +423,12 @@ done:
423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; 423 psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, 424 retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
425 &psin6->sin6_addr); 425 &psin6->sin6_addr);
426 if (retv) 426 /* prior join w/ different source is ok */
427 if (retv && retv != -EADDRINUSE)
427 break; 428 break;
428 omode = MCAST_INCLUDE; 429 omode = MCAST_INCLUDE;
429 add = 1; 430 add = 1;
430 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 431 } else /* MCAST_LEAVE_SOURCE_GROUP */ {
431 omode = MCAST_INCLUDE; 432 omode = MCAST_INCLUDE;
432 add = 0; 433 add = 0;
433 } 434 }
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6f50a9..562fcd14fdea 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
188 if (!ipv6_addr_is_multicast(addr)) 188 if (!ipv6_addr_is_multicast(addr))
189 return -EINVAL; 189 return -EINVAL;
190 190
191 read_lock_bh(&ipv6_sk_mc_lock);
192 for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
193 if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
194 ipv6_addr_equal(&mc_lst->addr, addr)) {
195 read_unlock_bh(&ipv6_sk_mc_lock);
196 return -EADDRINUSE;
197 }
198 }
199 read_unlock_bh(&ipv6_sk_mc_lock);
200
191 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); 201 mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
192 202
193 if (mc_lst == NULL) 203 if (mc_lst == NULL)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
349 struct ipv6_pinfo *inet6 = inet6_sk(sk); 359 struct ipv6_pinfo *inet6 = inet6_sk(sk);
350 struct ip6_sf_socklist *psl; 360 struct ip6_sf_socklist *psl;
351 int i, j, rv; 361 int i, j, rv;
362 int leavegroup = 0;
352 int err; 363 int err;
353 364
354 if (pgsr->gsr_group.ss_family != AF_INET6 || 365 if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,6 +379,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
368 379
369 err = -EADDRNOTAVAIL; 380 err = -EADDRNOTAVAIL;
370 381
382 read_lock_bh(&ipv6_sk_mc_lock);
371 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 383 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
372 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) 384 if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
373 continue; 385 continue;
@@ -401,6 +413,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
401 if (rv) /* source not found */ 413 if (rv) /* source not found */
402 goto done; 414 goto done;
403 415
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
418 leavegroup = 1;
419 goto done;
420 }
421
404 /* update the interface filter */ 422 /* update the interface filter */
405 ip6_mc_del_src(idev, group, omode, 1, source, 1); 423 ip6_mc_del_src(idev, group, omode, 1, source, 1);
406 424
@@ -453,9 +471,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
453 /* update the interface list */ 471 /* update the interface list */
454 ip6_mc_add_src(idev, group, omode, 1, source, 1); 472 ip6_mc_add_src(idev, group, omode, 1, source, 1);
455done: 473done:
474 read_unlock_bh(&ipv6_sk_mc_lock);
456 read_unlock_bh(&idev->lock); 475 read_unlock_bh(&idev->lock);
457 in6_dev_put(idev); 476 in6_dev_put(idev);
458 dev_put(dev); 477 dev_put(dev);
478 if (leavegroup)
479 return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
459 return err; 480 return err;
460} 481}
461 482
@@ -1280,15 +1301,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1280 return NULL; 1301 return NULL;
1281 1302
1282 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1303 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1283 if (dev->hard_header) {
1284 unsigned char ha[MAX_ADDR_LEN];
1285
1286 ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
1287 if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
1288 kfree_skb(skb);
1289 return NULL;
1290 }
1291 }
1292 1304
1293 if (ipv6_get_lladdr(dev, &addr_buf)) { 1305 if (ipv6_get_lladdr(dev, &addr_buf)) {
1294 /* <draft-ietf-magma-mld-source-05.txt>: 1306 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1312,6 +1324,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
1312 return skb; 1324 return skb;
1313} 1325}
1314 1326
1327static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
1328{
1329 struct net_device *dev = skb->dev;
1330
1331 if (dev->hard_header) {
1332 unsigned char ha[MAX_ADDR_LEN];
1333 int err;
1334
1335 ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
1336 err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
1337 if (err < 0) {
1338 kfree_skb(skb);
1339 return err;
1340 }
1341 }
1342 return dev_queue_xmit(skb);
1343}
1344
1345static inline int mld_dev_queue_xmit(struct sk_buff *skb)
1346{
1347 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
1348 mld_dev_queue_xmit2);
1349}
1350
1315static void mld_sendpack(struct sk_buff *skb) 1351static void mld_sendpack(struct sk_buff *skb)
1316{ 1352{
1317 struct ipv6hdr *pip6 = skb->nh.ipv6h; 1353 struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1329,7 +1365,7 @@ static void mld_sendpack(struct sk_buff *skb)
1329 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, 1365 pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
1330 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); 1366 IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
1331 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1367 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1332 dev_queue_xmit); 1368 mld_dev_queue_xmit);
1333 if (!err) { 1369 if (!err) {
1334 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); 1370 ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
1335 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); 1371 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1635,12 +1671,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1635 } 1671 }
1636 1672
1637 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 1673 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1638 if (dev->hard_header) {
1639 unsigned char ha[MAX_ADDR_LEN];
1640 ndisc_mc_map(snd_addr, ha, dev, 1);
1641 if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
1642 goto out;
1643 }
1644 1674
1645 if (ipv6_get_lladdr(dev, &addr_buf)) { 1675 if (ipv6_get_lladdr(dev, &addr_buf)) {
1646 /* <draft-ietf-magma-mld-source-05.txt>: 1676 /* <draft-ietf-magma-mld-source-05.txt>:
@@ -1668,7 +1698,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1668 idev = in6_dev_get(skb->dev); 1698 idev = in6_dev_get(skb->dev);
1669 1699
1670 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, 1700 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
1671 dev_queue_xmit); 1701 mld_dev_queue_xmit);
1672 if (!err) { 1702 if (!err) {
1673 if (type == ICMPV6_MGM_REDUCTION) 1703 if (type == ICMPV6_MGM_REDUCTION)
1674 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); 1704 ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1682,10 +1712,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
1682 if (likely(idev != NULL)) 1712 if (likely(idev != NULL))
1683 in6_dev_put(idev); 1713 in6_dev_put(idev);
1684 return; 1714 return;
1685
1686out:
1687 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1688 kfree_skb(skb);
1689} 1715}
1690 1716
1691static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, 1717static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4e9edc..7ae72d4c9bd2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
955 struct rt6_info *rt; 955 struct rt6_info *rt;
956 rt = rt6_get_dflt_router(saddr, dev); 956 rt = rt6_get_dflt_router(saddr, dev);
957 if (rt) 957 if (rt)
958 ip6_del_rt(rt, NULL, NULL); 958 ip6_del_rt(rt, NULL, NULL, NULL);
959 } 959 }
960 960
961out: 961out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
1096 1096
1097 if (rt && lifetime == 0) { 1097 if (rt && lifetime == 0) {
1098 neigh_clone(neigh); 1098 neigh_clone(neigh);
1099 ip6_del_rt(rt, NULL, NULL); 1099 ip6_del_rt(rt, NULL, NULL, NULL);
1100 rt = NULL; 1100 rt = NULL;
1101 } 1101 }
1102 1102
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276fdd5f..73034511c8db 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
71/* Must have mutex */ 71/* Must have mutex */
72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 72#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) 73#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
74#include <linux/netfilter_ipv4/lockhelp.h>
75#include <linux/netfilter_ipv4/listhelp.h> 74#include <linux/netfilter_ipv4/listhelp.h>
76 75
77#if 0 76#if 0
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d0185d19..c44685e391b7 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum,
366 const char *level_string, 366 const char *level_string,
367 const char *prefix) 367 const char *prefix)
368{ 368{
369 struct ipv6hdr *ipv6h = skb->nh.ipv6h;
370
371 spin_lock_bh(&log_lock); 369 spin_lock_bh(&log_lock);
372 printk(level_string); 370 printk(level_string);
373 printk("%sIN=%s OUT=%s ", 371 printk("%sIN=%s OUT=%s ",
@@ -377,39 +375,25 @@ ip6t_log_packet(unsigned int hooknum,
377 if (in && !out) { 375 if (in && !out) {
378 /* MAC logging for input chain only. */ 376 /* MAC logging for input chain only. */
379 printk("MAC="); 377 printk("MAC=");
380 if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { 378 if (skb->dev && skb->dev->hard_header_len &&
381 if (skb->dev->type != ARPHRD_SIT){ 379 skb->mac.raw != skb->nh.raw) {
382 int i; 380 unsigned char *p = skb->mac.raw;
383 unsigned char *p = skb->mac.raw; 381 int i;
384 for (i = 0; i < skb->dev->hard_header_len; i++,p++) 382
385 printk("%02x%c", *p, 383 if (skb->dev->type == ARPHRD_SIT &&
386 i==skb->dev->hard_header_len - 1 384 (p -= ETH_HLEN) < skb->head)
387 ? ' ':':'); 385 p = NULL;
388 } else { 386
389 int i; 387 if (p != NULL)
390 unsigned char *p = skb->mac.raw; 388 for (i = 0; i < skb->dev->hard_header_len; i++)
391 if ( p - (ETH_ALEN*2+2) > skb->head ){ 389 printk("%02x", p[i]);
392 p -= (ETH_ALEN+2); 390 printk(" ");
393 for (i = 0; i < (ETH_ALEN); i++,p++) 391
394 printk("%02x%s", *p, 392 if (skb->dev->type == ARPHRD_SIT) {
395 i == ETH_ALEN-1 ? "->" : ":"); 393 struct iphdr *iph = (struct iphdr *)skb->mac.raw;
396 p -= (ETH_ALEN*2); 394 printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
397 for (i = 0; i < (ETH_ALEN); i++,p++) 395 NIPQUAD(iph->saddr),
398 printk("%02x%c", *p, 396 NIPQUAD(iph->daddr));
399 i == ETH_ALEN-1 ? ' ' : ':');
400 }
401
402 if ((skb->dev->addr_len == 4) &&
403 skb->dev->hard_header_len > 20){
404 printk("TUNNEL=");
405 p = skb->mac.raw + 12;
406 for (i = 0; i < 4; i++,p++)
407 printk("%3d%s", *p,
408 i == 3 ? "->" : ".");
409 for (i = 0; i < 4; i++,p++)
410 printk("%3d%c", *p,
411 i == 3 ? ' ' : '.');
412 }
413 } 397 }
414 } else 398 } else
415 printk(" "); 399 printk(" ");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407beaf790..c2982efd14af 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
129 .hook = ip6t_hook, 129 .hook = ip6t_hook,
130 .pf = PF_INET6, 130 .pf = PF_INET6,
131 .hooknum = NF_IP6_PRE_ROUTING, 131 .hooknum = NF_IP6_PRE_ROUTING,
132 .priority = NF_IP6_PRI_FIRST 132 .priority = NF_IP6_PRI_FIRST,
133 .owner = THIS_MODULE,
133 }, 134 },
134 { 135 {
135 .hook = ip6t_hook, 136 .hook = ip6t_hook,
136 .pf = PF_INET6, 137 .pf = PF_INET6,
137 .hooknum = NF_IP6_LOCAL_OUT, 138 .hooknum = NF_IP6_LOCAL_OUT,
138 .priority = NF_IP6_PRI_FIRST 139 .priority = NF_IP6_PRI_FIRST,
140 .owner = THIS_MODULE,
139 }, 141 },
140}; 142};
141 143
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1f5b226c3573..878789b3122d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
384 be destroyed. 384 be destroyed.
385 */ 385 */
386 386
387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 387int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
388 void *_rtattr, struct netlink_skb_parms *req)
388{ 389{
389 int err; 390 int err;
390 391
391 write_lock_bh(&rt6_lock); 392 write_lock_bh(&rt6_lock);
392 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); 393 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
393 write_unlock_bh(&rt6_lock); 394 write_unlock_bh(&rt6_lock);
394 395
395 return err; 396 return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
400 */ 401 */
401 402
402static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, 403static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
403 struct in6_addr *saddr) 404 struct in6_addr *saddr, struct netlink_skb_parms *req)
404{ 405{
405 int err; 406 int err;
406 struct rt6_info *rt; 407 struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
432 433
433 dst_hold(&rt->u.dst); 434 dst_hold(&rt->u.dst);
434 435
435 err = ip6_ins_rt(rt, NULL, NULL); 436 err = ip6_ins_rt(rt, NULL, NULL, req);
436 if (err == 0) 437 if (err == 0)
437 return rt; 438 return rt;
438 439
@@ -491,7 +492,8 @@ restart:
491 read_unlock_bh(&rt6_lock); 492 read_unlock_bh(&rt6_lock);
492 493
493 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, 494 nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
494 &skb->nh.ipv6h->saddr); 495 &skb->nh.ipv6h->saddr,
496 &NETLINK_CB(skb));
495 497
496 dst_release(&rt->u.dst); 498 dst_release(&rt->u.dst);
497 rt = nrt; 499 rt = nrt;
@@ -551,7 +553,7 @@ restart:
551 dst_hold(&rt->u.dst); 553 dst_hold(&rt->u.dst);
552 read_unlock_bh(&rt6_lock); 554 read_unlock_bh(&rt6_lock);
553 555
554 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); 556 nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
555 557
556 dst_release(&rt->u.dst); 558 dst_release(&rt->u.dst);
557 rt = nrt; 559 rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
598 600
599 if (rt) { 601 if (rt) {
600 if (rt->rt6i_flags & RTF_CACHE) 602 if (rt->rt6i_flags & RTF_CACHE)
601 ip6_del_rt(rt, NULL, NULL); 603 ip6_del_rt(rt, NULL, NULL, NULL);
602 else 604 else
603 dst_release(dst); 605 dst_release(dst);
604 } 606 }
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
787 * 789 *
788 */ 790 */
789 791
790int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 792int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
793 void *_rtattr, struct netlink_skb_parms *req)
791{ 794{
792 int err; 795 int err;
793 struct rtmsg *r; 796 struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
974 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); 977 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
975 rt->u.dst.dev = dev; 978 rt->u.dst.dev = dev;
976 rt->rt6i_idev = idev; 979 rt->rt6i_idev = idev;
977 return ip6_ins_rt(rt, nlh, _rtattr); 980 return ip6_ins_rt(rt, nlh, _rtattr, req);
978 981
979out: 982out:
980 if (dev) 983 if (dev)
@@ -986,7 +989,7 @@ out:
986 return err; 989 return err;
987} 990}
988 991
989int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) 992int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
990{ 993{
991 int err; 994 int err;
992 995
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
994 997
995 rt6_reset_dflt_pointer(NULL); 998 rt6_reset_dflt_pointer(NULL);
996 999
997 err = fib6_del(rt, nlh, _rtattr); 1000 err = fib6_del(rt, nlh, _rtattr, req);
998 dst_release(&rt->u.dst); 1001 dst_release(&rt->u.dst);
999 1002
1000 write_unlock_bh(&rt6_lock); 1003 write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
1002 return err; 1005 return err;
1003} 1006}
1004 1007
1005static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) 1008static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1006{ 1009{
1007 struct fib6_node *fn; 1010 struct fib6_node *fn;
1008 struct rt6_info *rt; 1011 struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
1029 dst_hold(&rt->u.dst); 1032 dst_hold(&rt->u.dst);
1030 read_unlock_bh(&rt6_lock); 1033 read_unlock_bh(&rt6_lock);
1031 1034
1032 return ip6_del_rt(rt, nlh, _rtattr); 1035 return ip6_del_rt(rt, nlh, _rtattr, req);
1033 } 1036 }
1034 } 1037 }
1035 read_unlock_bh(&rt6_lock); 1038 read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
1136 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); 1139 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1137 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); 1140 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1138 1141
1139 if (ip6_ins_rt(nrt, NULL, NULL)) 1142 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1140 goto out; 1143 goto out;
1141 1144
1142 if (rt->rt6i_flags&RTF_CACHE) { 1145 if (rt->rt6i_flags&RTF_CACHE) {
1143 ip6_del_rt(rt, NULL, NULL); 1146 ip6_del_rt(rt, NULL, NULL, NULL);
1144 return; 1147 return;
1145 } 1148 }
1146 1149
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1204 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 1207 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1205 */ 1208 */
1206 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { 1209 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
1207 nrt = rt6_cow(rt, daddr, saddr); 1210 nrt = rt6_cow(rt, daddr, saddr, NULL);
1208 if (!nrt->u.dst.error) { 1211 if (!nrt->u.dst.error) {
1209 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1212 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1210 if (allfrag) 1213 if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1232 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; 1235 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1233 if (allfrag) 1236 if (allfrag)
1234 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; 1237 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1235 ip6_ins_rt(nrt, NULL, NULL); 1238 ip6_ins_rt(nrt, NULL, NULL, NULL);
1236 } 1239 }
1237 1240
1238out: 1241out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1305 1308
1306 rtmsg.rtmsg_ifindex = dev->ifindex; 1309 rtmsg.rtmsg_ifindex = dev->ifindex;
1307 1310
1308 ip6_route_add(&rtmsg, NULL, NULL); 1311 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1309 return rt6_get_dflt_router(gwaddr, dev); 1312 return rt6_get_dflt_router(gwaddr, dev);
1310} 1313}
1311 1314
@@ -1323,7 +1326,7 @@ restart:
1323 1326
1324 read_unlock_bh(&rt6_lock); 1327 read_unlock_bh(&rt6_lock);
1325 1328
1326 ip6_del_rt(rt, NULL, NULL); 1329 ip6_del_rt(rt, NULL, NULL, NULL);
1327 1330
1328 goto restart; 1331 goto restart;
1329 } 1332 }
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1349 rtnl_lock(); 1352 rtnl_lock();
1350 switch (cmd) { 1353 switch (cmd) {
1351 case SIOCADDRT: 1354 case SIOCADDRT:
1352 err = ip6_route_add(&rtmsg, NULL, NULL); 1355 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1353 break; 1356 break;
1354 case SIOCDELRT: 1357 case SIOCDELRT:
1355 err = ip6_route_del(&rtmsg, NULL, NULL); 1358 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1356 break; 1359 break;
1357 default: 1360 default:
1358 err = -EINVAL; 1361 err = -EINVAL;
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1546 1549
1547 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1550 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1548 return -EINVAL; 1551 return -EINVAL;
1549 return ip6_route_del(&rtmsg, nlh, arg); 1552 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1550} 1553}
1551 1554
1552int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) 1555int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1556 1559
1557 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) 1560 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1558 return -EINVAL; 1561 return -EINVAL;
1559 return ip6_route_add(&rtmsg, nlh, arg); 1562 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1560} 1563}
1561 1564
1562struct rt6_rtnl_dump_arg 1565struct rt6_rtnl_dump_arg
@@ -1566,12 +1569,9 @@ struct rt6_rtnl_dump_arg
1566}; 1569};
1567 1570
1568static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, 1571static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1569 struct in6_addr *dst, 1572 struct in6_addr *dst, struct in6_addr *src,
1570 struct in6_addr *src, 1573 int iif, int type, u32 pid, u32 seq,
1571 int iif, 1574 int prefix, unsigned int flags)
1572 int type, u32 pid, u32 seq,
1573 struct nlmsghdr *in_nlh, int prefix,
1574 unsigned int flags)
1575{ 1575{
1576 struct rtmsg *rtm; 1576 struct rtmsg *rtm;
1577 struct nlmsghdr *nlh; 1577 struct nlmsghdr *nlh;
@@ -1585,10 +1585,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 if (!pid && in_nlh) {
1589 pid = in_nlh->nlmsg_pid;
1590 }
1591
1592 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags); 1588 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1593 rtm = NLMSG_DATA(nlh); 1589 rtm = NLMSG_DATA(nlh);
1594 rtm->rtm_family = AF_INET6; 1590 rtm->rtm_family = AF_INET6;
@@ -1675,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1675 1671
1676 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 1672 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1677 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, 1673 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1678 NULL, prefix, NLM_F_MULTI); 1674 prefix, NLM_F_MULTI);
1679} 1675}
1680 1676
1681static int fib6_dump_node(struct fib6_walker_t *w) 1677static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1823,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1823 &fl.fl6_dst, &fl.fl6_src, 1819 &fl.fl6_dst, &fl.fl6_src,
1824 iif, 1820 iif,
1825 RTM_NEWROUTE, NETLINK_CB(in_skb).pid, 1821 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1826 nlh->nlmsg_seq, nlh, 0, 0); 1822 nlh->nlmsg_seq, 0, 0);
1827 if (err < 0) { 1823 if (err < 0) {
1828 err = -EMSGSIZE; 1824 err = -EMSGSIZE;
1829 goto out_free; 1825 goto out_free;
@@ -1839,17 +1835,25 @@ out_free:
1839 goto out; 1835 goto out;
1840} 1836}
1841 1837
1842void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) 1838void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
1839 struct netlink_skb_parms *req)
1843{ 1840{
1844 struct sk_buff *skb; 1841 struct sk_buff *skb;
1845 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); 1842 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1843 u32 pid = current->pid;
1844 u32 seq = 0;
1846 1845
1846 if (req)
1847 pid = req->pid;
1848 if (nlh)
1849 seq = nlh->nlmsg_seq;
1850
1847 skb = alloc_skb(size, gfp_any()); 1851 skb = alloc_skb(size, gfp_any());
1848 if (!skb) { 1852 if (!skb) {
1849 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); 1853 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
1850 return; 1854 return;
1851 } 1855 }
1852 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0, 0) < 0) { 1856 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1853 kfree_skb(skb); 1857 kfree_skb(skb);
1854 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); 1858 netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
1855 return; 1859 return;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index ffcadd68b951..60c26c87277e 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -466,7 +466,7 @@ static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
466 return; 466 return;
467} 467}
468 468
469static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) 469static int xfrm6_tunnel_init_state(struct xfrm_state *x)
470{ 470{
471 if (!x->props.mode) 471 if (!x->props.mode)
472 return -EINVAL; 472 return -EINVAL;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 98b72f2024ff..4879743b945a 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -690,6 +690,8 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
690 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; 690 sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
691 if (x->props.flags & XFRM_STATE_DECAP_DSCP) 691 if (x->props.flags & XFRM_STATE_DECAP_DSCP)
692 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; 692 sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
693 if (x->props.flags & XFRM_STATE_NOPMTUDISC)
694 sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
693 695
694 /* hard time */ 696 /* hard time */
695 if (hsc & 2) { 697 if (hsc & 2) {
@@ -974,6 +976,8 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
974 x->props.flags |= XFRM_STATE_NOECN; 976 x->props.flags |= XFRM_STATE_NOECN;
975 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) 977 if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
976 x->props.flags |= XFRM_STATE_DECAP_DSCP; 978 x->props.flags |= XFRM_STATE_DECAP_DSCP;
979 if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
980 x->props.flags |= XFRM_STATE_NOPMTUDISC;
977 981
978 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; 982 lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
979 if (lifetime != NULL) { 983 if (lifetime != NULL) {
@@ -1096,17 +1100,11 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
1096 } 1100 }
1097 } 1101 }
1098 1102
1099 x->type = xfrm_get_type(proto, x->props.family); 1103 err = xfrm_init_state(x);
1100 if (x->type == NULL) { 1104 if (err)
1101 err = -ENOPROTOOPT;
1102 goto out;
1103 }
1104 if (x->type->init_state(x, NULL)) {
1105 err = -EINVAL;
1106 goto out; 1105 goto out;
1107 } 1106
1108 x->km.seq = hdr->sadb_msg_seq; 1107 x->km.seq = hdr->sadb_msg_seq;
1109 x->km.state = XFRM_STATE_VALID;
1110 return x; 1108 return x;
1111 1109
1112out: 1110out:
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 663843d97a92..7ae6aa772dab 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -191,10 +191,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
191 asoc->last_cwr_tsn = asoc->ctsn_ack_point; 191 asoc->last_cwr_tsn = asoc->ctsn_ack_point;
192 asoc->unack_data = 0; 192 asoc->unack_data = 0;
193 193
194 SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n",
195 asoc->ep->debug_name,
196 asoc->ctsn_ack_point);
197
198 /* ADDIP Section 4.1 Asconf Chunk Procedures 194 /* ADDIP Section 4.1 Asconf Chunk Procedures
199 * 195 *
200 * When an endpoint has an ASCONF signaled change to be sent to the 196 * When an endpoint has an ASCONF signaled change to be sent to the
@@ -211,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
211 207
212 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
213 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
210 asoc->peer.transport_count = 0;
214 211
215 /* RFC 2960 5.1 Normal Establishment of an Association 212 /* RFC 2960 5.1 Normal Establishment of an Association
216 * 213 *
@@ -288,6 +285,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
288 285
289 asoc->base.malloced = 1; 286 asoc->base.malloced = 1;
290 SCTP_DBG_OBJCNT_INC(assoc); 287 SCTP_DBG_OBJCNT_INC(assoc);
288 SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
291 289
292 return asoc; 290 return asoc;
293 291
@@ -356,6 +354,8 @@ void sctp_association_free(struct sctp_association *asoc)
356 sctp_transport_free(transport); 354 sctp_transport_free(transport);
357 } 355 }
358 356
357 asoc->peer.transport_count = 0;
358
359 /* Free any cached ASCONF_ACK chunk. */ 359 /* Free any cached ASCONF_ACK chunk. */
360 if (asoc->addip_last_asconf_ack) 360 if (asoc->addip_last_asconf_ack)
361 sctp_chunk_free(asoc->addip_last_asconf_ack); 361 sctp_chunk_free(asoc->addip_last_asconf_ack);
@@ -400,7 +400,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
400 /* If the primary path is changing, assume that the 400 /* If the primary path is changing, assume that the
401 * user wants to use this new path. 401 * user wants to use this new path.
402 */ 402 */
403 if (transport->active) 403 if (transport->state != SCTP_INACTIVE)
404 asoc->peer.active_path = transport; 404 asoc->peer.active_path = transport;
405 405
406 /* 406 /*
@@ -428,10 +428,58 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
428 transport->cacc.next_tsn_at_change = asoc->next_tsn; 428 transport->cacc.next_tsn_at_change = asoc->next_tsn;
429} 429}
430 430
431/* Remove a transport from an association. */
432void sctp_assoc_rm_peer(struct sctp_association *asoc,
433 struct sctp_transport *peer)
434{
435 struct list_head *pos;
436 struct sctp_transport *transport;
437
438 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
439 " port: %d\n",
440 asoc,
441 (&peer->ipaddr),
442 peer->ipaddr.v4.sin_port);
443
444 /* If we are to remove the current retran_path, update it
445 * to the next peer before removing this peer from the list.
446 */
447 if (asoc->peer.retran_path == peer)
448 sctp_assoc_update_retran_path(asoc);
449
450 /* Remove this peer from the list. */
451 list_del(&peer->transports);
452
453 /* Get the first transport of asoc. */
454 pos = asoc->peer.transport_addr_list.next;
455 transport = list_entry(pos, struct sctp_transport, transports);
456
457 /* Update any entries that match the peer to be deleted. */
458 if (asoc->peer.primary_path == peer)
459 sctp_assoc_set_primary(asoc, transport);
460 if (asoc->peer.active_path == peer)
461 asoc->peer.active_path = transport;
462 if (asoc->peer.last_data_from == peer)
463 asoc->peer.last_data_from = transport;
464
465 /* If we remove the transport an INIT was last sent to, set it to
466 * NULL. Combined with the update of the retran path above, this
467 * will cause the next INIT to be sent to the next available
468 * transport, maintaining the cycle.
469 */
470 if (asoc->init_last_sent_to == peer)
471 asoc->init_last_sent_to = NULL;
472
473 asoc->peer.transport_count--;
474
475 sctp_transport_free(peer);
476}
477
431/* Add a transport address to an association. */ 478/* Add a transport address to an association. */
432struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
433 const union sctp_addr *addr, 480 const union sctp_addr *addr,
434 int gfp) 481 const int gfp,
482 const int peer_state)
435{ 483{
436 struct sctp_transport *peer; 484 struct sctp_transport *peer;
437 struct sctp_sock *sp; 485 struct sctp_sock *sp;
@@ -442,14 +490,25 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
442 /* AF_INET and AF_INET6 share common port field. */ 490 /* AF_INET and AF_INET6 share common port field. */
443 port = addr->v4.sin_port; 491 port = addr->v4.sin_port;
444 492
493 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
494 " port: %d state:%s\n",
495 asoc,
496 addr,
497 addr->v4.sin_port,
498 peer_state == SCTP_UNKNOWN?"UNKNOWN":"ACTIVE");
499
445 /* Set the port if it has not been set yet. */ 500 /* Set the port if it has not been set yet. */
446 if (0 == asoc->peer.port) 501 if (0 == asoc->peer.port)
447 asoc->peer.port = port; 502 asoc->peer.port = port;
448 503
449 /* Check to see if this is a duplicate. */ 504 /* Check to see if this is a duplicate. */
450 peer = sctp_assoc_lookup_paddr(asoc, addr); 505 peer = sctp_assoc_lookup_paddr(asoc, addr);
451 if (peer) 506 if (peer) {
507 if (peer_state == SCTP_ACTIVE &&
508 peer->state == SCTP_UNKNOWN)
509 peer->state = SCTP_ACTIVE;
452 return peer; 510 return peer;
511 }
453 512
454 peer = sctp_transport_new(addr, gfp); 513 peer = sctp_transport_new(addr, gfp);
455 if (!peer) 514 if (!peer)
@@ -516,8 +575,12 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
516 /* Set the transport's RTO.initial value */ 575 /* Set the transport's RTO.initial value */
517 peer->rto = asoc->rto_initial; 576 peer->rto = asoc->rto_initial;
518 577
578 /* Set the peer's active state. */
579 peer->state = peer_state;
580
519 /* Attach the remote transport to our asoc. */ 581 /* Attach the remote transport to our asoc. */
520 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list); 582 list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
583 asoc->peer.transport_count++;
521 584
522 /* If we do not yet have a primary path, set one. */ 585 /* If we do not yet have a primary path, set one. */
523 if (!asoc->peer.primary_path) { 586 if (!asoc->peer.primary_path) {
@@ -525,8 +588,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
525 asoc->peer.retran_path = peer; 588 asoc->peer.retran_path = peer;
526 } 589 }
527 590
528 if (asoc->peer.active_path == asoc->peer.retran_path) 591 if (asoc->peer.active_path == asoc->peer.retran_path) {
529 asoc->peer.retran_path = peer; 592 asoc->peer.retran_path = peer;
593 }
530 594
531 return peer; 595 return peer;
532} 596}
@@ -537,37 +601,16 @@ void sctp_assoc_del_peer(struct sctp_association *asoc,
537{ 601{
538 struct list_head *pos; 602 struct list_head *pos;
539 struct list_head *temp; 603 struct list_head *temp;
540 struct sctp_transport *peer = NULL;
541 struct sctp_transport *transport; 604 struct sctp_transport *transport;
542 605
543 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { 606 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
544 transport = list_entry(pos, struct sctp_transport, transports); 607 transport = list_entry(pos, struct sctp_transport, transports);
545 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { 608 if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
546 peer = transport; 609 /* Do book keeping for removing the peer and free it. */
547 list_del(pos); 610 sctp_assoc_rm_peer(asoc, transport);
548 break; 611 break;
549 } 612 }
550 } 613 }
551
552 /* The address we want delete is not in the association. */
553 if (!peer)
554 return;
555
556 /* Get the first transport of asoc. */
557 pos = asoc->peer.transport_addr_list.next;
558 transport = list_entry(pos, struct sctp_transport, transports);
559
560 /* Update any entries that match the peer to be deleted. */
561 if (asoc->peer.primary_path == peer)
562 sctp_assoc_set_primary(asoc, transport);
563 if (asoc->peer.active_path == peer)
564 asoc->peer.active_path = transport;
565 if (asoc->peer.retran_path == peer)
566 asoc->peer.retran_path = transport;
567 if (asoc->peer.last_data_from == peer)
568 asoc->peer.last_data_from = transport;
569
570 sctp_transport_free(peer);
571} 614}
572 615
573/* Lookup a transport by address. */ 616/* Lookup a transport by address. */
@@ -608,12 +651,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
608 /* Record the transition on the transport. */ 651 /* Record the transition on the transport. */
609 switch (command) { 652 switch (command) {
610 case SCTP_TRANSPORT_UP: 653 case SCTP_TRANSPORT_UP:
611 transport->active = SCTP_ACTIVE; 654 transport->state = SCTP_ACTIVE;
612 spc_state = SCTP_ADDR_AVAILABLE; 655 spc_state = SCTP_ADDR_AVAILABLE;
613 break; 656 break;
614 657
615 case SCTP_TRANSPORT_DOWN: 658 case SCTP_TRANSPORT_DOWN:
616 transport->active = SCTP_INACTIVE; 659 transport->state = SCTP_INACTIVE;
617 spc_state = SCTP_ADDR_UNREACHABLE; 660 spc_state = SCTP_ADDR_UNREACHABLE;
618 break; 661 break;
619 662
@@ -643,7 +686,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
643 list_for_each(pos, &asoc->peer.transport_addr_list) { 686 list_for_each(pos, &asoc->peer.transport_addr_list) {
644 t = list_entry(pos, struct sctp_transport, transports); 687 t = list_entry(pos, struct sctp_transport, transports);
645 688
646 if (!t->active) 689 if (t->state == SCTP_INACTIVE)
647 continue; 690 continue;
648 if (!first || t->last_time_heard > first->last_time_heard) { 691 if (!first || t->last_time_heard > first->last_time_heard) {
649 second = first; 692 second = first;
@@ -663,7 +706,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
663 * [If the primary is active but not most recent, bump the most 706 * [If the primary is active but not most recent, bump the most
664 * recently used transport.] 707 * recently used transport.]
665 */ 708 */
666 if (asoc->peer.primary_path->active && 709 if (asoc->peer.primary_path->state != SCTP_INACTIVE &&
667 first != asoc->peer.primary_path) { 710 first != asoc->peer.primary_path) {
668 second = first; 711 second = first;
669 first = asoc->peer.primary_path; 712 first = asoc->peer.primary_path;
@@ -958,7 +1001,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
958 transports); 1001 transports);
959 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr)) 1002 if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
960 sctp_assoc_add_peer(asoc, &trans->ipaddr, 1003 sctp_assoc_add_peer(asoc, &trans->ipaddr,
961 GFP_ATOMIC); 1004 GFP_ATOMIC, SCTP_ACTIVE);
962 } 1005 }
963 1006
964 asoc->ctsn_ack_point = asoc->next_tsn - 1; 1007 asoc->ctsn_ack_point = asoc->next_tsn - 1;
@@ -998,7 +1041,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
998 1041
999 /* Try to find an active transport. */ 1042 /* Try to find an active transport. */
1000 1043
1001 if (t->active) { 1044 if (t->state != SCTP_INACTIVE) {
1002 break; 1045 break;
1003 } else { 1046 } else {
1004 /* Keep track of the next transport in case 1047 /* Keep track of the next transport in case
@@ -1019,6 +1062,40 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
1019 } 1062 }
1020 1063
1021 asoc->peer.retran_path = t; 1064 asoc->peer.retran_path = t;
1065
1066 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
1067 " %p addr: ",
1068 " port: %d\n",
1069 asoc,
1070 (&t->ipaddr),
1071 t->ipaddr.v4.sin_port);
1072}
1073
1074/* Choose the transport for sending a INIT packet. */
1075struct sctp_transport *sctp_assoc_choose_init_transport(
1076 struct sctp_association *asoc)
1077{
1078 struct sctp_transport *t;
1079
1080 /* Use the retran path. If the last INIT was sent over the
1081 * retran path, update the retran path and use it.
1082 */
1083 if (!asoc->init_last_sent_to) {
1084 t = asoc->peer.active_path;
1085 } else {
1086 if (asoc->init_last_sent_to == asoc->peer.retran_path)
1087 sctp_assoc_update_retran_path(asoc);
1088 t = asoc->peer.retran_path;
1089 }
1090
1091 SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
1092 " %p addr: ",
1093 " port: %d\n",
1094 asoc,
1095 (&t->ipaddr),
1096 t->ipaddr.v4.sin_port);
1097
1098 return t;
1022} 1099}
1023 1100
1024/* Choose the transport for sending a SHUTDOWN packet. */ 1101/* Choose the transport for sending a SHUTDOWN packet. */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 334f61773e6d..2ec0320fac3b 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -134,7 +134,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
134 ep->last_key = ep->current_key = 0; 134 ep->last_key = ep->current_key = 0;
135 ep->key_changed_at = jiffies; 135 ep->key_changed_at = jiffies;
136 136
137 ep->debug_name = "unnamedEndpoint";
138 return ep; 137 return ep;
139} 138}
140 139
diff --git a/net/sctp/input.c b/net/sctp/input.c
index fffc880a646d..339f7acfdb64 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -353,7 +353,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
353 353
354 sctp_do_sm(SCTP_EVENT_T_OTHER, 354 sctp_do_sm(SCTP_EVENT_T_OTHER,
355 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), 355 SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
356 asoc->state, asoc->ep, asoc, NULL, 356 asoc->state, asoc->ep, asoc, t,
357 GFP_ATOMIC); 357 GFP_ATOMIC);
358 358
359} 359}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1b2d4adc4ddb..4eb81a1407b7 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -682,9 +682,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
682 682
683 if (!new_transport) { 683 if (!new_transport) {
684 new_transport = asoc->peer.active_path; 684 new_transport = asoc->peer.active_path;
685 } else if (!new_transport->active) { 685 } else if (new_transport->state == SCTP_INACTIVE) {
686 /* If the chunk is Heartbeat or Heartbeat Ack, 686 /* If the chunk is Heartbeat or Heartbeat Ack,
687 * send it to chunk->transport, even if it's 687 * send it to chunk->transport, even if it's
688 * inactive. 688 * inactive.
689 * 689 *
690 * 3.3.6 Heartbeat Acknowledgement: 690 * 3.3.6 Heartbeat Acknowledgement:
@@ -840,7 +840,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
840 * Otherwise, we want to use the active path. 840 * Otherwise, we want to use the active path.
841 */ 841 */
842 new_transport = chunk->transport; 842 new_transport = chunk->transport;
843 if (!new_transport || !new_transport->active) 843 if (!new_transport ||
844 new_transport->state == SCTP_INACTIVE)
844 new_transport = asoc->peer.active_path; 845 new_transport = asoc->peer.active_path;
845 846
846 /* Change packets if necessary. */ 847 /* Change packets if necessary. */
@@ -1454,7 +1455,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
1454 /* Mark the destination transport address as 1455 /* Mark the destination transport address as
1455 * active if it is not so marked. 1456 * active if it is not so marked.
1456 */ 1457 */
1457 if (!transport->active) { 1458 if (transport->state == SCTP_INACTIVE) {
1458 sctp_assoc_control_transport( 1459 sctp_assoc_control_transport(
1459 transport->asoc, 1460 transport->asoc,
1460 transport, 1461 transport,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 33ac8bf47b0e..5baed9bb7de5 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1830,7 +1830,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1830 * be a a better choice than any of the embedded addresses. 1830 * be a a better choice than any of the embedded addresses.
1831 */ 1831 */
1832 if (peer_addr) 1832 if (peer_addr)
1833 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp)) 1833 if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
1834 goto nomem; 1834 goto nomem;
1835 1835
1836 /* Process the initialization parameters. */ 1836 /* Process the initialization parameters. */
@@ -1841,6 +1841,14 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1841 goto clean_up; 1841 goto clean_up;
1842 } 1842 }
1843 1843
1844 /* Walk list of transports, removing transports in the UNKNOWN state. */
1845 list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
1846 transport = list_entry(pos, struct sctp_transport, transports);
1847 if (transport->state == SCTP_UNKNOWN) {
1848 sctp_assoc_rm_peer(asoc, transport);
1849 }
1850 }
1851
1844 /* The fixed INIT headers are always in network byte 1852 /* The fixed INIT headers are always in network byte
1845 * order. 1853 * order.
1846 */ 1854 */
@@ -1906,7 +1914,8 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1906 * stream sequence number shall be set to 0. 1914 * stream sequence number shall be set to 0.
1907 */ 1915 */
1908 1916
1909 /* Allocate storage for the negotiated streams if it is not a temporary * association. 1917 /* Allocate storage for the negotiated streams if it is not a temporary
1918 * association.
1910 */ 1919 */
1911 if (!asoc->temp) { 1920 if (!asoc->temp) {
1912 int assoc_id; 1921 int assoc_id;
@@ -1952,6 +1961,9 @@ clean_up:
1952 list_del_init(pos); 1961 list_del_init(pos);
1953 sctp_transport_free(transport); 1962 sctp_transport_free(transport);
1954 } 1963 }
1964
1965 asoc->peer.transport_count = 0;
1966
1955nomem: 1967nomem:
1956 return 0; 1968 return 0;
1957} 1969}
@@ -1995,7 +2007,7 @@ static int sctp_process_param(struct sctp_association *asoc,
1995 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0); 2007 af->from_addr_param(&addr, param.addr, asoc->peer.port, 0);
1996 scope = sctp_scope(peer_addr); 2008 scope = sctp_scope(peer_addr);
1997 if (sctp_in_scope(&addr, scope)) 2009 if (sctp_in_scope(&addr, scope))
1998 if (!sctp_assoc_add_peer(asoc, &addr, gfp)) 2010 if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_ACTIVE))
1999 return 0; 2011 return 0;
2000 break; 2012 break;
2001 2013
@@ -2396,7 +2408,7 @@ static __u16 sctp_process_asconf_param(struct sctp_association *asoc,
2396 * Due to Resource Shortage'. 2408 * Due to Resource Shortage'.
2397 */ 2409 */
2398 2410
2399 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC); 2411 peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_ACTIVE);
2400 if (!peer) 2412 if (!peer)
2401 return SCTP_ERROR_RSRC_LOW; 2413 return SCTP_ERROR_RSRC_LOW;
2402 2414
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f65fa441952f..778639db125a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -414,11 +414,13 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
414 */ 414 */
415 asoc->overall_error_count++; 415 asoc->overall_error_count++;
416 416
417 if (transport->active && 417 if (transport->state != SCTP_INACTIVE &&
418 (transport->error_count++ >= transport->max_retrans)) { 418 (transport->error_count++ >= transport->max_retrans)) {
419 SCTP_DEBUG_PRINTK("transport_strike: transport " 419 SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
420 "IP:%d.%d.%d.%d failed.\n", 420 " transport IP: port:%d failed.\n",
421 NIPQUAD(transport->ipaddr.v4.sin_addr)); 421 asoc,
422 (&transport->ipaddr),
423 transport->ipaddr.v4.sin_port);
422 sctp_assoc_control_transport(asoc, transport, 424 sctp_assoc_control_transport(asoc, transport,
423 SCTP_TRANSPORT_DOWN, 425 SCTP_TRANSPORT_DOWN,
424 SCTP_FAILED_THRESHOLD); 426 SCTP_FAILED_THRESHOLD);
@@ -593,7 +595,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
593 /* Mark the destination transport address as active if it is not so 595 /* Mark the destination transport address as active if it is not so
594 * marked. 596 * marked.
595 */ 597 */
596 if (!t->active) 598 if (t->state == SCTP_INACTIVE)
597 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, 599 sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
598 SCTP_HEARTBEAT_SUCCESS); 600 SCTP_HEARTBEAT_SUCCESS);
599 601
@@ -665,8 +667,11 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
665 667
666 asoc->state = state; 668 asoc->state = state;
667 669
670 SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
671 asoc, sctp_state_tbl[state]);
672
668 if (sctp_style(sk, TCP)) { 673 if (sctp_style(sk, TCP)) {
669 /* Change the sk->sk_state of a TCP-style socket that has 674 /* Change the sk->sk_state of a TCP-style socket that has
670 * sucessfully completed a connect() call. 675 * sucessfully completed a connect() call.
671 */ 676 */
672 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) 677 if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
@@ -678,6 +683,16 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
678 sk->sk_shutdown |= RCV_SHUTDOWN; 683 sk->sk_shutdown |= RCV_SHUTDOWN;
679 } 684 }
680 685
686 if (sctp_state(asoc, COOKIE_WAIT)) {
687 /* Reset init timeouts since they may have been
688 * increased due to timer expirations.
689 */
690 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
691 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
692 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
693 asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
694 }
695
681 if (sctp_state(asoc, ESTABLISHED) || 696 if (sctp_state(asoc, ESTABLISHED) ||
682 sctp_state(asoc, CLOSED) || 697 sctp_state(asoc, CLOSED) ||
683 sctp_state(asoc, SHUTDOWN_RECEIVED)) { 698 sctp_state(asoc, SHUTDOWN_RECEIVED)) {
@@ -1120,10 +1135,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1120 * to be executed only during failed attempts of 1135 * to be executed only during failed attempts of
1121 * association establishment. 1136 * association establishment.
1122 */ 1137 */
1123 if ((asoc->peer.retran_path != 1138 if ((asoc->peer.retran_path !=
1124 asoc->peer.primary_path) && 1139 asoc->peer.primary_path) &&
1125 (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) { 1140 (asoc->init_err_counter > 0)) {
1126 sctp_add_cmd_sf(commands, 1141 sctp_add_cmd_sf(commands,
1127 SCTP_CMD_FORCE_PRIM_RETRAN, 1142 SCTP_CMD_FORCE_PRIM_RETRAN,
1128 SCTP_NULL()); 1143 SCTP_NULL());
1129 } 1144 }
@@ -1237,18 +1252,67 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1237 sctp_association_put(asoc); 1252 sctp_association_put(asoc);
1238 break; 1253 break;
1239 1254
1255 case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
1256 chunk = cmd->obj.ptr;
1257 t = sctp_assoc_choose_init_transport(asoc);
1258 asoc->init_last_sent_to = t;
1259 chunk->transport = t;
1260 t->init_sent_count++;
1261 break;
1262
1240 case SCTP_CMD_INIT_RESTART: 1263 case SCTP_CMD_INIT_RESTART:
1241 /* Do the needed accounting and updates 1264 /* Do the needed accounting and updates
1242 * associated with restarting an initialization 1265 * associated with restarting an initialization
1243 * timer. 1266 * timer. Only multiply the timeout by two if
1267 * all transports have been tried at the current
1268 * timeout.
1269 */
1270 t = asoc->init_last_sent_to;
1271 asoc->init_err_counter++;
1272
1273 if (t->init_sent_count > (asoc->init_cycle + 1)) {
1274 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
1275 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
1276 asoc->max_init_timeo) {
1277 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
1278 asoc->max_init_timeo;
1279 }
1280 asoc->init_cycle++;
1281 SCTP_DEBUG_PRINTK(
1282 "T1 INIT Timeout adjustment"
1283 " init_err_counter: %d"
1284 " cycle: %d"
1285 " timeout: %d\n",
1286 asoc->init_err_counter,
1287 asoc->init_cycle,
1288 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
1289 }
1290
1291 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
1292 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
1293 break;
1294
1295 case SCTP_CMD_COOKIEECHO_RESTART:
1296 /* Do the needed accounting and updates
1297 * associated with restarting the T1-cookie
1298 * timer. Unlike the T1-init case, the timeout
1299 * is doubled on every restart, capped at
1300 * max_init_timeo.
1244 */ 1301 */
1245 asoc->counters[SCTP_COUNTER_INIT_ERROR]++; 1302 asoc->init_err_counter++;
1246 asoc->timeouts[cmd->obj.to] *= 2; 1303
1247 if (asoc->timeouts[cmd->obj.to] > 1304 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
1305 if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
1248 asoc->max_init_timeo) { 1306 asoc->max_init_timeo) {
1249 asoc->timeouts[cmd->obj.to] = 1307 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
1250 asoc->max_init_timeo; 1308 asoc->max_init_timeo;
1251 } 1309 }
1310 SCTP_DEBUG_PRINTK(
1311 "T1 COOKIE Timeout adjustment"
1312 " init_err_counter: %d"
1313 " timeout: %d\n",
1314 asoc->init_err_counter,
1315 asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
1252 1316
1253 /* If we've sent any data bundled with 1317 /* If we've sent any data bundled with
1254 * COOKIE-ECHO we need to resend. 1318 * COOKIE-ECHO we need to resend.
@@ -1261,7 +1325,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1261 1325
1262 sctp_add_cmd_sf(commands, 1326 sctp_add_cmd_sf(commands,
1263 SCTP_CMD_TIMER_RESTART, 1327 SCTP_CMD_TIMER_RESTART,
1264 SCTP_TO(cmd->obj.to)); 1328 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
1265 break; 1329 break;
1266 1330
1267 case SCTP_CMD_INIT_FAILED: 1331 case SCTP_CMD_INIT_FAILED:
@@ -1273,12 +1337,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1273 subtype, chunk, cmd->obj.u32); 1337 subtype, chunk, cmd->obj.u32);
1274 break; 1338 break;
1275 1339
1276 case SCTP_CMD_COUNTER_INC: 1340 case SCTP_CMD_INIT_COUNTER_INC:
1277 asoc->counters[cmd->obj.counter]++; 1341 asoc->init_err_counter++;
1278 break; 1342 break;
1279 1343
1280 case SCTP_CMD_COUNTER_RESET: 1344 case SCTP_CMD_INIT_COUNTER_RESET:
1281 asoc->counters[cmd->obj.counter] = 0; 1345 asoc->init_err_counter = 0;
1346 asoc->init_cycle = 0;
1282 break; 1347 break;
1283 1348
1284 case SCTP_CMD_REPORT_DUP: 1349 case SCTP_CMD_REPORT_DUP:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e01b8f09ac2..058189684c7c 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -533,6 +533,9 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
533 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, 533 sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
534 SCTP_PEER_INIT(initchunk)); 534 SCTP_PEER_INIT(initchunk));
535 535
536 /* Reset init error count upon receipt of INIT-ACK. */
537 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
538
536 /* 5.1 C) "A" shall stop the T1-init timer and leave 539 /* 5.1 C) "A" shall stop the T1-init timer and leave
537 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie 540 * COOKIE-WAIT state. "A" shall then ... start the T1-cookie
538 * timer, and enter the COOKIE-ECHOED state. 541 * timer, and enter the COOKIE-ECHOED state.
@@ -775,8 +778,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
775 * from the COOKIE-ECHOED state to the COOKIE-WAIT 778 * from the COOKIE-ECHOED state to the COOKIE-WAIT
776 * state is performed. 779 * state is performed.
777 */ 780 */
778 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET, 781 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
779 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
780 782
781 /* RFC 2960 5.1 Normal Establishment of an Association 783 /* RFC 2960 5.1 Normal Establishment of an Association
782 * 784 *
@@ -1019,10 +1021,22 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
1019 link = sctp_assoc_lookup_paddr(asoc, &from_addr); 1021 link = sctp_assoc_lookup_paddr(asoc, &from_addr);
1020 1022
1021 /* This should never happen, but lets log it if so. */ 1023 /* This should never happen, but lets log it if so. */
1022 if (!link) { 1024 if (unlikely(!link)) {
1023 printk(KERN_WARNING 1025 if (from_addr.sa.sa_family == AF_INET6) {
1024 "%s: Could not find address %d.%d.%d.%d\n", 1026 printk(KERN_WARNING
1025 __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr)); 1027 "%s association %p could not find address "
1028 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
1029 __FUNCTION__,
1030 asoc,
1031 NIP6(from_addr.v6.sin6_addr));
1032 } else {
1033 printk(KERN_WARNING
1034 "%s association %p could not find address "
1035 "%u.%u.%u.%u\n",
1036 __FUNCTION__,
1037 asoc,
1038 NIPQUAD(from_addr.v4.sin_addr.s_addr));
1039 }
1026 return SCTP_DISPOSITION_DISCARD; 1040 return SCTP_DISPOSITION_DISCARD;
1027 } 1041 }
1028 1042
@@ -2095,9 +2109,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2095 sctp_errhdr_t *err; 2109 sctp_errhdr_t *err;
2096 struct sctp_chunk *reply; 2110 struct sctp_chunk *reply;
2097 struct sctp_bind_addr *bp; 2111 struct sctp_bind_addr *bp;
2098 int attempts; 2112 int attempts = asoc->init_err_counter + 1;
2099
2100 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
2101 2113
2102 if (attempts >= asoc->max_init_attempts) { 2114 if (attempts >= asoc->max_init_attempts) {
2103 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2115 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -2157,8 +2169,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
2157 /* Cast away the const modifier, as we want to just 2169 /* Cast away the const modifier, as we want to just
2158 * rerun it through as a sideffect. 2170 * rerun it through as a sideffect.
2159 */ 2171 */
2160 sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC, 2172 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
2161 SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
2162 2173
2163 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 2174 sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
2164 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); 2175 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
@@ -2281,8 +2292,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
2281 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) 2292 if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
2282 error = ((sctp_errhdr_t *)chunk->skb->data)->cause; 2293 error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
2283 2294
2284 sctp_stop_t1_and_abort(commands, error); 2295 return sctp_stop_t1_and_abort(commands, error, asoc, chunk->transport);
2285 return SCTP_DISPOSITION_ABORT;
2286} 2296}
2287 2297
2288/* 2298/*
@@ -2294,8 +2304,8 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep
2294 void *arg, 2304 void *arg,
2295 sctp_cmd_seq_t *commands) 2305 sctp_cmd_seq_t *commands)
2296{ 2306{
2297 sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR); 2307 return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, asoc,
2298 return SCTP_DISPOSITION_ABORT; 2308 (struct sctp_transport *)arg);
2299} 2309}
2300 2310
2301/* 2311/*
@@ -2318,8 +2328,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2318 * 2328 *
2319 * This is common code called by several sctp_sf_*_abort() functions above. 2329 * This is common code called by several sctp_sf_*_abort() functions above.
2320 */ 2330 */
2321void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error) 2331sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2332 __u16 error,
2333 const struct sctp_association *asoc,
2334 struct sctp_transport *transport)
2322{ 2335{
2336 SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
2323 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, 2337 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
2324 SCTP_STATE(SCTP_STATE_CLOSED)); 2338 SCTP_STATE(SCTP_STATE_CLOSED));
2325 SCTP_INC_STATS(SCTP_MIB_ABORTEDS); 2339 SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
@@ -2328,6 +2342,7 @@ void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
2328 /* CMD_INIT_FAILED will DELETE_TCB. */ 2342 /* CMD_INIT_FAILED will DELETE_TCB. */
2329 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 2343 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
2330 SCTP_U32(error)); 2344 SCTP_U32(error));
2345 return SCTP_DISPOSITION_ABORT;
2331} 2346}
2332 2347
2333/* 2348/*
@@ -3805,6 +3820,10 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
3805 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, 3820 sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
3806 SCTP_ASOC((struct sctp_association *) asoc)); 3821 SCTP_ASOC((struct sctp_association *) asoc));
3807 3822
3823 /* Choose transport for INIT. */
3824 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
3825 SCTP_CHUNK(repl));
3826
3808 /* After sending the INIT, "A" starts the T1-init timer and 3827 /* After sending the INIT, "A" starts the T1-init timer and
3809 * enters the COOKIE-WAIT state. 3828 * enters the COOKIE-WAIT state.
3810 */ 3829 */
@@ -4589,7 +4608,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4589} 4608}
4590 4609
4591/* 4610/*
4592 * sctp_sf_t1_timer_expire 4611 * sctp_sf_t1_init_timer_expire
4593 * 4612 *
4594 * Section: 4 Note: 2 4613 * Section: 4 Note: 2
4595 * Verification Tag: 4614 * Verification Tag:
@@ -4603,7 +4622,59 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4603 * endpoint MUST abort the initialization process and report the 4622 * endpoint MUST abort the initialization process and report the
4604 * error to SCTP user. 4623 * error to SCTP user.
4605 * 4624 *
4606 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit 4625 * Outputs
4626 * (timers, events)
4627 *
4628 */
4629sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
4630 const struct sctp_association *asoc,
4631 const sctp_subtype_t type,
4632 void *arg,
4633 sctp_cmd_seq_t *commands)
4634{
4635 struct sctp_chunk *repl = NULL;
4636 struct sctp_bind_addr *bp;
4637 int attempts = asoc->init_err_counter + 1;
4638
4639 SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
4640
4641 if (attempts < asoc->max_init_attempts) {
4642 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4643 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4644 if (!repl)
4645 return SCTP_DISPOSITION_NOMEM;
4646
4647 /* Choose transport for INIT. */
4648 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
4649 SCTP_CHUNK(repl));
4650
4651 /* Issue a sideeffect to do the needed accounting. */
4652 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
4653 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
4654
4655 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4656 } else {
4657 SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
4658 " max_init_attempts: %d\n",
4659 attempts, asoc->max_init_attempts);
4660 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
4661 SCTP_U32(SCTP_ERROR_NO_ERROR));
4662 return SCTP_DISPOSITION_DELETE_TCB;
4663 }
4664
4665 return SCTP_DISPOSITION_CONSUME;
4666}
4667
4668/*
4669 * sctp_sf_t1_cookie_timer_expire
4670 *
4671 * Section: 4 Note: 2
4672 * Verification Tag:
4673 * Inputs
4674 * (endpoint, asoc)
4675 *
4676 * RFC 2960 Section 4 Notes
4677 * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
4607 * COOKIE ECHO and re-start the T1-cookie timer without changing 4678 * COOKIE ECHO and re-start the T1-cookie timer without changing
4608 * state. This MUST be repeated up to 'Max.Init.Retransmits' times. 4679 * state. This MUST be repeated up to 'Max.Init.Retransmits' times.
4609 * After that, the endpoint MUST abort the initialization process and 4680 * After that, the endpoint MUST abort the initialization process and
@@ -4613,46 +4684,26 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
4613 * (timers, events) 4684 * (timers, events)
4614 * 4685 *
4615 */ 4686 */
4616sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep, 4687sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
4617 const struct sctp_association *asoc, 4688 const struct sctp_association *asoc,
4618 const sctp_subtype_t type, 4689 const sctp_subtype_t type,
4619 void *arg, 4690 void *arg,
4620 sctp_cmd_seq_t *commands) 4691 sctp_cmd_seq_t *commands)
4621{ 4692{
4622 struct sctp_chunk *repl; 4693 struct sctp_chunk *repl = NULL;
4623 struct sctp_bind_addr *bp; 4694 int attempts = asoc->init_err_counter + 1;
4624 sctp_event_timeout_t timer = (sctp_event_timeout_t) arg;
4625 int timeout;
4626 int attempts;
4627
4628 timeout = asoc->timeouts[timer];
4629 attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
4630 repl = NULL;
4631 4695
4632 SCTP_DEBUG_PRINTK("Timer T1 expired.\n"); 4696 SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
4633 4697
4634 if (attempts < asoc->max_init_attempts) { 4698 if (attempts < asoc->max_init_attempts) {
4635 switch (timer) { 4699 repl = sctp_make_cookie_echo(asoc, NULL);
4636 case SCTP_EVENT_TIMEOUT_T1_INIT:
4637 bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
4638 repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
4639 break;
4640
4641 case SCTP_EVENT_TIMEOUT_T1_COOKIE:
4642 repl = sctp_make_cookie_echo(asoc, NULL);
4643 break;
4644
4645 default:
4646 BUG();
4647 break;
4648 };
4649
4650 if (!repl) 4700 if (!repl)
4651 goto nomem; 4701 return SCTP_DISPOSITION_NOMEM;
4652 4702
4653 /* Issue a sideeffect to do the needed accounting. */ 4703 /* Issue a sideeffect to do the needed accounting. */
4654 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART, 4704 sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
4655 SCTP_TO(timer)); 4705 SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
4706
4656 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); 4707 sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
4657 } else { 4708 } else {
4658 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, 4709 sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -4661,9 +4712,6 @@ sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
4661 } 4712 }
4662 4713
4663 return SCTP_DISPOSITION_CONSUME; 4714 return SCTP_DISPOSITION_CONSUME;
4664
4665nomem:
4666 return SCTP_DISPOSITION_NOMEM;
4667} 4715}
4668 4716
4669/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN 4717/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8967846f69e8..75ef10408764 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -783,7 +783,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
783 /* SCTP_STATE_COOKIE_WAIT */ \ 783 /* SCTP_STATE_COOKIE_WAIT */ \
784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ 784 {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
785 /* SCTP_STATE_COOKIE_ECHOED */ \ 785 /* SCTP_STATE_COOKIE_ECHOED */ \
786 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 786 {.fn = sctp_sf_t1_cookie_timer_expire, \
787 .name = "sctp_sf_t1_cookie_timer_expire"}, \
787 /* SCTP_STATE_ESTABLISHED */ \ 788 /* SCTP_STATE_ESTABLISHED */ \
788 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 789 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
789 /* SCTP_STATE_SHUTDOWN_PENDING */ \ 790 /* SCTP_STATE_SHUTDOWN_PENDING */ \
@@ -802,7 +803,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
802 /* SCTP_STATE_CLOSED */ \ 803 /* SCTP_STATE_CLOSED */ \
803 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 804 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
804 /* SCTP_STATE_COOKIE_WAIT */ \ 805 /* SCTP_STATE_COOKIE_WAIT */ \
805 {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ 806 {.fn = sctp_sf_t1_init_timer_expire, \
807 .name = "sctp_sf_t1_init_timer_expire"}, \
806 /* SCTP_STATE_COOKIE_ECHOED */ \ 808 /* SCTP_STATE_COOKIE_ECHOED */ \
807 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ 809 {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
808 /* SCTP_STATE_ESTABLISHED */ \ 810 /* SCTP_STATE_ESTABLISHED */ \
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e6926cb19420..aad55dc3792b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -262,18 +262,18 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
262 * sockaddr_in6 [RFC 2553]), 262 * sockaddr_in6 [RFC 2553]),
263 * addr_len - the size of the address structure. 263 * addr_len - the size of the address structure.
264 */ 264 */
265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) 265SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
266{ 266{
267 int retval = 0; 267 int retval = 0;
268 268
269 sctp_lock_sock(sk); 269 sctp_lock_sock(sk);
270 270
271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n", 271 SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
272 sk, uaddr, addr_len); 272 sk, addr, addr_len);
273 273
274 /* Disallow binding twice. */ 274 /* Disallow binding twice. */
275 if (!sctp_sk(sk)->ep->base.bind_addr.port) 275 if (!sctp_sk(sk)->ep->base.bind_addr.port)
276 retval = sctp_do_bind(sk, (union sctp_addr *)uaddr, 276 retval = sctp_do_bind(sk, (union sctp_addr *)addr,
277 addr_len); 277 addr_len);
278 else 278 else
279 retval = -EINVAL; 279 retval = -EINVAL;
@@ -318,23 +318,27 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
318 unsigned short snum; 318 unsigned short snum;
319 int ret = 0; 319 int ret = 0;
320 320
321 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n",
322 sk, addr, len);
323
324 /* Common sockaddr verification. */ 321 /* Common sockaddr verification. */
325 af = sctp_sockaddr_af(sp, addr, len); 322 af = sctp_sockaddr_af(sp, addr, len);
326 if (!af) 323 if (!af) {
324 SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
325 sk, addr, len);
327 return -EINVAL; 326 return -EINVAL;
327 }
328
329 snum = ntohs(addr->v4.sin_port);
330
331 SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
332 ", port: %d, new port: %d, len: %d)\n",
333 sk,
334 addr,
335 bp->port, snum,
336 len);
328 337
329 /* PF specific bind() address verification. */ 338 /* PF specific bind() address verification. */
330 if (!sp->pf->bind_verify(sp, addr)) 339 if (!sp->pf->bind_verify(sp, addr))
331 return -EADDRNOTAVAIL; 340 return -EADDRNOTAVAIL;
332 341
333 snum= ntohs(addr->v4.sin_port);
334
335 SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n",
336 bp->port, snum);
337
338 /* We must either be unbound, or bind to the same port. */ 342 /* We must either be unbound, or bind to the same port. */
339 if (bp->port && (snum != bp->port)) { 343 if (bp->port && (snum != bp->port)) {
340 SCTP_DEBUG_PRINTK("sctp_do_bind:" 344 SCTP_DEBUG_PRINTK("sctp_do_bind:"
@@ -816,7 +820,8 @@ out:
816 * 820 *
817 * Basically do nothing but copying the addresses from user to kernel 821 * Basically do nothing but copying the addresses from user to kernel
818 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. 822 * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
819 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. 823 * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
824 * from userspace.
820 * 825 *
821 * We don't use copy_from_user() for optimization: we first do the 826 * We don't use copy_from_user() for optimization: we first do the
822 * sanity checks (buffer size -fast- and access check-healthy 827 * sanity checks (buffer size -fast- and access check-healthy
@@ -913,6 +918,243 @@ out:
913 return err; 918 return err;
914} 919}
915 920
921/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
922 *
923 * Common routine for handling connect() and sctp_connectx().
924 * Connect will come in with just a single address.
925 */
926static int __sctp_connect(struct sock* sk,
927 struct sockaddr *kaddrs,
928 int addrs_size)
929{
930 struct sctp_sock *sp;
931 struct sctp_endpoint *ep;
932 struct sctp_association *asoc = NULL;
933 struct sctp_association *asoc2;
934 struct sctp_transport *transport;
935 union sctp_addr to;
936 struct sctp_af *af;
937 sctp_scope_t scope;
938 long timeo;
939 int err = 0;
940 int addrcnt = 0;
941 int walk_size = 0;
942 struct sockaddr *sa_addr;
943 void *addr_buf;
944
945 sp = sctp_sk(sk);
946 ep = sp->ep;
947
948 /* connect() cannot be done on a socket that is already in ESTABLISHED
949 * state - UDP-style peeled off socket or a TCP-style socket that
950 * is already connected.
951 * It cannot be done even on a TCP-style listening socket.
952 */
953 if (sctp_sstate(sk, ESTABLISHED) ||
954 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
955 err = -EISCONN;
956 goto out_free;
957 }
958
959 /* Walk through the addrs buffer and count the number of addresses. */
960 addr_buf = kaddrs;
961 while (walk_size < addrs_size) {
962 sa_addr = (struct sockaddr *)addr_buf;
963 af = sctp_get_af_specific(sa_addr->sa_family);
964
965 /* If the address family is not supported or if this address
966 * causes the address buffer to overflow return EINVAL.
967 */
968 if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
969 err = -EINVAL;
970 goto out_free;
971 }
972
973 err = sctp_verify_addr(sk, (union sctp_addr *)sa_addr,
974 af->sockaddr_len);
975 if (err)
976 goto out_free;
977
978 memcpy(&to, sa_addr, af->sockaddr_len);
979 to.v4.sin_port = ntohs(to.v4.sin_port);
980
981 /* Check if there already is a matching association on the
982 * endpoint (other than the one created here).
983 */
984 asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
985 if (asoc2 && asoc2 != asoc) {
986 if (asoc2->state >= SCTP_STATE_ESTABLISHED)
987 err = -EISCONN;
988 else
989 err = -EALREADY;
990 goto out_free;
991 }
992
993 /* If we could not find a matching association on the endpoint,
994 * make sure that there is no peeled-off association matching
995 * the peer address even on another socket.
996 */
997 if (sctp_endpoint_is_peeled_off(ep, &to)) {
998 err = -EADDRNOTAVAIL;
999 goto out_free;
1000 }
1001
1002 if (!asoc) {
1003 /* If a bind() or sctp_bindx() is not called prior to
1004 * an sctp_connectx() call, the system picks an
1005 * ephemeral port and will choose an address set
1006 * equivalent to binding with a wildcard address.
1007 */
1008 if (!ep->base.bind_addr.port) {
1009 if (sctp_autobind(sk)) {
1010 err = -EAGAIN;
1011 goto out_free;
1012 }
1013 }
1014
1015 scope = sctp_scope(&to);
1016 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
1017 if (!asoc) {
1018 err = -ENOMEM;
1019 goto out_free;
1020 }
1021 }
1022
1023 /* Prime the peer's transport structures. */
1024 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
1025 SCTP_UNKNOWN);
1026 if (!transport) {
1027 err = -ENOMEM;
1028 goto out_free;
1029 }
1030
1031 addrcnt++;
1032 addr_buf += af->sockaddr_len;
1033 walk_size += af->sockaddr_len;
1034 }
1035
1036 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
1037 if (err < 0) {
1038 goto out_free;
1039 }
1040
1041 err = sctp_primitive_ASSOCIATE(asoc, NULL);
1042 if (err < 0) {
1043 goto out_free;
1044 }
1045
1046 /* Initialize sk's dport and daddr for getpeername() */
1047 inet_sk(sk)->dport = htons(asoc->peer.port);
1048 af = sctp_get_af_specific(to.sa.sa_family);
1049 af->to_sk_daddr(&to, sk);
1050
1051 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
1052 err = sctp_wait_for_connect(asoc, &timeo);
1053
1054 /* Don't free association on exit. */
1055 asoc = NULL;
1056
1057out_free:
1058
1059 SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
1060 " kaddrs: %p err: %d\n",
1061 asoc, kaddrs, err);
1062 if (asoc)
1063 sctp_association_free(asoc);
1064 return err;
1065}
1066
1067/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
1068 *
1069 * API 8.9
1070 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt);
1071 *
1072 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
1073 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
1074 * or IPv6 addresses.
1075 *
1076 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
1077 * Section 3.1.2 for this usage.
1078 *
1079 * addrs is a pointer to an array of one or more socket addresses. Each
1080 * address is contained in its appropriate structure (i.e. struct
1081 * sockaddr_in or struct sockaddr_in6) the family of the address type
1082 * must be used to distengish the address length (note that this
1083 * representation is termed a "packed array" of addresses). The caller
1084 * specifies the number of addresses in the array with addrcnt.
1085 *
1086 * On success, sctp_connectx() returns 0. On failure, sctp_connectx() returns
1087 * -1, and sets errno to the appropriate error code.
1088 *
1089 * For SCTP, the port given in each socket address must be the same, or
1090 * sctp_connectx() will fail, setting errno to EINVAL.
1091 *
1092 * An application can use sctp_connectx to initiate an association with
1093 * an endpoint that is multi-homed. Much like sctp_bindx() this call
1094 * allows a caller to specify multiple addresses at which a peer can be
1095 * reached. The way the SCTP stack uses the list of addresses to set up
1096 * the association is implementation dependent. This function only
1097 * specifies that the stack will try to make use of all the addresses in
1098 * the list when needed.
1099 *
1100 * Note that the list of addresses passed in is only used for setting up
1101 * the association. It does not necessarily equal the set of addresses
1102 * the peer uses for the resulting association. If the caller wants to
1103 * find out the set of peer addresses, it must use sctp_getpaddrs() to
1104 * retrieve them after the association has been set up.
1105 *
1106 * Basically do nothing but copying the addresses from user to kernel
1107 * land and invoking __sctp_connect(). This is used for tunneling
1108 * the sctp_connectx() request through sctp_setsockopt() from userspace.
1109 *
1110 * We don't use copy_from_user() for optimization: we first do the
1111 * sanity checks (buffer size -fast- and access check-healthy
1112 * pointer); if all of those succeed, then we can alloc the memory
1113 * (expensive operation) needed to copy the data to kernel. Then we do
1114 * the copying without checking the user space area
1115 * (__copy_from_user()).
1116 *
1117 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
1118 * it.
1119 *
1120 * sk The sk of the socket
1121 * addrs The pointer to the addresses in user land
1122 * addrs_size Size of the addrs buffer
1123 *
1124 * Returns 0 if ok, <0 errno code on error.
1125 */
1126SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
1127 struct sockaddr __user *addrs,
1128 int addrs_size)
1129{
1130 int err = 0;
1131 struct sockaddr *kaddrs;
1132
1133 SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
1134 __FUNCTION__, sk, addrs, addrs_size);
1135
1136 if (unlikely(addrs_size <= 0))
1137 return -EINVAL;
1138
1139 /* Check the user passed a healthy pointer. */
1140 if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
1141 return -EFAULT;
1142
1143 /* Alloc space for the address array in kernel memory. */
1144 kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL);
1145 if (unlikely(!kaddrs))
1146 return -ENOMEM;
1147
1148 if (__copy_from_user(kaddrs, addrs, addrs_size)) {
1149 err = -EFAULT;
1150 } else {
1151 err = __sctp_connect(sk, kaddrs, addrs_size);
1152 }
1153
1154 kfree(kaddrs);
1155 return err;
1156}
1157
916/* API 3.1.4 close() - UDP Style Syntax 1158/* API 3.1.4 close() - UDP Style Syntax
917 * Applications use close() to perform graceful shutdown (as described in 1159 * Applications use close() to perform graceful shutdown (as described in
918 * Section 10.1 of [SCTP]) on ALL the associations currently represented 1160 * Section 10.1 of [SCTP]) on ALL the associations currently represented
@@ -1095,7 +1337,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1095 sp = sctp_sk(sk); 1337 sp = sctp_sk(sk);
1096 ep = sp->ep; 1338 ep = sp->ep;
1097 1339
1098 SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name); 1340 SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
1099 1341
1100 /* We cannot send a message over a TCP-style listening socket. */ 1342 /* We cannot send a message over a TCP-style listening socket. */
1101 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { 1343 if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
@@ -1306,7 +1548,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
1306 } 1548 }
1307 1549
1308 /* Prime the peer's transport structures. */ 1550 /* Prime the peer's transport structures. */
1309 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); 1551 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
1310 if (!transport) { 1552 if (!transport) {
1311 err = -ENOMEM; 1553 err = -ENOMEM;
1312 goto out_free; 1554 goto out_free;
@@ -2208,6 +2450,12 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
2208 optlen, SCTP_BINDX_REM_ADDR); 2450 optlen, SCTP_BINDX_REM_ADDR);
2209 break; 2451 break;
2210 2452
2453 case SCTP_SOCKOPT_CONNECTX:
2454 /* 'optlen' is the size of the addresses buffer. */
2455 retval = sctp_setsockopt_connectx(sk, (struct sockaddr __user *)optval,
2456 optlen);
2457 break;
2458
2211 case SCTP_DISABLE_FRAGMENTS: 2459 case SCTP_DISABLE_FRAGMENTS:
2212 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); 2460 retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
2213 break; 2461 break;
@@ -2283,112 +2531,29 @@ out_nounlock:
2283 * 2531 *
2284 * len: the size of the address. 2532 * len: the size of the address.
2285 */ 2533 */
2286SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr, 2534SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
2287 int addr_len) 2535 int addr_len)
2288{ 2536{
2289 struct sctp_sock *sp;
2290 struct sctp_endpoint *ep;
2291 struct sctp_association *asoc;
2292 struct sctp_transport *transport;
2293 union sctp_addr to;
2294 struct sctp_af *af;
2295 sctp_scope_t scope;
2296 long timeo;
2297 int err = 0; 2537 int err = 0;
2538 struct sctp_af *af;
2298 2539
2299 sctp_lock_sock(sk); 2540 sctp_lock_sock(sk);
2300 2541
2301 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n", 2542 SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
2302 __FUNCTION__, sk, uaddr, addr_len); 2543 __FUNCTION__, sk, addr, addr_len);
2303
2304 sp = sctp_sk(sk);
2305 ep = sp->ep;
2306
2307 /* connect() cannot be done on a socket that is already in ESTABLISHED
2308 * state - UDP-style peeled off socket or a TCP-style socket that
2309 * is already connected.
2310 * It cannot be done even on a TCP-style listening socket.
2311 */
2312 if (sctp_sstate(sk, ESTABLISHED) ||
2313 (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
2314 err = -EISCONN;
2315 goto out_unlock;
2316 }
2317
2318 err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len);
2319 if (err)
2320 goto out_unlock;
2321
2322 if (addr_len > sizeof(to))
2323 addr_len = sizeof(to);
2324 memcpy(&to, uaddr, addr_len);
2325 to.v4.sin_port = ntohs(to.v4.sin_port);
2326
2327 asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
2328 if (asoc) {
2329 if (asoc->state >= SCTP_STATE_ESTABLISHED)
2330 err = -EISCONN;
2331 else
2332 err = -EALREADY;
2333 goto out_unlock;
2334 }
2335
2336 /* If we could not find a matching association on the endpoint,
2337 * make sure that there is no peeled-off association matching the
2338 * peer address even on another socket.
2339 */
2340 if (sctp_endpoint_is_peeled_off(ep, &to)) {
2341 err = -EADDRNOTAVAIL;
2342 goto out_unlock;
2343 }
2344
2345 /* If a bind() or sctp_bindx() is not called prior to a connect()
2346 * call, the system picks an ephemeral port and will choose an address
2347 * set equivalent to binding with a wildcard address.
2348 */
2349 if (!ep->base.bind_addr.port) {
2350 if (sctp_autobind(sk)) {
2351 err = -EAGAIN;
2352 goto out_unlock;
2353 }
2354 }
2355
2356 scope = sctp_scope(&to);
2357 asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
2358 if (!asoc) {
2359 err = -ENOMEM;
2360 goto out_unlock;
2361 }
2362 2544
2363 /* Prime the peer's transport structures. */ 2545 /* Validate addr_len before calling common connect/connectx routine. */
2364 transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); 2546 af = sctp_get_af_specific(addr->sa_family);
2365 if (!transport) { 2547 if (!af || addr_len < af->sockaddr_len) {
2366 sctp_association_free(asoc); 2548 err = -EINVAL;
2367 goto out_unlock; 2549 } else {
2368 } 2550 /* Pass correct addr len to common routine (so it knows there
2369 err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL); 2551 * is only one address being passed).
2370 if (err < 0) { 2552 */
2371 sctp_association_free(asoc); 2553 err = __sctp_connect(sk, addr, af->sockaddr_len);
2372 goto out_unlock;
2373 }
2374
2375 err = sctp_primitive_ASSOCIATE(asoc, NULL);
2376 if (err < 0) {
2377 sctp_association_free(asoc);
2378 goto out_unlock;
2379 } 2554 }
2380 2555
2381 /* Initialize sk's dport and daddr for getpeername() */
2382 inet_sk(sk)->dport = htons(asoc->peer.port);
2383 af = sctp_get_af_specific(to.sa.sa_family);
2384 af->to_sk_daddr(&to, sk);
2385
2386 timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
2387 err = sctp_wait_for_connect(asoc, &timeo);
2388
2389out_unlock:
2390 sctp_release_sock(sk); 2556 sctp_release_sock(sk);
2391
2392 return err; 2557 return err;
2393} 2558}
2394 2559
@@ -2677,12 +2842,15 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
2677 /* Map ipv4 address into v4-mapped-on-v6 address. */ 2842 /* Map ipv4 address into v4-mapped-on-v6 address. */
2678 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), 2843 sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
2679 (union sctp_addr *)&status.sstat_primary.spinfo_address); 2844 (union sctp_addr *)&status.sstat_primary.spinfo_address);
2680 status.sstat_primary.spinfo_state = transport->active; 2845 status.sstat_primary.spinfo_state = transport->state;
2681 status.sstat_primary.spinfo_cwnd = transport->cwnd; 2846 status.sstat_primary.spinfo_cwnd = transport->cwnd;
2682 status.sstat_primary.spinfo_srtt = transport->srtt; 2847 status.sstat_primary.spinfo_srtt = transport->srtt;
2683 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); 2848 status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
2684 status.sstat_primary.spinfo_mtu = transport->pmtu; 2849 status.sstat_primary.spinfo_mtu = transport->pmtu;
2685 2850
2851 if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
2852 status.sstat_primary.spinfo_state = SCTP_ACTIVE;
2853
2686 if (put_user(len, optlen)) { 2854 if (put_user(len, optlen)) {
2687 retval = -EFAULT; 2855 retval = -EFAULT;
2688 goto out; 2856 goto out;
@@ -2733,12 +2901,15 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
2733 return -EINVAL; 2901 return -EINVAL;
2734 2902
2735 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); 2903 pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
2736 pinfo.spinfo_state = transport->active; 2904 pinfo.spinfo_state = transport->state;
2737 pinfo.spinfo_cwnd = transport->cwnd; 2905 pinfo.spinfo_cwnd = transport->cwnd;
2738 pinfo.spinfo_srtt = transport->srtt; 2906 pinfo.spinfo_srtt = transport->srtt;
2739 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); 2907 pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
2740 pinfo.spinfo_mtu = transport->pmtu; 2908 pinfo.spinfo_mtu = transport->pmtu;
2741 2909
2910 if (pinfo.spinfo_state == SCTP_UNKNOWN)
2911 pinfo.spinfo_state = SCTP_ACTIVE;
2912
2742 if (put_user(len, optlen)) { 2913 if (put_user(len, optlen)) {
2743 retval = -EFAULT; 2914 retval = -EFAULT;
2744 goto out; 2915 goto out;
@@ -3591,7 +3762,8 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
3591 int retval = 0; 3762 int retval = 0;
3592 int len; 3763 int len;
3593 3764
3594 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk); 3765 SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
3766 sk, optname);
3595 3767
3596 /* I can hardly begin to describe how wrong this is. This is 3768 /* I can hardly begin to describe how wrong this is. This is
3597 * so broken as to be worse than useless. The API draft 3769 * so broken as to be worse than useless. The API draft
@@ -4596,8 +4768,7 @@ out:
4596 return err; 4768 return err;
4597 4769
4598do_error: 4770do_error:
4599 if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >= 4771 if (asoc->init_err_counter + 1 >= asoc->max_init_attempts)
4600 asoc->max_init_attempts)
4601 err = -ETIMEDOUT; 4772 err = -ETIMEDOUT;
4602 else 4773 else
4603 err = -ECONNREFUSED; 4774 err = -ECONNREFUSED;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f30882e1e96a..0ec0fde6e6c5 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -83,7 +83,9 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
83 peer->last_time_used = jiffies; 83 peer->last_time_used = jiffies;
84 peer->last_time_ecne_reduced = jiffies; 84 peer->last_time_ecne_reduced = jiffies;
85 85
86 peer->active = SCTP_ACTIVE; 86 peer->init_sent_count = 0;
87
88 peer->state = SCTP_ACTIVE;
87 peer->hb_allowed = 0; 89 peer->hb_allowed = 0;
88 90
89 /* Initialize the default path max_retrans. */ 91 /* Initialize the default path max_retrans. */
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 0a4260719a12..d65ed8684fc1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -118,7 +118,6 @@ retry:
118 xfrm_policy_put_afinfo(afinfo); 118 xfrm_policy_put_afinfo(afinfo);
119 return type; 119 return type;
120} 120}
121EXPORT_SYMBOL(xfrm_get_type);
122 121
123int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, 122int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
124 unsigned short family) 123 unsigned short family)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2537f26f097c..9d206c282cf1 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1055,6 +1055,43 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
1055} 1055}
1056 1056
1057EXPORT_SYMBOL(xfrm_state_mtu); 1057EXPORT_SYMBOL(xfrm_state_mtu);
1058
1059int xfrm_init_state(struct xfrm_state *x)
1060{
1061 struct xfrm_state_afinfo *afinfo;
1062 int family = x->props.family;
1063 int err;
1064
1065 err = -EAFNOSUPPORT;
1066 afinfo = xfrm_state_get_afinfo(family);
1067 if (!afinfo)
1068 goto error;
1069
1070 err = 0;
1071 if (afinfo->init_flags)
1072 err = afinfo->init_flags(x);
1073
1074 xfrm_state_put_afinfo(afinfo);
1075
1076 if (err)
1077 goto error;
1078
1079 err = -EPROTONOSUPPORT;
1080 x->type = xfrm_get_type(x->id.proto, family);
1081 if (x->type == NULL)
1082 goto error;
1083
1084 err = x->type->init_state(x);
1085 if (err)
1086 goto error;
1087
1088 x->km.state = XFRM_STATE_VALID;
1089
1090error:
1091 return err;
1092}
1093
1094EXPORT_SYMBOL(xfrm_init_state);
1058 1095
1059void __init xfrm_state_init(void) 1096void __init xfrm_state_init(void)
1060{ 1097{
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 5ce8558eac91..ecade4893a13 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -249,17 +249,10 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
249 if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1]))) 249 if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
250 goto error; 250 goto error;
251 251
252 err = -ENOENT; 252 err = xfrm_init_state(x);
253 x->type = xfrm_get_type(x->id.proto, x->props.family);
254 if (x->type == NULL)
255 goto error;
256
257 err = x->type->init_state(x, NULL);
258 if (err) 253 if (err)
259 goto error; 254 goto error;
260 255
261 x->curlft.add_time = (unsigned long) xtime.tv_sec;
262 x->km.state = XFRM_STATE_VALID;
263 x->km.seq = p->seq; 256 x->km.seq = p->seq;
264 257
265 return x; 258 return x;