about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-10-23 14:47:02 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-10-23 14:47:02 -0400
commit 5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0 (patch)
tree 7851ef1c93aa1aba7ef327ca4b75fd35e6d10f29 /net/ipv4
parent 02f36038c568111ad4fc433f6fa760ff5e38fab4 (diff)
parent ec37a48d1d16c30b655ac5280209edf52a6775d4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1699 commits)
  bnx2/bnx2x: Unsupported Ethtool operations should return -EINVAL.
  vlan: Calling vlan_hwaccel_do_receive() is always valid.
  tproxy: use the interface primary IP address as a default value for --on-ip
  tproxy: added IPv6 support to the socket match
  cxgb3: function namespace cleanup
  tproxy: added IPv6 support to the TPROXY target
  tproxy: added IPv6 socket lookup function to nf_tproxy_core
  be2net: Changes to use only priority codes allowed by f/w
  tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
  tproxy: added tproxy sockopt interface in the IPV6 layer
  tproxy: added udp6_lib_lookup function
  tproxy: added const specifiers to udp lookup functions
  tproxy: split off ipv6 defragmentation to a separate module
  l2tp: small cleanup
  nf_nat: restrict ICMP translation for embedded header
  can: mcp251x: fix generation of error frames
  can: mcp251x: fix endless loop in interrupt handler if CANINTF_MERRF is set
  can-raw: add msg_flags to distinguish local traffic
  9p: client code cleanup
  rds: make local functions/variables static
  ...

Fix up conflicts in net/core/dev.c, drivers/net/pcmcia/smc91c92_cs.c and
drivers/net/wireless/ath/ath9k/debug.c as per David
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig 8
-rw-r--r-- net/ipv4/Makefile 1
-rw-r--r-- net/ipv4/af_inet.c 8
-rw-r--r-- net/ipv4/arp.c 245
-rw-r--r-- net/ipv4/datagram.c 2
-rw-r--r-- net/ipv4/devinet.c 11
-rw-r--r-- net/ipv4/fib_frontend.c 192
-rw-r--r-- net/ipv4/fib_hash.c 291
-rw-r--r-- net/ipv4/fib_lookup.h 11
-rw-r--r-- net/ipv4/fib_rules.c 13
-rw-r--r-- net/ipv4/fib_semantics.c 297
-rw-r--r-- net/ipv4/fib_trie.c 84
-rw-r--r-- net/ipv4/gre.c 151
-rw-r--r-- net/ipv4/icmp.c 4
-rw-r--r-- net/ipv4/igmp.c 22
-rw-r--r-- net/ipv4/inet_diag.c 2
-rw-r--r-- net/ipv4/inet_hashtables.c 28
-rw-r--r-- net/ipv4/ip_fragment.c 6
-rw-r--r-- net/ipv4/ip_gre.c 237
-rw-r--r-- net/ipv4/ip_options.c 3
-rw-r--r-- net/ipv4/ip_output.c 24
-rw-r--r-- net/ipv4/ipip.c 212
-rw-r--r-- net/ipv4/ipmr.c 428
-rw-r--r-- net/ipv4/netfilter/Kconfig 4
-rw-r--r-- net/ipv4/netfilter/arp_tables.c 64
-rw-r--r-- net/ipv4/netfilter/arpt_mangle.c 2
-rw-r--r-- net/ipv4/netfilter/ip_tables.c 84
-rw-r--r-- net/ipv4/netfilter/ipt_CLUSTERIP.c 31
-rw-r--r-- net/ipv4/netfilter/ipt_LOG.c 145
-rw-r--r-- net/ipv4/netfilter/nf_nat_amanda.c 9
-rw-r--r-- net/ipv4/netfilter/nf_nat_core.c 51
-rw-r--r-- net/ipv4/netfilter/nf_nat_ftp.c 9
-rw-r--r-- net/ipv4/netfilter/nf_nat_h323.c 53
-rw-r--r-- net/ipv4/netfilter/nf_nat_helper.c 76
-rw-r--r-- net/ipv4/netfilter/nf_nat_irc.c 9
-rw-r--r-- net/ipv4/netfilter/nf_nat_rule.c 17
-rw-r--r-- net/ipv4/netfilter/nf_nat_sip.c 27
-rw-r--r-- net/ipv4/protocol.c 31
-rw-r--r-- net/ipv4/raw.c 2
-rw-r--r-- net/ipv4/route.c 190
-rw-r--r-- net/ipv4/tcp.c 11
-rw-r--r-- net/ipv4/tcp_input.c 55
-rw-r--r-- net/ipv4/tcp_ipv4.c 12
-rw-r--r-- net/ipv4/tcp_minisocks.c 2
-rw-r--r-- net/ipv4/tcp_output.c 31
-rw-r--r-- net/ipv4/tcp_timer.c 50
-rw-r--r-- net/ipv4/tcp_westwood.c 2
-rw-r--r-- net/ipv4/tunnel4.c 19
-rw-r--r-- net/ipv4/udp.c 4
-rw-r--r-- net/ipv4/xfrm4_policy.c 4
-rw-r--r-- net/ipv4/xfrm4_tunnel.c 4
51 files changed, 1874 insertions, 1404 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7cd7760144f7..e848e6c062cd 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -215,9 +215,15 @@ config NET_IPIP
215 be inserted in and removed from the running kernel whenever you 215 be inserted in and removed from the running kernel whenever you
216 want). Most people won't need this and can say N. 216 want). Most people won't need this and can say N.
217 217
218config NET_IPGRE_DEMUX
219 tristate "IP: GRE demultiplexer"
220 help
221 This is helper module to demultiplex GRE packets on GRE version field criteria.
222 Required by ip_gre and pptp modules.
223
218config NET_IPGRE 224config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 225 tristate "IP: GRE tunnels over IP"
220 depends on IPV6 || IPV6=n 226 depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
221 help 227 help
222 Tunneling means encapsulating data of one protocol type within 228 Tunneling means encapsulating data of one protocol type within
223 another protocol and sending it over a channel that understands the 229 another protocol and sending it over a channel that understands the
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
21obj-$(CONFIG_IP_MROUTE) += ipmr.o 21obj-$(CONFIG_IP_MROUTE) += ipmr.o
22obj-$(CONFIG_NET_IPIP) += ipip.o 22obj-$(CONFIG_NET_IPIP) += ipip.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
23obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 25obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 26obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..f581f77d1097 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
227 227
228/* 228/*
229 * inet_ehash_secret must be set exactly once 229 * inet_ehash_secret must be set exactly once
230 * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
231 */ 230 */
232void build_ehash_secret(void) 231void build_ehash_secret(void)
233{ 232{
234 u32 rnd; 233 u32 rnd;
234
235 do { 235 do {
236 get_random_bytes(&rnd, sizeof(rnd)); 236 get_random_bytes(&rnd, sizeof(rnd));
237 } while (rnd == 0); 237 } while (rnd == 0);
238 spin_lock_bh(&inetsw_lock); 238
239 if (!inet_ehash_secret) 239 cmpxchg(&inet_ehash_secret, 0, rnd);
240 inet_ehash_secret = rnd;
241 spin_unlock_bh(&inetsw_lock);
242} 240}
243EXPORT_SYMBOL(build_ehash_secret); 241EXPORT_SYMBOL(build_ehash_secret);
244 242
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..d8e540c5b071 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
55 * Stuart Cheshire : Metricom and grat arp fixes 55 * Stuart Cheshire : Metricom and grat arp fixes
56 * *** FOR 2.1 clean this up *** 56 * *** FOR 2.1 clean this up ***
57 * Lawrence V. Stefani: (08/12/96) Added FDDI support. 57 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
58 * Alan Cox : Took the AP1000 nasty FDDI hack and 58 * Alan Cox : Took the AP1000 nasty FDDI hack and
59 * folded into the mainstream FDDI code. 59 * folded into the mainstream FDDI code.
60 * Ack spit, Linus how did you allow that 60 * Ack spit, Linus how did you allow that
61 * one in... 61 * one in...
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook);
120#endif 120#endif
121 121
122#include <asm/system.h> 122#include <asm/system.h>
123#include <asm/uaccess.h> 123#include <linux/uaccess.h>
124 124
125#include <linux/netfilter_arp.h> 125#include <linux/netfilter_arp.h>
126 126
127/* 127/*
128 * Interface to generic neighbour cache. 128 * Interface to generic neighbour cache.
129 */ 129 */
130static u32 arp_hash(const void *pkey, const struct net_device *dev); 130static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
131static int arp_constructor(struct neighbour *neigh); 131static int arp_constructor(struct neighbour *neigh);
132static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); 132static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
133static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); 133static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
161 .queue_xmit = dev_queue_xmit, 161 .queue_xmit = dev_queue_xmit,
162}; 162};
163 163
164const struct neigh_ops arp_broken_ops = { 164static const struct neigh_ops arp_broken_ops = {
165 .family = AF_INET, 165 .family = AF_INET,
166 .solicit = arp_solicit, 166 .solicit = arp_solicit,
167 .error_report = arp_error_report, 167 .error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
170 .hh_output = dev_queue_xmit, 170 .hh_output = dev_queue_xmit,
171 .queue_xmit = dev_queue_xmit, 171 .queue_xmit = dev_queue_xmit,
172}; 172};
173EXPORT_SYMBOL(arp_broken_ops);
174 173
175struct neigh_table arp_tbl = { 174struct neigh_table arp_tbl = {
176 .family = AF_INET, 175 .family = AF_INET,
177 .entry_size = sizeof(struct neighbour) + 4, 176 .entry_size = sizeof(struct neighbour) + 4,
178 .key_len = 4, 177 .key_len = 4,
179 .hash = arp_hash, 178 .hash = arp_hash,
180 .constructor = arp_constructor, 179 .constructor = arp_constructor,
181 .proxy_redo = parp_redo, 180 .proxy_redo = parp_redo,
182 .id = "arp_cache", 181 .id = "arp_cache",
183 .parms = { 182 .parms = {
184 .tbl = &arp_tbl, 183 .tbl = &arp_tbl,
185 .base_reachable_time = 30 * HZ, 184 .base_reachable_time = 30 * HZ,
186 .retrans_time = 1 * HZ, 185 .retrans_time = 1 * HZ,
187 .gc_staletime = 60 * HZ, 186 .gc_staletime = 60 * HZ,
188 .reachable_time = 30 * HZ, 187 .reachable_time = 30 * HZ,
189 .delay_probe_time = 5 * HZ, 188 .delay_probe_time = 5 * HZ,
190 .queue_len = 3, 189 .queue_len = 3,
191 .ucast_probes = 3, 190 .ucast_probes = 3,
192 .mcast_probes = 3, 191 .mcast_probes = 3,
193 .anycast_delay = 1 * HZ, 192 .anycast_delay = 1 * HZ,
194 .proxy_delay = (8 * HZ) / 10, 193 .proxy_delay = (8 * HZ) / 10,
195 .proxy_qlen = 64, 194 .proxy_qlen = 64,
196 .locktime = 1 * HZ, 195 .locktime = 1 * HZ,
197 }, 196 },
198 .gc_interval = 30 * HZ, 197 .gc_interval = 30 * HZ,
199 .gc_thresh1 = 128, 198 .gc_thresh1 = 128,
200 .gc_thresh2 = 512, 199 .gc_thresh2 = 512,
201 .gc_thresh3 = 1024, 200 .gc_thresh3 = 1024,
202}; 201};
203EXPORT_SYMBOL(arp_tbl); 202EXPORT_SYMBOL(arp_tbl);
204 203
@@ -226,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
226} 225}
227 226
228 227
229static u32 arp_hash(const void *pkey, const struct net_device *dev) 228static u32 arp_hash(const void *pkey,
229 const struct net_device *dev,
230 __u32 hash_rnd)
230{ 231{
231 return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); 232 return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
232} 233}
233 234
234static int arp_constructor(struct neighbour *neigh) 235static int arp_constructor(struct neighbour *neigh)
235{ 236{
236 __be32 addr = *(__be32*)neigh->primary_key; 237 __be32 addr = *(__be32 *)neigh->primary_key;
237 struct net_device *dev = neigh->dev; 238 struct net_device *dev = neigh->dev;
238 struct in_device *in_dev; 239 struct in_device *in_dev;
239 struct neigh_parms *parms; 240 struct neigh_parms *parms;
@@ -296,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
296 neigh->ops = &arp_broken_ops; 297 neigh->ops = &arp_broken_ops;
297 neigh->output = neigh->ops->output; 298 neigh->output = neigh->ops->output;
298 return 0; 299 return 0;
300#else
301 break;
299#endif 302#endif
300 ;} 303 }
301#endif 304#endif
302 if (neigh->type == RTN_MULTICAST) { 305 if (neigh->type == RTN_MULTICAST) {
303 neigh->nud_state = NUD_NOARP; 306 neigh->nud_state = NUD_NOARP;
304 arp_mc_map(addr, neigh->ha, dev, 1); 307 arp_mc_map(addr, neigh->ha, dev, 1);
305 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { 308 } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
306 neigh->nud_state = NUD_NOARP; 309 neigh->nud_state = NUD_NOARP;
307 memcpy(neigh->ha, dev->dev_addr, dev->addr_len); 310 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
308 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { 311 } else if (neigh->type == RTN_BROADCAST ||
312 (dev->flags & IFF_POINTOPOINT)) {
309 neigh->nud_state = NUD_NOARP; 313 neigh->nud_state = NUD_NOARP;
310 memcpy(neigh->ha, dev->broadcast, dev->addr_len); 314 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
311 } 315 }
@@ -315,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
315 else 319 else
316 neigh->ops = &arp_generic_ops; 320 neigh->ops = &arp_generic_ops;
317 321
318 if (neigh->nud_state&NUD_VALID) 322 if (neigh->nud_state & NUD_VALID)
319 neigh->output = neigh->ops->connected_output; 323 neigh->output = neigh->ops->connected_output;
320 else 324 else
321 neigh->output = neigh->ops->output; 325 neigh->output = neigh->ops->output;
@@ -334,7 +338,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
334 __be32 saddr = 0; 338 __be32 saddr = 0;
335 u8 *dst_ha = NULL; 339 u8 *dst_ha = NULL;
336 struct net_device *dev = neigh->dev; 340 struct net_device *dev = neigh->dev;
337 __be32 target = *(__be32*)neigh->primary_key; 341 __be32 target = *(__be32 *)neigh->primary_key;
338 int probes = atomic_read(&neigh->probes); 342 int probes = atomic_read(&neigh->probes);
339 struct in_device *in_dev; 343 struct in_device *in_dev;
340 344
@@ -347,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
347 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { 351 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
348 default: 352 default:
349 case 0: /* By default announce any local IP */ 353 case 0: /* By default announce any local IP */
350 if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) 354 if (skb && inet_addr_type(dev_net(dev),
355 ip_hdr(skb)->saddr) == RTN_LOCAL)
351 saddr = ip_hdr(skb)->saddr; 356 saddr = ip_hdr(skb)->saddr;
352 break; 357 break;
353 case 1: /* Restrict announcements of saddr in same subnet */ 358 case 1: /* Restrict announcements of saddr in same subnet */
@@ -369,16 +374,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
369 if (!saddr) 374 if (!saddr)
370 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); 375 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
371 376
372 if ((probes -= neigh->parms->ucast_probes) < 0) { 377 probes -= neigh->parms->ucast_probes;
373 if (!(neigh->nud_state&NUD_VALID)) 378 if (probes < 0) {
374 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); 379 if (!(neigh->nud_state & NUD_VALID))
380 printk(KERN_DEBUG
381 "trying to ucast probe in NUD_INVALID\n");
375 dst_ha = neigh->ha; 382 dst_ha = neigh->ha;
376 read_lock_bh(&neigh->lock); 383 read_lock_bh(&neigh->lock);
377 } else if ((probes -= neigh->parms->app_probes) < 0) { 384 } else {
385 probes -= neigh->parms->app_probes;
386 if (probes < 0) {
378#ifdef CONFIG_ARPD 387#ifdef CONFIG_ARPD
379 neigh_app_ns(neigh); 388 neigh_app_ns(neigh);
380#endif 389#endif
381 return; 390 return;
391 }
382 } 392 }
383 393
384 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 394 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -451,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
451 * is allowed to use this function, it is scheduled to be removed. --ANK 461 * is allowed to use this function, it is scheduled to be removed. --ANK
452 */ 462 */
453 463
454static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) 464static int arp_set_predefined(int addr_hint, unsigned char *haddr,
465 __be32 paddr, struct net_device *dev)
455{ 466{
456 switch (addr_hint) { 467 switch (addr_hint) {
457 case RTN_LOCAL: 468 case RTN_LOCAL:
@@ -483,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
483 494
484 paddr = skb_rtable(skb)->rt_gateway; 495 paddr = skb_rtable(skb)->rt_gateway;
485 496
486 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) 497 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
498 paddr, dev))
487 return 0; 499 return 0;
488 500
489 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); 501 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
490 502
491 if (n) { 503 if (n) {
492 n->used = jiffies; 504 n->used = jiffies;
493 if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { 505 if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
494 read_lock_bh(&n->lock); 506 neigh_ha_snapshot(haddr, n, dev);
495 memcpy(haddr, n->ha, dev->addr_len);
496 read_unlock_bh(&n->lock);
497 neigh_release(n); 507 neigh_release(n);
498 return 0; 508 return 0;
499 } 509 }
@@ -515,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
515 return -EINVAL; 525 return -EINVAL;
516 if (n == NULL) { 526 if (n == NULL) {
517 __be32 nexthop = ((struct rtable *)dst)->rt_gateway; 527 __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
518 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) 528 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
519 nexthop = 0; 529 nexthop = 0;
520 n = __neigh_lookup_errno( 530 n = __neigh_lookup_errno(
521#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 531#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
522 dev->type == ARPHRD_ATM ? clip_tbl_hook : 532 dev->type == ARPHRD_ATM ?
533 clip_tbl_hook :
523#endif 534#endif
524 &arp_tbl, &nexthop, dev); 535 &arp_tbl, &nexthop, dev);
525 if (IS_ERR(n)) 536 if (IS_ERR(n))
526 return PTR_ERR(n); 537 return PTR_ERR(n);
527 dst->neighbour = n; 538 dst->neighbour = n;
@@ -543,8 +554,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
543 554
544 if (!IN_DEV_PROXY_ARP(in_dev)) 555 if (!IN_DEV_PROXY_ARP(in_dev))
545 return 0; 556 return 0;
546 557 imi = IN_DEV_MEDIUM_ID(in_dev);
547 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) 558 if (imi == 0)
548 return 1; 559 return 1;
549 if (imi == -1) 560 if (imi == -1)
550 return 0; 561 return 0;
@@ -555,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
555 if (out_dev) 566 if (out_dev)
556 omi = IN_DEV_MEDIUM_ID(out_dev); 567 omi = IN_DEV_MEDIUM_ID(out_dev);
557 568
558 return (omi != imi && omi != -1); 569 return omi != imi && omi != -1;
559} 570}
560 571
561/* 572/*
@@ -685,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
685 arp->ar_pln = 4; 696 arp->ar_pln = 4;
686 arp->ar_op = htons(type); 697 arp->ar_op = htons(type);
687 698
688 arp_ptr=(unsigned char *)(arp+1); 699 arp_ptr = (unsigned char *)(arp + 1);
689 700
690 memcpy(arp_ptr, src_hw, dev->addr_len); 701 memcpy(arp_ptr, src_hw, dev->addr_len);
691 arp_ptr += dev->addr_len; 702 arp_ptr += dev->addr_len;
@@ -735,9 +746,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
735 746
736 skb = arp_create(type, ptype, dest_ip, dev, src_ip, 747 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
737 dest_hw, src_hw, target_hw); 748 dest_hw, src_hw, target_hw);
738 if (skb == NULL) { 749 if (skb == NULL)
739 return; 750 return;
740 }
741 751
742 arp_xmit(skb); 752 arp_xmit(skb);
743} 753}
@@ -815,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
815/* 825/*
816 * Extract fields 826 * Extract fields
817 */ 827 */
818 arp_ptr= (unsigned char *)(arp+1); 828 arp_ptr = (unsigned char *)(arp + 1);
819 sha = arp_ptr; 829 sha = arp_ptr;
820 arp_ptr += dev->addr_len; 830 arp_ptr += dev->addr_len;
821 memcpy(&sip, arp_ptr, 4); 831 memcpy(&sip, arp_ptr, 4);
@@ -869,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
869 addr_type = rt->rt_type; 879 addr_type = rt->rt_type;
870 880
871 if (addr_type == RTN_LOCAL) { 881 if (addr_type == RTN_LOCAL) {
872 int dont_send = 0; 882 int dont_send;
873 883
874 if (!dont_send) 884 dont_send = arp_ignore(in_dev, sip, tip);
875 dont_send |= arp_ignore(in_dev,sip,tip);
876 if (!dont_send && IN_DEV_ARPFILTER(in_dev)) 885 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
877 dont_send |= arp_filter(sip,tip,dev); 886 dont_send |= arp_filter(sip, tip, dev);
878 if (!dont_send) { 887 if (!dont_send) {
879 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
880 if (n) { 889 if (n) {
881 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 890 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
891 dev, tip, sha, dev->dev_addr,
892 sha);
882 neigh_release(n); 893 neigh_release(n);
883 } 894 }
884 } 895 }
@@ -887,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
887 if (addr_type == RTN_UNICAST && 898 if (addr_type == RTN_UNICAST &&
888 (arp_fwd_proxy(in_dev, dev, rt) || 899 (arp_fwd_proxy(in_dev, dev, rt) ||
889 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || 900 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
890 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) 901 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
891 {
892 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 902 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
893 if (n) 903 if (n)
894 neigh_release(n); 904 neigh_release(n);
@@ -896,9 +906,12 @@ static int arp_process(struct sk_buff *skb)
896 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 906 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
897 skb->pkt_type == PACKET_HOST || 907 skb->pkt_type == PACKET_HOST ||
898 in_dev->arp_parms->proxy_delay == 0) { 908 in_dev->arp_parms->proxy_delay == 0) {
899 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 909 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
910 dev, tip, sha, dev->dev_addr,
911 sha);
900 } else { 912 } else {
901 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 913 pneigh_enqueue(&arp_tbl,
914 in_dev->arp_parms, skb);
902 return 0; 915 return 0;
903 } 916 }
904 goto out; 917 goto out;
@@ -939,7 +952,8 @@ static int arp_process(struct sk_buff *skb)
939 if (arp->ar_op != htons(ARPOP_REPLY) || 952 if (arp->ar_op != htons(ARPOP_REPLY) ||
940 skb->pkt_type != PACKET_HOST) 953 skb->pkt_type != PACKET_HOST)
941 state = NUD_STALE; 954 state = NUD_STALE;
942 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); 955 neigh_update(n, sha, state,
956 override ? NEIGH_UPDATE_F_OVERRIDE : 0);
943 neigh_release(n); 957 neigh_release(n);
944 } 958 }
945 959
@@ -975,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
975 arp->ar_pln != 4) 989 arp->ar_pln != 4)
976 goto freeskb; 990 goto freeskb;
977 991
978 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 992 skb = skb_share_check(skb, GFP_ATOMIC);
993 if (skb == NULL)
979 goto out_of_mem; 994 goto out_of_mem;
980 995
981 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); 996 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1019,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1019 return -EINVAL; 1034 return -EINVAL;
1020 if (!dev && (r->arp_flags & ATF_COM)) { 1035 if (!dev && (r->arp_flags & ATF_COM)) {
1021 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, 1036 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
1022 r->arp_ha.sa_data); 1037 r->arp_ha.sa_data);
1023 if (!dev) 1038 if (!dev)
1024 return -ENODEV; 1039 return -ENODEV;
1025 } 1040 }
@@ -1033,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1033} 1048}
1034 1049
1035static int arp_req_set(struct net *net, struct arpreq *r, 1050static int arp_req_set(struct net *net, struct arpreq *r,
1036 struct net_device * dev) 1051 struct net_device *dev)
1037{ 1052{
1038 __be32 ip; 1053 __be32 ip;
1039 struct neighbour *neigh; 1054 struct neighbour *neigh;
@@ -1046,10 +1061,11 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1046 if (r->arp_flags & ATF_PERM) 1061 if (r->arp_flags & ATF_PERM)
1047 r->arp_flags |= ATF_COM; 1062 r->arp_flags |= ATF_COM;
1048 if (dev == NULL) { 1063 if (dev == NULL) {
1049 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1064 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1050 .tos = RTO_ONLINK } } }; 1065 .tos = RTO_ONLINK } };
1051 struct rtable * rt; 1066 struct rtable *rt;
1052 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1067 err = ip_route_output_key(net, &rt, &fl);
1068 if (err != 0)
1053 return err; 1069 return err;
1054 dev = rt->dst.dev; 1070 dev = rt->dst.dev;
1055 ip_rt_put(rt); 1071 ip_rt_put(rt);
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1083 unsigned state = NUD_STALE; 1099 unsigned state = NUD_STALE;
1084 if (r->arp_flags & ATF_PERM) 1100 if (r->arp_flags & ATF_PERM)
1085 state = NUD_PERMANENT; 1101 state = NUD_PERMANENT;
1086 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? 1102 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
1087 r->arp_ha.sa_data : NULL, state, 1103 r->arp_ha.sa_data : NULL, state,
1088 NEIGH_UPDATE_F_OVERRIDE| 1104 NEIGH_UPDATE_F_OVERRIDE |
1089 NEIGH_UPDATE_F_ADMIN); 1105 NEIGH_UPDATE_F_ADMIN);
1090 neigh_release(neigh); 1106 neigh_release(neigh);
1091 } 1107 }
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1094 1110
1095static unsigned arp_state_to_flags(struct neighbour *neigh) 1111static unsigned arp_state_to_flags(struct neighbour *neigh)
1096{ 1112{
1097 unsigned flags = 0;
1098 if (neigh->nud_state&NUD_PERMANENT) 1113 if (neigh->nud_state&NUD_PERMANENT)
1099 flags = ATF_PERM|ATF_COM; 1114 return ATF_PERM | ATF_COM;
1100 else if (neigh->nud_state&NUD_VALID) 1115 else if (neigh->nud_state&NUD_VALID)
1101 flags = ATF_COM; 1116 return ATF_COM;
1102 return flags; 1117 else
1118 return 0;
1103} 1119}
1104 1120
1105/* 1121/*
@@ -1142,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
1142} 1158}
1143 1159
1144static int arp_req_delete(struct net *net, struct arpreq *r, 1160static int arp_req_delete(struct net *net, struct arpreq *r,
1145 struct net_device * dev) 1161 struct net_device *dev)
1146{ 1162{
1147 int err; 1163 int err;
1148 __be32 ip; 1164 __be32 ip;
@@ -1153,10 +1169,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1153 1169
1154 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; 1170 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1155 if (dev == NULL) { 1171 if (dev == NULL) {
1156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1172 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1157 .tos = RTO_ONLINK } } }; 1173 .tos = RTO_ONLINK } };
1158 struct rtable * rt; 1174 struct rtable *rt;
1159 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1175 err = ip_route_output_key(net, &rt, &fl);
1176 if (err != 0)
1160 return err; 1177 return err;
1161 dev = rt->dst.dev; 1178 dev = rt->dst.dev;
1162 ip_rt_put(rt); 1179 ip_rt_put(rt);
@@ -1166,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1166 err = -ENXIO; 1183 err = -ENXIO;
1167 neigh = neigh_lookup(&arp_tbl, &ip, dev); 1184 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1168 if (neigh) { 1185 if (neigh) {
1169 if (neigh->nud_state&~NUD_NOARP) 1186 if (neigh->nud_state & ~NUD_NOARP)
1170 err = neigh_update(neigh, NULL, NUD_FAILED, 1187 err = neigh_update(neigh, NULL, NUD_FAILED,
1171 NEIGH_UPDATE_F_OVERRIDE| 1188 NEIGH_UPDATE_F_OVERRIDE|
1172 NEIGH_UPDATE_F_ADMIN); 1189 NEIGH_UPDATE_F_ADMIN);
@@ -1186,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1186 struct net_device *dev = NULL; 1203 struct net_device *dev = NULL;
1187 1204
1188 switch (cmd) { 1205 switch (cmd) {
1189 case SIOCDARP: 1206 case SIOCDARP:
1190 case SIOCSARP: 1207 case SIOCSARP:
1191 if (!capable(CAP_NET_ADMIN)) 1208 if (!capable(CAP_NET_ADMIN))
1192 return -EPERM; 1209 return -EPERM;
1193 case SIOCGARP: 1210 case SIOCGARP:
1194 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1211 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1195 if (err) 1212 if (err)
1196 return -EFAULT; 1213 return -EFAULT;
1197 break; 1214 break;
1198 default: 1215 default:
1199 return -EINVAL; 1216 return -EINVAL;
1200 } 1217 }
1201 1218
1202 if (r.arp_pa.sa_family != AF_INET) 1219 if (r.arp_pa.sa_family != AF_INET)
1203 return -EPFNOSUPPORT; 1220 return -EPFNOSUPPORT;
1204 1221
1205 if (!(r.arp_flags & ATF_PUBL) && 1222 if (!(r.arp_flags & ATF_PUBL) &&
1206 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) 1223 (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
1207 return -EINVAL; 1224 return -EINVAL;
1208 if (!(r.arp_flags & ATF_NETMASK)) 1225 if (!(r.arp_flags & ATF_NETMASK))
1209 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = 1226 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1211 rtnl_lock(); 1228 rtnl_lock();
1212 if (r.arp_dev[0]) { 1229 if (r.arp_dev[0]) {
1213 err = -ENODEV; 1230 err = -ENODEV;
1214 if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) 1231 dev = __dev_get_by_name(net, r.arp_dev);
1232 if (dev == NULL)
1215 goto out; 1233 goto out;
1216 1234
1217 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ 1235 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1243,7 +1261,8 @@ out:
1243 return err; 1261 return err;
1244} 1262}
1245 1263
1246static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 1264static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1265 void *ptr)
1247{ 1266{
1248 struct net_device *dev = ptr; 1267 struct net_device *dev = ptr;
1249 1268
@@ -1311,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
1311 for (n = 0, s = buf; n < 6; n++) { 1330 for (n = 0, s = buf; n < 6; n++) {
1312 c = (a->ax25_call[n] >> 1) & 0x7F; 1331 c = (a->ax25_call[n] >> 1) & 0x7F;
1313 1332
1314 if (c != ' ') *s++ = c; 1333 if (c != ' ')
1334 *s++ = c;
1315 } 1335 }
1316 1336
1317 *s++ = '-'; 1337 *s++ = '-';
1318 1338 n = (a->ax25_call[6] >> 1) & 0x0F;
1319 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { 1339 if (n > 9) {
1320 *s++ = '1'; 1340 *s++ = '1';
1321 n -= 10; 1341 n -= 10;
1322 } 1342 }
@@ -1325,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
1325 *s++ = '\0'; 1345 *s++ = '\0';
1326 1346
1327 if (*buf == '\0' || *buf == '-') 1347 if (*buf == '\0' || *buf == '-')
1328 return "*"; 1348 return "*";
1329 1349
1330 return buf; 1350 return buf;
1331
1332} 1351}
1333#endif /* CONFIG_AX25 */ 1352#endif /* CONFIG_AX25 */
1334 1353
@@ -1408,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1408/* ------------------------------------------------------------------------ */ 1427/* ------------------------------------------------------------------------ */
1409 1428
1410static const struct seq_operations arp_seq_ops = { 1429static const struct seq_operations arp_seq_ops = {
1411 .start = arp_seq_start, 1430 .start = arp_seq_start,
1412 .next = neigh_seq_next, 1431 .next = neigh_seq_next,
1413 .stop = neigh_seq_stop, 1432 .stop = neigh_seq_stop,
1414 .show = arp_seq_show, 1433 .show = arp_seq_show,
1415}; 1434};
1416 1435
1417static int arp_seq_open(struct inode *inode, struct file *file) 1436static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45c..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
73 inet->inet_id = jiffies; 73 inet->inet_id = jiffies;
74 74
75 sk_dst_set(sk, &rt->dst); 75 sk_dst_set(sk, &rt->dst);
76 return(0); 76 return 0;
77} 77}
78EXPORT_SYMBOL(ip4_datagram_connect); 78EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..dc94b0316b78 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
209 inet_free_ifa(ifa); 209 inet_free_ifa(ifa);
210 } 210 }
211 211
212 dev->ip_ptr = NULL; 212 rcu_assign_pointer(dev->ip_ptr, NULL);
213 213
214 devinet_sysctl_unregister(in_dev); 214 devinet_sysctl_unregister(in_dev);
215 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 215 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
403 return inet_insert_ifa(ifa); 403 return inet_insert_ifa(ifa);
404} 404}
405 405
406/* Caller must hold RCU or RTNL :
407 * We dont take a reference on found in_device
408 */
406struct in_device *inetdev_by_index(struct net *net, int ifindex) 409struct in_device *inetdev_by_index(struct net *net, int ifindex)
407{ 410{
408 struct net_device *dev; 411 struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
411 rcu_read_lock(); 414 rcu_read_lock();
412 dev = dev_get_by_index_rcu(net, ifindex); 415 dev = dev_get_by_index_rcu(net, ifindex);
413 if (dev) 416 if (dev)
414 in_dev = in_dev_get(dev); 417 in_dev = rcu_dereference_rtnl(dev->ip_ptr);
415 rcu_read_unlock(); 418 rcu_read_unlock();
416 return in_dev; 419 return in_dev;
417} 420}
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
453 goto errout; 456 goto errout;
454 } 457 }
455 458
456 __in_dev_put(in_dev);
457
458 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; 459 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
459 ifap = &ifa->ifa_next) { 460 ifap = &ifa->ifa_next) {
460 if (tb[IFA_LOCAL] && 461 if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1059 switch (event) { 1060 switch (event) {
1060 case NETDEV_REGISTER: 1061 case NETDEV_REGISTER:
1061 printk(KERN_DEBUG "inetdev_event: bug\n"); 1062 printk(KERN_DEBUG "inetdev_event: bug\n");
1062 dev->ip_ptr = NULL; 1063 rcu_assign_pointer(dev->ip_ptr, NULL);
1063 break; 1064 break;
1064 case NETDEV_UP: 1065 case NETDEV_UP:
1065 if (!inetdev_valid_mtu(dev->mtu)) 1066 if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999fa..36e27c2107de 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,35 +147,43 @@ static void fib_flush(struct net *net)
147 rt_cache_flush(net, -1); 147 rt_cache_flush(net, -1);
148} 148}
149 149
150/* 150/**
151 * Find the first device with a given source address. 151 * __ip_dev_find - find the first device with a given source address.
152 * @net: the net namespace
153 * @addr: the source address
154 * @devref: if true, take a reference on the found device
155 *
156 * If a caller uses devref=false, it should be protected by RCU, or RTNL
152 */ 157 */
153 158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
154struct net_device * ip_dev_find(struct net *net, __be32 addr)
155{ 159{
156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 160 struct flowi fl = {
157 struct fib_result res; 161 .nl_u = {
162 .ip4_u = {
163 .daddr = addr
164 }
165 },
166 .flags = FLOWI_FLAG_MATCH_ANY_IIF
167 };
168 struct fib_result res = { 0 };
158 struct net_device *dev = NULL; 169 struct net_device *dev = NULL;
159 struct fib_table *local_table;
160 170
161#ifdef CONFIG_IP_MULTIPLE_TABLES 171 rcu_read_lock();
162 res.r = NULL; 172 if (fib_lookup(net, &fl, &res)) {
163#endif 173 rcu_read_unlock();
164
165 local_table = fib_get_table(net, RT_TABLE_LOCAL);
166 if (!local_table || fib_table_lookup(local_table, &fl, &res))
167 return NULL; 174 return NULL;
175 }
168 if (res.type != RTN_LOCAL) 176 if (res.type != RTN_LOCAL)
169 goto out; 177 goto out;
170 dev = FIB_RES_DEV(res); 178 dev = FIB_RES_DEV(res);
171 179
172 if (dev) 180 if (dev && devref)
173 dev_hold(dev); 181 dev_hold(dev);
174out: 182out:
175 fib_res_put(&res); 183 rcu_read_unlock();
176 return dev; 184 return dev;
177} 185}
178EXPORT_SYMBOL(ip_dev_find); 186EXPORT_SYMBOL(__ip_dev_find);
179 187
180/* 188/*
181 * Find address type as if only "dev" was present in the system. If 189 * Find address type as if only "dev" was present in the system. If
@@ -202,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
202 local_table = fib_get_table(net, RT_TABLE_LOCAL); 210 local_table = fib_get_table(net, RT_TABLE_LOCAL);
203 if (local_table) { 211 if (local_table) {
204 ret = RTN_UNICAST; 212 ret = RTN_UNICAST;
205 if (!fib_table_lookup(local_table, &fl, &res)) { 213 rcu_read_lock();
214 if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
206 if (!dev || dev == res.fi->fib_dev) 215 if (!dev || dev == res.fi->fib_dev)
207 ret = res.type; 216 ret = res.type;
208 fib_res_put(&res);
209 } 217 }
218 rcu_read_unlock();
210 } 219 }
211 return ret; 220 return ret;
212} 221}
@@ -220,30 +229,34 @@ EXPORT_SYMBOL(inet_addr_type);
220unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 229unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
221 __be32 addr) 230 __be32 addr)
222{ 231{
223 return __inet_dev_addr_type(net, dev, addr); 232 return __inet_dev_addr_type(net, dev, addr);
224} 233}
225EXPORT_SYMBOL(inet_dev_addr_type); 234EXPORT_SYMBOL(inet_dev_addr_type);
226 235
227/* Given (packet source, input interface) and optional (dst, oif, tos): 236/* Given (packet source, input interface) and optional (dst, oif, tos):
228 - (main) check, that source is valid i.e. not broadcast or our local 237 * - (main) check, that source is valid i.e. not broadcast or our local
229 address. 238 * address.
230 - figure out what "logical" interface this packet arrived 239 * - figure out what "logical" interface this packet arrived
231 and calculate "specific destination" address. 240 * and calculate "specific destination" address.
232 - check, that packet arrived from expected physical interface. 241 * - check, that packet arrived from expected physical interface.
242 * called with rcu_read_lock()
233 */ 243 */
234
235int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 244int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
236 struct net_device *dev, __be32 *spec_dst, 245 struct net_device *dev, __be32 *spec_dst,
237 u32 *itag, u32 mark) 246 u32 *itag, u32 mark)
238{ 247{
239 struct in_device *in_dev; 248 struct in_device *in_dev;
240 struct flowi fl = { .nl_u = { .ip4_u = 249 struct flowi fl = {
241 { .daddr = src, 250 .nl_u = {
242 .saddr = dst, 251 .ip4_u = {
243 .tos = tos } }, 252 .daddr = src,
244 .mark = mark, 253 .saddr = dst,
245 .iif = oif }; 254 .tos = tos
246 255 }
256 },
257 .mark = mark,
258 .iif = oif
259 };
247 struct fib_result res; 260 struct fib_result res;
248 int no_addr, rpf, accept_local; 261 int no_addr, rpf, accept_local;
249 bool dev_match; 262 bool dev_match;
@@ -251,7 +264,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
251 struct net *net; 264 struct net *net;
252 265
253 no_addr = rpf = accept_local = 0; 266 no_addr = rpf = accept_local = 0;
254 rcu_read_lock();
255 in_dev = __in_dev_get_rcu(dev); 267 in_dev = __in_dev_get_rcu(dev);
256 if (in_dev) { 268 if (in_dev) {
257 no_addr = in_dev->ifa_list == NULL; 269 no_addr = in_dev->ifa_list == NULL;
@@ -260,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
260 if (mark && !IN_DEV_SRC_VMARK(in_dev)) 272 if (mark && !IN_DEV_SRC_VMARK(in_dev))
261 fl.mark = 0; 273 fl.mark = 0;
262 } 274 }
263 rcu_read_unlock();
264 275
265 if (in_dev == NULL) 276 if (in_dev == NULL)
266 goto e_inval; 277 goto e_inval;
@@ -270,7 +281,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
270 goto last_resort; 281 goto last_resort;
271 if (res.type != RTN_UNICAST) { 282 if (res.type != RTN_UNICAST) {
272 if (res.type != RTN_LOCAL || !accept_local) 283 if (res.type != RTN_LOCAL || !accept_local)
273 goto e_inval_res; 284 goto e_inval;
274 } 285 }
275 *spec_dst = FIB_RES_PREFSRC(res); 286 *spec_dst = FIB_RES_PREFSRC(res);
276 fib_combine_itag(itag, &res); 287 fib_combine_itag(itag, &res);
@@ -291,10 +302,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
291#endif 302#endif
292 if (dev_match) { 303 if (dev_match) {
293 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 304 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
294 fib_res_put(&res);
295 return ret; 305 return ret;
296 } 306 }
297 fib_res_put(&res);
298 if (no_addr) 307 if (no_addr)
299 goto last_resort; 308 goto last_resort;
300 if (rpf == 1) 309 if (rpf == 1)
@@ -307,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
307 *spec_dst = FIB_RES_PREFSRC(res); 316 *spec_dst = FIB_RES_PREFSRC(res);
308 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 317 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
309 } 318 }
310 fib_res_put(&res);
311 } 319 }
312 return ret; 320 return ret;
313 321
@@ -318,8 +326,6 @@ last_resort:
318 *itag = 0; 326 *itag = 0;
319 return 0; 327 return 0;
320 328
321e_inval_res:
322 fib_res_put(&res);
323e_inval: 329e_inval:
324 return -EINVAL; 330 return -EINVAL;
325e_rpf: 331e_rpf:
@@ -472,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
472} 478}
473 479
474/* 480/*
475 * Handle IP routing ioctl calls. These are used to manipulate the routing tables 481 * Handle IP routing ioctl calls.
482 * These are used to manipulate the routing tables
476 */ 483 */
477
478int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 484int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
479{ 485{
480 struct fib_config cfg; 486 struct fib_config cfg;
@@ -518,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
518 return -EINVAL; 524 return -EINVAL;
519} 525}
520 526
521const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { 527const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
522 [RTA_DST] = { .type = NLA_U32 }, 528 [RTA_DST] = { .type = NLA_U32 },
523 [RTA_SRC] = { .type = NLA_U32 }, 529 [RTA_SRC] = { .type = NLA_U32 },
524 [RTA_IIF] = { .type = NLA_U32 }, 530 [RTA_IIF] = { .type = NLA_U32 },
@@ -532,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
532}; 538};
533 539
534static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 540static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
535 struct nlmsghdr *nlh, struct fib_config *cfg) 541 struct nlmsghdr *nlh, struct fib_config *cfg)
536{ 542{
537 struct nlattr *attr; 543 struct nlattr *attr;
538 int err, remaining; 544 int err, remaining;
@@ -687,12 +693,11 @@ out:
687} 693}
688 694
689/* Prepare and feed intra-kernel routing request. 695/* Prepare and feed intra-kernel routing request.
690 Really, it should be netlink message, but :-( netlink 696 * Really, it should be netlink message, but :-( netlink
691 can be not configured, so that we feed it directly 697 * can be not configured, so that we feed it directly
692 to fib engine. It is legal, because all events occur 698 * to fib engine. It is legal, because all events occur
693 only when netlink is already locked. 699 * only when netlink is already locked.
694 */ 700 */
695
696static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 701static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
697{ 702{
698 struct net *net = dev_net(ifa->ifa_dev->dev); 703 struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -738,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
738 struct in_ifaddr *prim = ifa; 743 struct in_ifaddr *prim = ifa;
739 __be32 mask = ifa->ifa_mask; 744 __be32 mask = ifa->ifa_mask;
740 __be32 addr = ifa->ifa_local; 745 __be32 addr = ifa->ifa_local;
741 __be32 prefix = ifa->ifa_address&mask; 746 __be32 prefix = ifa->ifa_address & mask;
742 747
743 if (ifa->ifa_flags&IFA_F_SECONDARY) { 748 if (ifa->ifa_flags & IFA_F_SECONDARY) {
744 prim = inet_ifa_byprefix(in_dev, prefix, mask); 749 prim = inet_ifa_byprefix(in_dev, prefix, mask);
745 if (prim == NULL) { 750 if (prim == NULL) {
746 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); 751 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -750,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
750 755
751 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 756 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
752 757
753 if (!(dev->flags&IFF_UP)) 758 if (!(dev->flags & IFF_UP))
754 return; 759 return;
755 760
756 /* Add broadcast address, if it is explicitly assigned. */ 761 /* Add broadcast address, if it is explicitly assigned. */
757 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 762 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
758 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 763 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
759 764
760 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && 765 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
761 (prefix != addr || ifa->ifa_prefixlen < 32)) { 766 (prefix != addr || ifa->ifa_prefixlen < 32)) {
762 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 767 fib_magic(RTM_NEWROUTE,
763 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); 768 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
769 prefix, ifa->ifa_prefixlen, prim);
764 770
765 /* Add network specific broadcasts, when it takes a sense */ 771 /* Add network specific broadcasts, when it takes a sense */
766 if (ifa->ifa_prefixlen < 31) { 772 if (ifa->ifa_prefixlen < 31) {
767 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 773 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
768 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); 774 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
775 32, prim);
769 } 776 }
770 } 777 }
771} 778}
@@ -776,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
776 struct net_device *dev = in_dev->dev; 783 struct net_device *dev = in_dev->dev;
777 struct in_ifaddr *ifa1; 784 struct in_ifaddr *ifa1;
778 struct in_ifaddr *prim = ifa; 785 struct in_ifaddr *prim = ifa;
779 __be32 brd = ifa->ifa_address|~ifa->ifa_mask; 786 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
780 __be32 any = ifa->ifa_address&ifa->ifa_mask; 787 __be32 any = ifa->ifa_address & ifa->ifa_mask;
781#define LOCAL_OK 1 788#define LOCAL_OK 1
782#define BRD_OK 2 789#define BRD_OK 2
783#define BRD0_OK 4 790#define BRD0_OK 4
784#define BRD1_OK 8 791#define BRD1_OK 8
785 unsigned ok = 0; 792 unsigned ok = 0;
786 793
787 if (!(ifa->ifa_flags&IFA_F_SECONDARY)) 794 if (!(ifa->ifa_flags & IFA_F_SECONDARY))
788 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 795 fib_magic(RTM_DELROUTE,
789 RTN_UNICAST, any, ifa->ifa_prefixlen, prim); 796 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
797 any, ifa->ifa_prefixlen, prim);
790 else { 798 else {
791 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 799 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
792 if (prim == NULL) { 800 if (prim == NULL) {
@@ -796,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
796 } 804 }
797 805
798 /* Deletion is more complicated than add. 806 /* Deletion is more complicated than add.
799 We should take care of not to delete too much :-) 807 * We should take care of not to delete too much :-)
800 808 *
801 Scan address list to be sure that addresses are really gone. 809 * Scan address list to be sure that addresses are really gone.
802 */ 810 */
803 811
804 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 812 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -812,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
812 ok |= BRD0_OK; 820 ok |= BRD0_OK;
813 } 821 }
814 822
815 if (!(ok&BRD_OK)) 823 if (!(ok & BRD_OK))
816 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 824 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
817 if (!(ok&BRD1_OK)) 825 if (!(ok & BRD1_OK))
818 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 826 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
819 if (!(ok&BRD0_OK)) 827 if (!(ok & BRD0_OK))
820 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 828 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
821 if (!(ok&LOCAL_OK)) { 829 if (!(ok & LOCAL_OK)) {
822 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 830 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
823 831
824 /* Check, that this local address finally disappeared. */ 832 /* Check, that this local address finally disappeared. */
825 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 833 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
826 /* And the last, but not the least thing. 834 /* And the last, but not the least thing.
827 We must flush stray FIB entries. 835 * We must flush stray FIB entries.
828 836 *
829 First of all, we scan fib_info list searching 837 * First of all, we scan fib_info list searching
830 for stray nexthop entries, then ignite fib_flush. 838 * for stray nexthop entries, then ignite fib_flush.
831 */ 839 */
832 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 840 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
833 fib_flush(dev_net(dev)); 841 fib_flush(dev_net(dev));
834 } 842 }
@@ -839,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
839#undef BRD1_OK 847#undef BRD1_OK
840} 848}
841 849
842static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) 850static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
843{ 851{
844 852
845 struct fib_result res; 853 struct fib_result res;
846 struct flowi fl = { .mark = frn->fl_mark, 854 struct flowi fl = {
847 .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 855 .mark = frn->fl_mark,
848 .tos = frn->fl_tos, 856 .nl_u = {
849 .scope = frn->fl_scope } } }; 857 .ip4_u = {
858 .daddr = frn->fl_addr,
859 .tos = frn->fl_tos,
860 .scope = frn->fl_scope
861 }
862 }
863 };
850 864
851#ifdef CONFIG_IP_MULTIPLE_TABLES 865#ifdef CONFIG_IP_MULTIPLE_TABLES
852 res.r = NULL; 866 res.r = NULL;
@@ -857,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
857 local_bh_disable(); 871 local_bh_disable();
858 872
859 frn->tb_id = tb->tb_id; 873 frn->tb_id = tb->tb_id;
860 frn->err = fib_table_lookup(tb, &fl, &res); 874 rcu_read_lock();
875 frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
861 876
862 if (!frn->err) { 877 if (!frn->err) {
863 frn->prefixlen = res.prefixlen; 878 frn->prefixlen = res.prefixlen;
864 frn->nh_sel = res.nh_sel; 879 frn->nh_sel = res.nh_sel;
865 frn->type = res.type; 880 frn->type = res.type;
866 frn->scope = res.scope; 881 frn->scope = res.scope;
867 fib_res_put(&res);
868 } 882 }
883 rcu_read_unlock();
869 local_bh_enable(); 884 local_bh_enable();
870 } 885 }
871} 886}
@@ -894,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
894 909
895 nl_fib_lookup(frn, tb); 910 nl_fib_lookup(frn, tb);
896 911
897 pid = NETLINK_CB(skb).pid; /* pid of sending process */ 912 pid = NETLINK_CB(skb).pid; /* pid of sending process */
898 NETLINK_CB(skb).pid = 0; /* from kernel */ 913 NETLINK_CB(skb).pid = 0; /* from kernel */
899 NETLINK_CB(skb).dst_group = 0; /* unicast */ 914 NETLINK_CB(skb).dst_group = 0; /* unicast */
900 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 915 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
901} 916}
@@ -942,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
942 fib_del_ifaddr(ifa); 957 fib_del_ifaddr(ifa);
943 if (ifa->ifa_dev->ifa_list == NULL) { 958 if (ifa->ifa_dev->ifa_list == NULL) {
944 /* Last address was deleted from this interface. 959 /* Last address was deleted from this interface.
945 Disable IP. 960 * Disable IP.
946 */ 961 */
947 fib_disable_ip(dev, 1, 0); 962 fib_disable_ip(dev, 1, 0);
948 } else { 963 } else {
@@ -1001,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
1001static int __net_init ip_fib_net_init(struct net *net) 1016static int __net_init ip_fib_net_init(struct net *net)
1002{ 1017{
1003 int err; 1018 int err;
1004 unsigned int i; 1019 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1005 1020
1006 net->ipv4.fib_table_hash = kzalloc( 1021 /* Avoid false sharing : Use at least a full cache line */
1007 sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); 1022 size = max_t(size_t, size, L1_CACHE_BYTES);
1023
1024 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1008 if (net->ipv4.fib_table_hash == NULL) 1025 if (net->ipv4.fib_table_hash == NULL)
1009 return -ENOMEM; 1026 return -ENOMEM;
1010 1027
1011 for (i = 0; i < FIB_TABLE_HASHSZ; i++)
1012 INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
1013
1014 err = fib4_rules_init(net); 1028 err = fib4_rules_init(net);
1015 if (err < 0) 1029 if (err < 0)
1016 goto fail; 1030 goto fail;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1bc..43e1c594ce8f 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
54 struct fib_alias fn_embedded_alias; 54 struct fib_alias fn_embedded_alias;
55}; 55};
56 56
57struct fn_zone { 57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61 58
62 int fz_divisor; /* Hash divisor */ 59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */ 63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65 64
66 int fz_order; /* Zone order */ 65 u8 fz_order; /* Zone order (0..32) */
67 __be32 fz_mask; 66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask) 68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70 69
71/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask 70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
72 * can be cheaper than memory lookup, so that FZ_* macros are used. 71
73 */ 72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
74 75
75struct fn_hash { 76struct fn_hash {
76 struct fn_zone *fn_zones[33]; 77 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list; 78 struct fn_zone __rcu *fn_zone_list;
78}; 79};
79 80
80static inline u32 fn_hash(__be32 key, struct fn_zone *fz) 81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
81{ 82{
82 u32 h = ntohl(key)>>(32 - fz->fz_order); 83 u32 h = ntohl(key) >> fz->fz_revorder;
83 h ^= (h>>20); 84 h ^= (h>>20);
84 h ^= (h>>10); 85 h ^= (h>>10);
85 h ^= (h>>5); 86 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz); 87 h &= fz->fz_hashmask;
87 return h; 88 return h;
88} 89}
89 90
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92 return dst & FZ_MASK(fz); 93 return dst & FZ_MASK(fz);
93} 94}
94 95
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid; 96static unsigned int fib_hash_genid;
97 97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) 98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
101{ 101{
102 unsigned long size = divisor * sizeof(struct hlist_head); 102 unsigned long size = divisor * sizeof(struct hlist_head);
103 103
104 if (size <= PAGE_SIZE) { 104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL); 105 return kzalloc(size, GFP_KERNEL);
106 } else { 106
107 return (struct hlist_head *) 107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); 108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109 }
110} 109}
111 110
112/* The fib hash lock must be held when this is called. */ 111/* The fib hash lock must be held when this is called. */
@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
121 struct fib_node *f; 120 struct fib_node *f;
122 121
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { 122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head; 123 struct hlist_head __rcu *new_head;
125 124
126 hlist_del(&f->fn_hash); 125 hlist_del_rcu(&f->fn_hash);
127 126
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 127 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
129 hlist_add_head(&f->fn_hash, new_head); 128 hlist_add_head_rcu(&f->fn_hash, new_head);
130 } 129 }
131 } 130 }
132} 131}
@@ -147,14 +146,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
147 int old_divisor, new_divisor; 146 int old_divisor, new_divisor;
148 u32 new_hashmask; 147 u32 new_hashmask;
149 148
150 old_divisor = fz->fz_divisor; 149 new_divisor = old_divisor = fz->fz_divisor;
151 150
152 switch (old_divisor) { 151 switch (old_divisor) {
153 case 16: 152 case EMBEDDED_HASH_SIZE:
154 new_divisor = 256; 153 new_divisor *= EMBEDDED_HASH_SIZE;
155 break; 154 break;
156 case 256: 155 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor = 1024; 156 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break; 157 break;
159 default: 158 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) { 159 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -175,31 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
175 ht = fz_hash_alloc(new_divisor); 174 ht = fz_hash_alloc(new_divisor);
176 175
177 if (ht) { 176 if (ht) {
178 write_lock_bh(&fib_hash_lock); 177 struct fn_zone nfz;
178
179 memcpy(&nfz, fz, sizeof(nfz));
180
181 write_seqlock_bh(&fz->fz_lock);
179 old_ht = fz->fz_hash; 182 old_ht = fz->fz_hash;
180 fz->fz_hash = ht; 183 nfz.fz_hash = ht;
184 nfz.fz_hashmask = new_hashmask;
185 nfz.fz_divisor = new_divisor;
186 fn_rebuild_zone(&nfz, old_ht, old_divisor);
187 fib_hash_genid++;
188 rcu_assign_pointer(fz->fz_hash, ht);
181 fz->fz_hashmask = new_hashmask; 189 fz->fz_hashmask = new_hashmask;
182 fz->fz_divisor = new_divisor; 190 fz->fz_divisor = new_divisor;
183 fn_rebuild_zone(fz, old_ht, old_divisor); 191 write_sequnlock_bh(&fz->fz_lock);
184 fib_hash_genid++;
185 write_unlock_bh(&fib_hash_lock);
186 192
187 fz_hash_free(old_ht, old_divisor); 193 if (old_ht != fz->fz_embedded_hash) {
194 synchronize_rcu();
195 fz_hash_free(old_ht, old_divisor);
196 }
188 } 197 }
189} 198}
190 199
191static inline void fn_free_node(struct fib_node * f) 200static void fn_free_node_rcu(struct rcu_head *head)
192{ 201{
202 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
203
193 kmem_cache_free(fn_hash_kmem, f); 204 kmem_cache_free(fn_hash_kmem, f);
194} 205}
195 206
207static inline void fn_free_node(struct fib_node *f)
208{
209 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
210}
211
212static void fn_free_alias_rcu(struct rcu_head *head)
213{
214 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
215
216 kmem_cache_free(fn_alias_kmem, fa);
217}
218
196static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) 219static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
197{ 220{
198 fib_release_info(fa->fa_info); 221 fib_release_info(fa->fa_info);
199 if (fa == &f->fn_embedded_alias) 222 if (fa == &f->fn_embedded_alias)
200 fa->fa_info = NULL; 223 fa->fa_info = NULL;
201 else 224 else
202 kmem_cache_free(fn_alias_kmem, fa); 225 call_rcu(&fa->rcu, fn_free_alias_rcu);
203} 226}
204 227
205static struct fn_zone * 228static struct fn_zone *
@@ -210,68 +233,71 @@ fn_new_zone(struct fn_hash *table, int z)
210 if (!fz) 233 if (!fz)
211 return NULL; 234 return NULL;
212 235
213 if (z) { 236 seqlock_init(&fz->fz_lock);
214 fz->fz_divisor = 16; 237 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
215 } else { 238 fz->fz_hashmask = fz->fz_divisor - 1;
216 fz->fz_divisor = 1; 239 fz->fz_hash = fz->fz_embedded_hash;
217 }
218 fz->fz_hashmask = (fz->fz_divisor - 1);
219 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
220 if (!fz->fz_hash) {
221 kfree(fz);
222 return NULL;
223 }
224 fz->fz_order = z; 240 fz->fz_order = z;
241 fz->fz_revorder = 32 - z;
225 fz->fz_mask = inet_make_mask(z); 242 fz->fz_mask = inet_make_mask(z);
226 243
227 /* Find the first not empty zone with more specific mask */ 244 /* Find the first not empty zone with more specific mask */
228 for (i=z+1; i<=32; i++) 245 for (i = z + 1; i <= 32; i++)
229 if (table->fn_zones[i]) 246 if (table->fn_zones[i])
230 break; 247 break;
231 write_lock_bh(&fib_hash_lock); 248 if (i > 32) {
232 if (i>32) {
233 /* No more specific masks, we are the first. */ 249 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list; 250 rcu_assign_pointer(fz->fz_next,
235 table->fn_zone_list = fz; 251 rtnl_dereference(table->fn_zone_list));
252 rcu_assign_pointer(table->fn_zone_list, fz);
236 } else { 253 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next; 254 rcu_assign_pointer(fz->fz_next,
238 table->fn_zones[i]->fz_next = fz; 255 rtnl_dereference(table->fn_zones[i]->fz_next));
256 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
239 } 257 }
240 table->fn_zones[z] = fz; 258 table->fn_zones[z] = fz;
241 fib_hash_genid++; 259 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz; 260 return fz;
244} 261}
245 262
246int fib_table_lookup(struct fib_table *tb, 263int fib_table_lookup(struct fib_table *tb,
247 const struct flowi *flp, struct fib_result *res) 264 const struct flowi *flp, struct fib_result *res,
265 int fib_flags)
248{ 266{
249 int err; 267 int err;
250 struct fn_zone *fz; 268 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash *)tb->tb_data; 269 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
252 270
253 read_lock(&fib_hash_lock); 271 rcu_read_lock();
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { 272 for (fz = rcu_dereference(t->fn_zone_list);
255 struct hlist_head *head; 273 fz != NULL;
274 fz = rcu_dereference(fz->fz_next)) {
275 struct hlist_head __rcu *head;
256 struct hlist_node *node; 276 struct hlist_node *node;
257 struct fib_node *f; 277 struct fib_node *f;
258 __be32 k = fz_key(flp->fl4_dst, fz); 278 __be32 k;
279 unsigned int seq;
259 280
260 head = &fz->fz_hash[fn_hash(k, fz)]; 281 do {
261 hlist_for_each_entry(f, node, head, fn_hash) { 282 seq = read_seqbegin(&fz->fz_lock);
262 if (f->fn_key != k) 283 k = fz_key(flp->fl4_dst, fz);
263 continue; 284
285 head = &fz->fz_hash[fn_hash(k, fz)];
286 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
287 if (f->fn_key != k)
288 continue;
264 289
265 err = fib_semantic_match(&f->fn_alias, 290 err = fib_semantic_match(&f->fn_alias,
266 flp, res, 291 flp, res,
267 fz->fz_order); 292 fz->fz_order, fib_flags);
268 if (err <= 0) 293 if (err <= 0)
269 goto out; 294 goto out;
270 } 295 }
296 } while (read_seqretry(&fz->fz_lock, seq));
271 } 297 }
272 err = 1; 298 err = 1;
273out: 299out:
274 read_unlock(&fib_hash_lock); 300 rcu_read_unlock();
275 return err; 301 return err;
276} 302}
277 303
@@ -293,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
293 last_resort = NULL; 319 last_resort = NULL;
294 order = -1; 320 order = -1;
295 321
296 read_lock(&fib_hash_lock); 322 rcu_read_lock();
297 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { 323 hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
298 struct fib_alias *fa; 324 struct fib_alias *fa;
299 325
300 list_for_each_entry(fa, &f->fn_alias, fa_list) { 326 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
301 struct fib_info *next_fi = fa->fa_info; 327 struct fib_info *next_fi = fa->fa_info;
302 328
303 if (fa->fa_scope != res->scope || 329 if (fa->fa_scope != res->scope ||
@@ -309,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb,
309 if (!next_fi->fib_nh[0].nh_gw || 335 if (!next_fi->fib_nh[0].nh_gw ||
310 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 336 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
311 continue; 337 continue;
312 fa->fa_state |= FA_S_ACCESSED; 338
339 fib_alias_accessed(fa);
313 340
314 if (fi == NULL) { 341 if (fi == NULL) {
315 if (next_fi != res->fi) 342 if (next_fi != res->fi)
@@ -341,7 +368,7 @@ void fib_table_select_default(struct fib_table *tb,
341 fib_result_assign(res, last_resort); 368 fib_result_assign(res, last_resort);
342 tb->tb_default = last_idx; 369 tb->tb_default = last_idx;
343out: 370out:
344 read_unlock(&fib_hash_lock); 371 rcu_read_unlock();
345} 372}
346 373
347/* Insert node F to FZ. */ 374/* Insert node F to FZ. */
@@ -349,7 +376,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
349{ 376{
350 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 377 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
351 378
352 hlist_add_head(&f->fn_hash, head); 379 hlist_add_head_rcu(&f->fn_hash, head);
353} 380}
354 381
355/* Return the node in FZ matching KEY. */ 382/* Return the node in FZ matching KEY. */
@@ -359,7 +386,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
359 struct hlist_node *node; 386 struct hlist_node *node;
360 struct fib_node *f; 387 struct fib_node *f;
361 388
362 hlist_for_each_entry(f, node, head, fn_hash) { 389 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
363 if (f->fn_key == key) 390 if (f->fn_key == key)
364 return f; 391 return f;
365 } 392 }
@@ -367,6 +394,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
367 return NULL; 394 return NULL;
368} 395}
369 396
397
398static struct fib_alias *fib_fast_alloc(struct fib_node *f)
399{
400 struct fib_alias *fa = &f->fn_embedded_alias;
401
402 if (fa->fa_info != NULL)
403 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
404 return fa;
405}
406
407/* Caller must hold RTNL. */
370int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) 408int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
371{ 409{
372 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 410 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -451,7 +489,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
451 } 489 }
452 490
453 if (cfg->fc_nlflags & NLM_F_REPLACE) { 491 if (cfg->fc_nlflags & NLM_F_REPLACE) {
454 struct fib_info *fi_drop;
455 u8 state; 492 u8 state;
456 493
457 fa = fa_first; 494 fa = fa_first;
@@ -460,21 +497,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
460 err = 0; 497 err = 0;
461 goto out; 498 goto out;
462 } 499 }
463 write_lock_bh(&fib_hash_lock); 500 err = -ENOBUFS;
464 fi_drop = fa->fa_info; 501 new_fa = fib_fast_alloc(f);
465 fa->fa_info = fi; 502 if (new_fa == NULL)
466 fa->fa_type = cfg->fc_type; 503 goto out;
467 fa->fa_scope = cfg->fc_scope; 504
505 new_fa->fa_tos = fa->fa_tos;
506 new_fa->fa_info = fi;
507 new_fa->fa_type = cfg->fc_type;
508 new_fa->fa_scope = cfg->fc_scope;
468 state = fa->fa_state; 509 state = fa->fa_state;
469 fa->fa_state &= ~FA_S_ACCESSED; 510 new_fa->fa_state = state & ~FA_S_ACCESSED;
470 fib_hash_genid++; 511 fib_hash_genid++;
471 write_unlock_bh(&fib_hash_lock); 512 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
472 513
473 fib_release_info(fi_drop); 514 fn_free_alias(fa, f);
474 if (state & FA_S_ACCESSED) 515 if (state & FA_S_ACCESSED)
475 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 516 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
476 rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, 517 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
477 &cfg->fc_nlinfo, NLM_F_REPLACE); 518 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
478 return 0; 519 return 0;
479 } 520 }
480 521
@@ -506,12 +547,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
506 f = new_f; 547 f = new_f;
507 } 548 }
508 549
509 new_fa = &f->fn_embedded_alias; 550 new_fa = fib_fast_alloc(f);
510 if (new_fa->fa_info != NULL) { 551 if (new_fa == NULL)
511 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 552 goto out;
512 if (new_fa == NULL) 553
513 goto out;
514 }
515 new_fa->fa_info = fi; 554 new_fa->fa_info = fi;
516 new_fa->fa_tos = tos; 555 new_fa->fa_tos = tos;
517 new_fa->fa_type = cfg->fc_type; 556 new_fa->fa_type = cfg->fc_type;
@@ -522,13 +561,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
522 * Insert new entry to the list. 561 * Insert new entry to the list.
523 */ 562 */
524 563
525 write_lock_bh(&fib_hash_lock);
526 if (new_f) 564 if (new_f)
527 fib_insert_node(fz, new_f); 565 fib_insert_node(fz, new_f);
528 list_add_tail(&new_fa->fa_list, 566 list_add_tail_rcu(&new_fa->fa_list,
529 (fa ? &fa->fa_list : &f->fn_alias)); 567 (fa ? &fa->fa_list : &f->fn_alias));
530 fib_hash_genid++; 568 fib_hash_genid++;
531 write_unlock_bh(&fib_hash_lock);
532 569
533 if (new_f) 570 if (new_f)
534 fz->fz_nent++; 571 fz->fz_nent++;
@@ -603,14 +640,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
603 tb->tb_id, &cfg->fc_nlinfo, 0); 640 tb->tb_id, &cfg->fc_nlinfo, 0);
604 641
605 kill_fn = 0; 642 kill_fn = 0;
606 write_lock_bh(&fib_hash_lock); 643 list_del_rcu(&fa->fa_list);
607 list_del(&fa->fa_list);
608 if (list_empty(&f->fn_alias)) { 644 if (list_empty(&f->fn_alias)) {
609 hlist_del(&f->fn_hash); 645 hlist_del_rcu(&f->fn_hash);
610 kill_fn = 1; 646 kill_fn = 1;
611 } 647 }
612 fib_hash_genid++; 648 fib_hash_genid++;
613 write_unlock_bh(&fib_hash_lock);
614 649
615 if (fa->fa_state & FA_S_ACCESSED) 650 if (fa->fa_state & FA_S_ACCESSED)
616 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 651 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -641,14 +676,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
641 struct fib_info *fi = fa->fa_info; 676 struct fib_info *fi = fa->fa_info;
642 677
643 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 678 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
644 write_lock_bh(&fib_hash_lock); 679 list_del_rcu(&fa->fa_list);
645 list_del(&fa->fa_list);
646 if (list_empty(&f->fn_alias)) { 680 if (list_empty(&f->fn_alias)) {
647 hlist_del(&f->fn_hash); 681 hlist_del_rcu(&f->fn_hash);
648 kill_f = 1; 682 kill_f = 1;
649 } 683 }
650 fib_hash_genid++; 684 fib_hash_genid++;
651 write_unlock_bh(&fib_hash_lock);
652 685
653 fn_free_alias(fa, f); 686 fn_free_alias(fa, f);
654 found++; 687 found++;
@@ -662,13 +695,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
662 return found; 695 return found;
663} 696}
664 697
698/* caller must hold RTNL. */
665int fib_table_flush(struct fib_table *tb) 699int fib_table_flush(struct fib_table *tb)
666{ 700{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 701 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz; 702 struct fn_zone *fz;
669 int found = 0; 703 int found = 0;
670 704
671 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { 705 for (fz = rtnl_dereference(table->fn_zone_list);
706 fz != NULL;
707 fz = rtnl_dereference(fz->fz_next)) {
672 int i; 708 int i;
673 709
674 for (i = fz->fz_divisor - 1; i >= 0; i--) 710 for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -690,10 +726,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
690 726
691 s_i = cb->args[4]; 727 s_i = cb->args[4];
692 i = 0; 728 i = 0;
693 hlist_for_each_entry(f, node, head, fn_hash) { 729 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
694 struct fib_alias *fa; 730 struct fib_alias *fa;
695 731
696 list_for_each_entry(fa, &f->fn_alias, fa_list) { 732 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
697 if (i < s_i) 733 if (i < s_i)
698 goto next; 734 goto next;
699 735
@@ -711,7 +747,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
711 cb->args[4] = i; 747 cb->args[4] = i;
712 return -1; 748 return -1;
713 } 749 }
714 next: 750next:
715 i++; 751 i++;
716 } 752 }
717 } 753 }
@@ -746,23 +782,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
746int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 782int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
747 struct netlink_callback *cb) 783 struct netlink_callback *cb)
748{ 784{
749 int m, s_m; 785 int m = 0, s_m;
750 struct fn_zone *fz; 786 struct fn_zone *fz;
751 struct fn_hash *table = (struct fn_hash *)tb->tb_data; 787 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
752 788
753 s_m = cb->args[2]; 789 s_m = cb->args[2];
754 read_lock(&fib_hash_lock); 790 rcu_read_lock();
755 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { 791 for (fz = rcu_dereference(table->fn_zone_list);
756 if (m < s_m) continue; 792 fz != NULL;
793 fz = rcu_dereference(fz->fz_next), m++) {
794 if (m < s_m)
795 continue;
757 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { 796 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
758 cb->args[2] = m; 797 cb->args[2] = m;
759 read_unlock(&fib_hash_lock); 798 rcu_read_unlock();
760 return -1; 799 return -1;
761 } 800 }
762 memset(&cb->args[3], 0, 801 memset(&cb->args[3], 0,
763 sizeof(cb->args) - 3*sizeof(cb->args[0])); 802 sizeof(cb->args) - 3*sizeof(cb->args[0]));
764 } 803 }
765 read_unlock(&fib_hash_lock); 804 rcu_read_unlock();
766 cb->args[2] = m; 805 cb->args[2] = m;
767 return skb->len; 806 return skb->len;
768} 807}
@@ -825,8 +864,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
825 iter->genid = fib_hash_genid; 864 iter->genid = fib_hash_genid;
826 iter->valid = 1; 865 iter->valid = 1;
827 866
828 for (iter->zone = table->fn_zone_list; iter->zone; 867 for (iter->zone = rcu_dereference(table->fn_zone_list);
829 iter->zone = iter->zone->fz_next) { 868 iter->zone != NULL;
869 iter->zone = rcu_dereference(iter->zone->fz_next)) {
830 int maxslot; 870 int maxslot;
831 871
832 if (!iter->zone->fz_nent) 872 if (!iter->zone->fz_nent)
@@ -911,7 +951,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
911 } 951 }
912 } 952 }
913 953
914 iter->zone = iter->zone->fz_next; 954 iter->zone = rcu_dereference(iter->zone->fz_next);
915 955
916 if (!iter->zone) 956 if (!iter->zone)
917 goto out; 957 goto out;
@@ -950,11 +990,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
950} 990}
951 991
952static void *fib_seq_start(struct seq_file *seq, loff_t *pos) 992static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
953 __acquires(fib_hash_lock) 993 __acquires(RCU)
954{ 994{
955 void *v = NULL; 995 void *v = NULL;
956 996
957 read_lock(&fib_hash_lock); 997 rcu_read_lock();
958 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) 998 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
959 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 999 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
960 return v; 1000 return v;
@@ -967,15 +1007,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
967} 1007}
968 1008
969static void fib_seq_stop(struct seq_file *seq, void *v) 1009static void fib_seq_stop(struct seq_file *seq, void *v)
970 __releases(fib_hash_lock) 1010 __releases(RCU)
971{ 1011{
972 read_unlock(&fib_hash_lock); 1012 rcu_read_unlock();
973} 1013}
974 1014
975static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) 1015static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
976{ 1016{
977 static const unsigned type2flags[RTN_MAX + 1] = { 1017 static const unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT, 1018 [7] = RTF_REJECT,
1019 [8] = RTF_REJECT,
979 }; 1020 };
980 unsigned flags = type2flags[type]; 1021 unsigned flags = type2flags[type];
981 1022
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..a29edf2219c8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope; 13 u8 fa_scope;
14 u8 fa_state; 14 u8 fa_state;
15#ifdef CONFIG_IP_FIB_TRIE
16 struct rcu_head rcu; 15 struct rcu_head rcu;
17#endif
18}; 16};
19 17
20#define FA_S_ACCESSED 0x01 18#define FA_S_ACCESSED 0x01
21 19
20/* Dont write on fa_state unless needed, to keep it shared on all cpus */
21static inline void fib_alias_accessed(struct fib_alias *fa)
22{
23 if (!(fa->fa_state & FA_S_ACCESSED))
24 fa->fa_state |= FA_S_ACCESSED;
25}
26
22/* Exported by fib_semantics.c */ 27/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head, 28extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp, 29 const struct flowi *flp,
25 struct fib_result *res, int prefixlen); 30 struct fib_result *res, int prefixlen, int fib_flags);
26extern void fib_release_info(struct fib_info *); 31extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 32extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
6 * IPv4 Forwarding Information Base: policy rules. 6 * IPv4 Forwarding Information Base: policy rules.
7 * 7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Thomas Graf <tgraf@suug.ch> 9 * Thomas Graf <tgraf@suug.ch>
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 * 15 *
16 * Fixes: 16 * Fixes:
17 * Rani Assaf : local_rule cannot be deleted 17 * Rani Assaf : local_rule cannot be deleted
18 * Marc Boucher : routing by fwmark 18 * Marc Boucher : routing by fwmark
19 */ 19 */
20 20
@@ -32,8 +32,7 @@
32#include <net/ip_fib.h> 32#include <net/ip_fib.h>
33#include <net/fib_rules.h> 33#include <net/fib_rules.h>
34 34
35struct fib4_rule 35struct fib4_rule {
36{
37 struct fib_rule common; 36 struct fib_rule common;
38 u8 dst_len; 37 u8 dst_len;
39 u8 src_len; 38 u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
58{ 57{
59 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
60 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
61 }; 61 };
62 int err; 62 int err;
63 63
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
91 goto errout; 91 goto errout;
92 } 92 }
93 93
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 tbl = fib_get_table(rule->fr_net, rule->table);
95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..3e0da3ef6116 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 60
61static DEFINE_SPINLOCK(fib_multipath_lock); 61static DEFINE_SPINLOCK(fib_multipath_lock);
62 62
63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { \
64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 int nhsel; const struct fib_nh *nh; \
65 65 for (nhsel = 0, nh = (fi)->fib_nh; \
66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ 66 nhsel < (fi)->fib_nhs; \
67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) 67 nh++, nhsel++)
68
69#define change_nexthops(fi) { \
70 int nhsel; struct fib_nh *nexthop_nh; \
71 for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
72 nhsel < (fi)->fib_nhs; \
73 nexthop_nh++, nhsel++)
68 74
69#else /* CONFIG_IP_ROUTE_MULTIPATH */ 75#else /* CONFIG_IP_ROUTE_MULTIPATH */
70 76
71/* Hope, that gcc will optimize it to get rid of dummy loop */ 77/* Hope, that gcc will optimize it to get rid of dummy loop */
72 78
73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 79#define for_nexthops(fi) { \
74for (nhsel=0; nhsel < 1; nhsel++) 80 int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
81 for (nhsel = 0; nhsel < 1; nhsel++)
75 82
76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ 83#define change_nexthops(fi) { \
77for (nhsel=0; nhsel < 1; nhsel++) 84 int nhsel; \
85 struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
86 for (nhsel = 0; nhsel < 1; nhsel++)
78 87
79#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 88#endif /* CONFIG_IP_ROUTE_MULTIPATH */
80 89
@@ -86,63 +95,70 @@ static const struct
86 int error; 95 int error;
87 u8 scope; 96 u8 scope;
88} fib_props[RTN_MAX + 1] = { 97} fib_props[RTN_MAX + 1] = {
89 { 98 [RTN_UNSPEC] = {
90 .error = 0, 99 .error = 0,
91 .scope = RT_SCOPE_NOWHERE, 100 .scope = RT_SCOPE_NOWHERE,
92 }, /* RTN_UNSPEC */ 101 },
93 { 102 [RTN_UNICAST] = {
94 .error = 0, 103 .error = 0,
95 .scope = RT_SCOPE_UNIVERSE, 104 .scope = RT_SCOPE_UNIVERSE,
96 }, /* RTN_UNICAST */ 105 },
97 { 106 [RTN_LOCAL] = {
98 .error = 0, 107 .error = 0,
99 .scope = RT_SCOPE_HOST, 108 .scope = RT_SCOPE_HOST,
100 }, /* RTN_LOCAL */ 109 },
101 { 110 [RTN_BROADCAST] = {
102 .error = 0, 111 .error = 0,
103 .scope = RT_SCOPE_LINK, 112 .scope = RT_SCOPE_LINK,
104 }, /* RTN_BROADCAST */ 113 },
105 { 114 [RTN_ANYCAST] = {
106 .error = 0, 115 .error = 0,
107 .scope = RT_SCOPE_LINK, 116 .scope = RT_SCOPE_LINK,
108 }, /* RTN_ANYCAST */ 117 },
109 { 118 [RTN_MULTICAST] = {
110 .error = 0, 119 .error = 0,
111 .scope = RT_SCOPE_UNIVERSE, 120 .scope = RT_SCOPE_UNIVERSE,
112 }, /* RTN_MULTICAST */ 121 },
113 { 122 [RTN_BLACKHOLE] = {
114 .error = -EINVAL, 123 .error = -EINVAL,
115 .scope = RT_SCOPE_UNIVERSE, 124 .scope = RT_SCOPE_UNIVERSE,
116 }, /* RTN_BLACKHOLE */ 125 },
117 { 126 [RTN_UNREACHABLE] = {
118 .error = -EHOSTUNREACH, 127 .error = -EHOSTUNREACH,
119 .scope = RT_SCOPE_UNIVERSE, 128 .scope = RT_SCOPE_UNIVERSE,
120 }, /* RTN_UNREACHABLE */ 129 },
121 { 130 [RTN_PROHIBIT] = {
122 .error = -EACCES, 131 .error = -EACCES,
123 .scope = RT_SCOPE_UNIVERSE, 132 .scope = RT_SCOPE_UNIVERSE,
124 }, /* RTN_PROHIBIT */ 133 },
125 { 134 [RTN_THROW] = {
126 .error = -EAGAIN, 135 .error = -EAGAIN,
127 .scope = RT_SCOPE_UNIVERSE, 136 .scope = RT_SCOPE_UNIVERSE,
128 }, /* RTN_THROW */ 137 },
129 { 138 [RTN_NAT] = {
130 .error = -EINVAL, 139 .error = -EINVAL,
131 .scope = RT_SCOPE_NOWHERE, 140 .scope = RT_SCOPE_NOWHERE,
132 }, /* RTN_NAT */ 141 },
133 { 142 [RTN_XRESOLVE] = {
134 .error = -EINVAL, 143 .error = -EINVAL,
135 .scope = RT_SCOPE_NOWHERE, 144 .scope = RT_SCOPE_NOWHERE,
136 }, /* RTN_XRESOLVE */ 145 },
137}; 146};
138 147
139 148
140/* Release a nexthop info record */ 149/* Release a nexthop info record */
141 150
151static void free_fib_info_rcu(struct rcu_head *head)
152{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154
155 kfree(fi);
156}
157
142void free_fib_info(struct fib_info *fi) 158void free_fib_info(struct fib_info *fi)
143{ 159{
144 if (fi->fib_dead == 0) { 160 if (fi->fib_dead == 0) {
145 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 161 pr_warning("Freeing alive fib_info %p\n", fi);
146 return; 162 return;
147 } 163 }
148 change_nexthops(fi) { 164 change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
152 } endfor_nexthops(fi); 168 } endfor_nexthops(fi);
153 fib_info_cnt--; 169 fib_info_cnt--;
154 release_net(fi->fib_net); 170 release_net(fi->fib_net);
155 kfree(fi); 171 call_rcu(&fi->rcu, free_fib_info_rcu);
156} 172}
157 173
158void fib_release_info(struct fib_info *fi) 174void fib_release_info(struct fib_info *fi)
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
173 spin_unlock_bh(&fib_info_lock); 189 spin_unlock_bh(&fib_info_lock);
174} 190}
175 191
176static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 192static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177{ 193{
178 const struct fib_nh *onh = ofi->fib_nh; 194 const struct fib_nh *onh = ofi->fib_nh;
179 195
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
187#ifdef CONFIG_NET_CLS_ROUTE 203#ifdef CONFIG_NET_CLS_ROUTE
188 nh->nh_tclassid != onh->nh_tclassid || 204 nh->nh_tclassid != onh->nh_tclassid ||
189#endif 205#endif
190 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
191 return -1; 207 return -1;
192 onh++; 208 onh++;
193 } endfor_nexthops(fi); 209 } endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
238 nfi->fib_priority == fi->fib_priority && 254 nfi->fib_priority == fi->fib_priority &&
239 memcmp(nfi->fib_metrics, fi->fib_metrics, 255 memcmp(nfi->fib_metrics, fi->fib_metrics,
240 sizeof(fi->fib_metrics)) == 0 && 256 sizeof(fi->fib_metrics)) == 0 &&
241 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 257 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
242 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 258 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
243 return fi; 259 return fi;
244 } 260 }
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
247} 263}
248 264
249/* Check, that the gateway is already configured. 265/* Check, that the gateway is already configured.
250 Used only by redirect accept routine. 266 * Used only by redirect accept routine.
251 */ 267 */
252
253int ip_fib_check_default(__be32 gw, struct net_device *dev) 268int ip_fib_check_default(__be32 gw, struct net_device *dev)
254{ 269{
255 struct hlist_head *head; 270 struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
264 hlist_for_each_entry(nh, node, head, nh_hash) { 279 hlist_for_each_entry(nh, node, head, nh_hash) {
265 if (nh->nh_dev == dev && 280 if (nh->nh_dev == dev &&
266 nh->nh_gw == gw && 281 nh->nh_gw == gw &&
267 !(nh->nh_flags&RTNH_F_DEAD)) { 282 !(nh->nh_flags & RTNH_F_DEAD)) {
268 spin_unlock(&fib_info_lock); 283 spin_unlock(&fib_info_lock);
269 return 0; 284 return 0;
270 } 285 }
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
362 } 377 }
363 if (state == NUD_REACHABLE) 378 if (state == NUD_REACHABLE)
364 return 0; 379 return 0;
365 if ((state&NUD_VALID) && order != dflt) 380 if ((state & NUD_VALID) && order != dflt)
366 return 0; 381 return 0;
367 if ((state&NUD_VALID) || 382 if ((state & NUD_VALID) ||
368 (*last_idx<0 && order > dflt)) { 383 (*last_idx < 0 && order > dflt)) {
369 *last_resort = fi; 384 *last_resort = fi;
370 *last_idx = order; 385 *last_idx = order;
371 } 386 }
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 491
477 492
478/* 493/*
479 Picture 494 * Picture
480 ------- 495 * -------
481 496 *
482 Semantics of nexthop is very messy by historical reasons. 497 * Semantics of nexthop is very messy by historical reasons.
483 We have to take into account, that: 498 * We have to take into account, that:
484 a) gateway can be actually local interface address, 499 * a) gateway can be actually local interface address,
485 so that gatewayed route is direct. 500 * so that gatewayed route is direct.
486 b) gateway must be on-link address, possibly 501 * b) gateway must be on-link address, possibly
487 described not by an ifaddr, but also by a direct route. 502 * described not by an ifaddr, but also by a direct route.
488 c) If both gateway and interface are specified, they should not 503 * c) If both gateway and interface are specified, they should not
489 contradict. 504 * contradict.
490 d) If we use tunnel routes, gateway could be not on-link. 505 * d) If we use tunnel routes, gateway could be not on-link.
491 506 *
492 Attempt to reconcile all of these (alas, self-contradictory) conditions 507 * Attempt to reconcile all of these (alas, self-contradictory) conditions
493 results in pretty ugly and hairy code with obscure logic. 508 * results in pretty ugly and hairy code with obscure logic.
494 509 *
495 I chose to generalized it instead, so that the size 510 * I chose to generalized it instead, so that the size
496 of code does not increase practically, but it becomes 511 * of code does not increase practically, but it becomes
497 much more general. 512 * much more general.
498 Every prefix is assigned a "scope" value: "host" is local address, 513 * Every prefix is assigned a "scope" value: "host" is local address,
499 "link" is direct route, 514 * "link" is direct route,
500 [ ... "site" ... "interior" ... ] 515 * [ ... "site" ... "interior" ... ]
501 and "universe" is true gateway route with global meaning. 516 * and "universe" is true gateway route with global meaning.
502 517 *
503 Every prefix refers to a set of "nexthop"s (gw, oif), 518 * Every prefix refers to a set of "nexthop"s (gw, oif),
504 where gw must have narrower scope. This recursion stops 519 * where gw must have narrower scope. This recursion stops
505 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 520 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
506 which means that gw is forced to be on link. 521 * which means that gw is forced to be on link.
507 522 *
508 Code is still hairy, but now it is apparently logically 523 * Code is still hairy, but now it is apparently logically
509 consistent and very flexible. F.e. as by-product it allows 524 * consistent and very flexible. F.e. as by-product it allows
510 to co-exists in peace independent exterior and interior 525 * to co-exists in peace independent exterior and interior
511 routing processes. 526 * routing processes.
512 527 *
513 Normally it looks as following. 528 * Normally it looks as following.
514 529 *
515 {universe prefix} -> (gw, oif) [scope link] 530 * {universe prefix} -> (gw, oif) [scope link]
516 | 531 * |
517 |-> {link prefix} -> (gw, oif) [scope local] 532 * |-> {link prefix} -> (gw, oif) [scope local]
518 | 533 * |
519 |-> {local prefix} (terminal node) 534 * |-> {local prefix} (terminal node)
520 */ 535 */
521
522static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 536static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
523 struct fib_nh *nh) 537 struct fib_nh *nh)
524{ 538{
525 int err; 539 int err;
526 struct net *net; 540 struct net *net;
541 struct net_device *dev;
527 542
528 net = cfg->fc_nlinfo.nl_net; 543 net = cfg->fc_nlinfo.nl_net;
529 if (nh->nh_gw) { 544 if (nh->nh_gw) {
530 struct fib_result res; 545 struct fib_result res;
531 546
532 if (nh->nh_flags&RTNH_F_ONLINK) { 547 if (nh->nh_flags & RTNH_F_ONLINK) {
533 struct net_device *dev;
534 548
535 if (cfg->fc_scope >= RT_SCOPE_LINK) 549 if (cfg->fc_scope >= RT_SCOPE_LINK)
536 return -EINVAL; 550 return -EINVAL;
537 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 551 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
538 return -EINVAL; 552 return -EINVAL;
539 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 553 dev = __dev_get_by_index(net, nh->nh_oif);
554 if (!dev)
540 return -ENODEV; 555 return -ENODEV;
541 if (!(dev->flags&IFF_UP)) 556 if (!(dev->flags & IFF_UP))
542 return -ENETDOWN; 557 return -ENETDOWN;
543 nh->nh_dev = dev; 558 nh->nh_dev = dev;
544 dev_hold(dev); 559 dev_hold(dev);
545 nh->nh_scope = RT_SCOPE_LINK; 560 nh->nh_scope = RT_SCOPE_LINK;
546 return 0; 561 return 0;
547 } 562 }
563 rcu_read_lock();
548 { 564 {
549 struct flowi fl = { 565 struct flowi fl = {
550 .nl_u = { 566 .nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
559 /* It is not necessary, but requires a bit of thinking */ 575 /* It is not necessary, but requires a bit of thinking */
560 if (fl.fl4_scope < RT_SCOPE_LINK) 576 if (fl.fl4_scope < RT_SCOPE_LINK)
561 fl.fl4_scope = RT_SCOPE_LINK; 577 fl.fl4_scope = RT_SCOPE_LINK;
562 if ((err = fib_lookup(net, &fl, &res)) != 0) 578 err = fib_lookup(net, &fl, &res);
579 if (err) {
580 rcu_read_unlock();
563 return err; 581 return err;
582 }
564 } 583 }
565 err = -EINVAL; 584 err = -EINVAL;
566 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 585 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
567 goto out; 586 goto out;
568 nh->nh_scope = res.scope; 587 nh->nh_scope = res.scope;
569 nh->nh_oif = FIB_RES_OIF(res); 588 nh->nh_oif = FIB_RES_OIF(res);
570 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 589 nh->nh_dev = dev = FIB_RES_DEV(res);
590 if (!dev)
571 goto out; 591 goto out;
572 dev_hold(nh->nh_dev); 592 dev_hold(dev);
573 err = -ENETDOWN; 593 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
574 if (!(nh->nh_dev->flags & IFF_UP))
575 goto out;
576 err = 0;
577out:
578 fib_res_put(&res);
579 return err;
580 } else { 594 } else {
581 struct in_device *in_dev; 595 struct in_device *in_dev;
582 596
583 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 597 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
584 return -EINVAL; 598 return -EINVAL;
585 599
600 rcu_read_lock();
601 err = -ENODEV;
586 in_dev = inetdev_by_index(net, nh->nh_oif); 602 in_dev = inetdev_by_index(net, nh->nh_oif);
587 if (in_dev == NULL) 603 if (in_dev == NULL)
588 return -ENODEV; 604 goto out;
589 if (!(in_dev->dev->flags&IFF_UP)) { 605 err = -ENETDOWN;
590 in_dev_put(in_dev); 606 if (!(in_dev->dev->flags & IFF_UP))
591 return -ENETDOWN; 607 goto out;
592 }
593 nh->nh_dev = in_dev->dev; 608 nh->nh_dev = in_dev->dev;
594 dev_hold(nh->nh_dev); 609 dev_hold(nh->nh_dev);
595 nh->nh_scope = RT_SCOPE_HOST; 610 nh->nh_scope = RT_SCOPE_HOST;
596 in_dev_put(in_dev); 611 err = 0;
597 } 612 }
598 return 0; 613out:
614 rcu_read_unlock();
615 return err;
599} 616}
600 617
601static inline unsigned int fib_laddr_hashfn(__be32 val) 618static inline unsigned int fib_laddr_hashfn(__be32 val)
602{ 619{
603 unsigned int mask = (fib_hash_size - 1); 620 unsigned int mask = (fib_hash_size - 1);
604 621
605 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 622 return ((__force u32)val ^
623 ((__force u32)val >> 7) ^
624 ((__force u32)val >> 14)) & mask;
606} 625}
607 626
608static struct hlist_head *fib_hash_alloc(int bytes) 627static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
611 return kzalloc(bytes, GFP_KERNEL); 630 return kzalloc(bytes, GFP_KERNEL);
612 else 631 else
613 return (struct hlist_head *) 632 return (struct hlist_head *)
614 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 633 __get_free_pages(GFP_KERNEL | __GFP_ZERO,
634 get_order(bytes));
615} 635}
616 636
617static void fib_hash_free(struct hlist_head *hash, int bytes) 637static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
806 goto failure; 826 goto failure;
807 } else { 827 } else {
808 change_nexthops(fi) { 828 change_nexthops(fi) {
809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) 829 err = fib_check_nh(cfg, fi, nexthop_nh);
830 if (err != 0)
810 goto failure; 831 goto failure;
811 } endfor_nexthops(fi) 832 } endfor_nexthops(fi)
812 } 833 }
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
819 } 840 }
820 841
821link_it: 842link_it:
822 if ((ofi = fib_find_info(fi)) != NULL) { 843 ofi = fib_find_info(fi);
844 if (ofi) {
823 fi->fib_dead = 1; 845 fi->fib_dead = 1;
824 free_fib_info(fi); 846 free_fib_info(fi);
825 ofi->fib_treeref++; 847 ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
864 886
865/* Note! fib_semantic_match intentionally uses RCU list functions. */ 887/* Note! fib_semantic_match intentionally uses RCU list functions. */
866int fib_semantic_match(struct list_head *head, const struct flowi *flp, 888int fib_semantic_match(struct list_head *head, const struct flowi *flp,
867 struct fib_result *res, int prefixlen) 889 struct fib_result *res, int prefixlen, int fib_flags)
868{ 890{
869 struct fib_alias *fa; 891 struct fib_alias *fa;
870 int nh_sel = 0; 892 int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
879 if (fa->fa_scope < flp->fl4_scope) 901 if (fa->fa_scope < flp->fl4_scope)
880 continue; 902 continue;
881 903
882 fa->fa_state |= FA_S_ACCESSED; 904 fib_alias_accessed(fa);
883 905
884 err = fib_props[fa->fa_type].error; 906 err = fib_props[fa->fa_type].error;
885 if (err == 0) { 907 if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
895 case RTN_ANYCAST: 917 case RTN_ANYCAST:
896 case RTN_MULTICAST: 918 case RTN_MULTICAST:
897 for_nexthops(fi) { 919 for_nexthops(fi) {
898 if (nh->nh_flags&RTNH_F_DEAD) 920 if (nh->nh_flags & RTNH_F_DEAD)
899 continue; 921 continue;
900 if (!flp->oif || flp->oif == nh->nh_oif) 922 if (!flp->oif || flp->oif == nh->nh_oif)
901 break; 923 break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
906 goto out_fill_res; 928 goto out_fill_res;
907 } 929 }
908#else 930#else
909 if (nhsel < 1) { 931 if (nhsel < 1)
910 goto out_fill_res; 932 goto out_fill_res;
911 }
912#endif 933#endif
913 endfor_nexthops(fi); 934 endfor_nexthops(fi);
914 continue; 935 continue;
915 936
916 default: 937 default:
917 printk(KERN_WARNING "fib_semantic_match bad type %#x\n", 938 pr_warning("fib_semantic_match bad type %#x\n",
918 fa->fa_type); 939 fa->fa_type);
919 return -EINVAL; 940 return -EINVAL;
920 } 941 }
921 } 942 }
@@ -929,7 +950,8 @@ out_fill_res:
929 res->type = fa->fa_type; 950 res->type = fa->fa_type;
930 res->scope = fa->fa_scope; 951 res->scope = fa->fa_scope;
931 res->fi = fa->fa_info; 952 res->fi = fa->fa_info;
932 atomic_inc(&res->fi->fib_clntref); 953 if (!(fib_flags & FIB_LOOKUP_NOREF))
954 atomic_inc(&res->fi->fib_clntref);
933 return 0; 955 return 0;
934} 956}
935 957
@@ -1028,10 +1050,10 @@ nla_put_failure:
1028} 1050}
1029 1051
1030/* 1052/*
1031 Update FIB if: 1053 * Update FIB if:
1032 - local address disappeared -> we must delete all the entries 1054 * - local address disappeared -> we must delete all the entries
1033 referring to it. 1055 * referring to it.
1034 - device went down -> we must shutdown all nexthops going via it. 1056 * - device went down -> we must shutdown all nexthops going via it.
1035 */ 1057 */
1036int fib_sync_down_addr(struct net *net, __be32 local) 1058int fib_sync_down_addr(struct net *net, __be32 local)
1037{ 1059{
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1078 prev_fi = fi; 1100 prev_fi = fi;
1079 dead = 0; 1101 dead = 0;
1080 change_nexthops(fi) { 1102 change_nexthops(fi) {
1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD) 1103 if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1082 dead++; 1104 dead++;
1083 else if (nexthop_nh->nh_dev == dev && 1105 else if (nexthop_nh->nh_dev == dev &&
1084 nexthop_nh->nh_scope != scope) { 1106 nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1110#ifdef CONFIG_IP_ROUTE_MULTIPATH 1132#ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 1133
1112/* 1134/*
1113 Dead device goes up. We wake up dead nexthops. 1135 * Dead device goes up. We wake up dead nexthops.
1114 It takes sense only on multipath routes. 1136 * It takes sense only on multipath routes.
1115 */ 1137 */
1116
1117int fib_sync_up(struct net_device *dev) 1138int fib_sync_up(struct net_device *dev)
1118{ 1139{
1119 struct fib_info *prev_fi; 1140 struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
1123 struct fib_nh *nh; 1144 struct fib_nh *nh;
1124 int ret; 1145 int ret;
1125 1146
1126 if (!(dev->flags&IFF_UP)) 1147 if (!(dev->flags & IFF_UP))
1127 return 0; 1148 return 0;
1128 1149
1129 prev_fi = NULL; 1150 prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
1142 prev_fi = fi; 1163 prev_fi = fi;
1143 alive = 0; 1164 alive = 0;
1144 change_nexthops(fi) { 1165 change_nexthops(fi) {
1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1166 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1146 alive++; 1167 alive++;
1147 continue; 1168 continue;
1148 } 1169 }
1149 if (nexthop_nh->nh_dev == NULL || 1170 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP)) 1171 !(nexthop_nh->nh_dev->flags & IFF_UP))
1151 continue; 1172 continue;
1152 if (nexthop_nh->nh_dev != dev || 1173 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev)) 1174 !__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
1169} 1190}
1170 1191
1171/* 1192/*
1172 The algorithm is suboptimal, but it provides really 1193 * The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution. 1194 * fair weighted route distribution.
1174 */ 1195 */
1175
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1196void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{ 1197{
1178 struct fib_info *fi = res->fi; 1198 struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1202 if (fi->fib_power <= 0) {
1183 int power = 0; 1203 int power = 0;
1184 change_nexthops(fi) { 1204 change_nexthops(fi) {
1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1205 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1186 power += nexthop_nh->nh_weight; 1206 power += nexthop_nh->nh_weight;
1187 nexthop_nh->nh_power = nexthop_nh->nh_weight; 1207 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1208 }
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1198 1218
1199 1219
1200 /* w should be random number [0..fi->fib_power-1], 1220 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation. 1221 * it is pretty bad approximation.
1202 */ 1222 */
1203 1223
1204 w = jiffies % fi->fib_power; 1224 w = jiffies % fi->fib_power;
1205 1225
1206 change_nexthops(fi) { 1226 change_nexthops(fi) {
1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && 1227 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1208 nexthop_nh->nh_power) { 1228 nexthop_nh->nh_power) {
1209 if ((w -= nexthop_nh->nh_power) <= 0) { 1229 w -= nexthop_nh->nh_power;
1230 if (w <= 0) {
1210 nexthop_nh->nh_power--; 1231 nexthop_nh->nh_power--;
1211 fi->fib_power--; 1232 fi->fib_power--;
1212 res->nh_sel = nhsel; 1233 res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a8e370862bc..cd5e13aee7d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,9 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
189 return rcu_dereference_check(ret, 189 return rcu_dereference_rtnl(ret);
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
192} 190}
193 191
194/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
@@ -211,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
211{ 209{
212 struct node *ret = tnode_get_child(tn, i); 210 struct node *ret = tnode_get_child(tn, i);
213 211
214 return rcu_dereference_check(ret, 212 return rcu_dereference_rtnl(ret);
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
217} 213}
218 214
219static inline int tnode_child_length(const struct tnode *tn) 215static inline int tnode_child_length(const struct tnode *tn)
@@ -459,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
459 tn->empty_children = 1<<bits; 455 tn->empty_children = 1<<bits;
460 } 456 }
461 457
462 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
463 (unsigned long) (sizeof(struct node) << bits)); 459 sizeof(struct node) << bits);
464 return tn; 460 return tn;
465} 461}
466 462
@@ -609,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
609 605
610 /* Keep root node larger */ 606 /* Keep root node larger */
611 607
612 if (!node_parent((struct node*) tn)) { 608 if (!node_parent((struct node *)tn)) {
613 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
614 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
615 } 611 } else {
616 else {
617 inflate_threshold_use = inflate_threshold; 612 inflate_threshold_use = inflate_threshold;
618 halve_threshold_use = halve_threshold; 613 halve_threshold_use = halve_threshold;
619 } 614 }
@@ -639,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
639 check_tnode(tn); 634 check_tnode(tn);
640 635
641 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
642 if( max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
643 return (struct node *) tn; 638 return (struct node *) tn;
644 639
645 /* 640 /*
@@ -966,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
966 struct node *n; 961 struct node *n;
967 962
968 pos = 0; 963 pos = 0;
969 n = rcu_dereference_check(t->trie, 964 n = rcu_dereference_rtnl(t->trie);
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
972 965
973 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 966 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
974 tn = (struct tnode *) n; 967 tn = (struct tnode *) n;
@@ -1349,7 +1342,7 @@ err:
1349/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1350static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct trie *t, struct leaf *l,
1351 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi *flp,
1352 struct fib_result *res) 1345 struct fib_result *res, int fib_flags)
1353{ 1346{
1354 struct leaf_info *li; 1347 struct leaf_info *li;
1355 struct hlist_head *hhead = &l->list; 1348 struct hlist_head *hhead = &l->list;
@@ -1363,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1363 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1364 continue; 1357 continue;
1365 1358
1366 err = fib_semantic_match(&li->falh, flp, res, plen); 1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
1367 1360
1368#ifdef CONFIG_IP_FIB_TRIE_STATS 1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1369 if (err <= 0) 1362 if (err <= 0)
@@ -1379,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1379} 1372}
1380 1373
1381int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1382 struct fib_result *res) 1375 struct fib_result *res, int fib_flags)
1383{ 1376{
1384 struct trie *t = (struct trie *) tb->tb_data; 1377 struct trie *t = (struct trie *) tb->tb_data;
1385 int ret; 1378 int ret;
@@ -1391,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1391 t_key cindex = 0; 1384 t_key cindex = 0;
1392 int current_prefix_length = KEYLENGTH; 1385 int current_prefix_length = KEYLENGTH;
1393 struct tnode *cn; 1386 struct tnode *cn;
1394 t_key node_prefix, key_prefix, pref_mismatch; 1387 t_key pref_mismatch;
1395 int mp;
1396 1388
1397 rcu_read_lock(); 1389 rcu_read_lock();
1398 1390
@@ -1406,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1406 1398
1407 /* Just a leaf? */ 1399 /* Just a leaf? */
1408 if (IS_LEAF(n)) { 1400 if (IS_LEAF(n)) {
1409 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1410 goto found; 1402 goto found;
1411 } 1403 }
1412 1404
@@ -1431,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1431 } 1423 }
1432 1424
1433 if (IS_LEAF(n)) { 1425 if (IS_LEAF(n)) {
1434 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1435 if (ret > 0) 1427 if (ret > 0)
1436 goto backtrace; 1428 goto backtrace;
1437 goto found; 1429 goto found;
@@ -1507,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1507 * matching prefix. 1499 * matching prefix.
1508 */ 1500 */
1509 1501
1510 node_prefix = mask_pfx(cn->key, cn->pos); 1502 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1511 key_prefix = mask_pfx(key, cn->pos);
1512 pref_mismatch = key_prefix^node_prefix;
1513 mp = 0;
1514 1503
1515 /* 1504 /*
1516 * In short: If skipped bits in this node do not match 1505 * In short: If skipped bits in this node do not match
@@ -1518,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1518 * state.directly. 1507 * state.directly.
1519 */ 1508 */
1520 if (pref_mismatch) { 1509 if (pref_mismatch) {
1521 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1510 int mp = KEYLENGTH - fls(pref_mismatch);
1522 mp++;
1523 pref_mismatch = pref_mismatch << 1;
1524 }
1525 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526 1511
1527 if (key_prefix != 0) 1512 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1528 goto backtrace; 1513 goto backtrace;
1529 1514
1530 if (current_prefix_length >= cn->pos) 1515 if (current_prefix_length >= cn->pos)
@@ -1748,16 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1748 1733
1749 /* Node empty, walk back up to parent */ 1734 /* Node empty, walk back up to parent */
1750 c = (struct node *) p; 1735 c = (struct node *) p;
1751 } while ( (p = node_parent_rcu(c)) != NULL); 1736 } while ((p = node_parent_rcu(c)) != NULL);
1752 1737
1753 return NULL; /* Root of trie */ 1738 return NULL; /* Root of trie */
1754} 1739}
1755 1740
1756static struct leaf *trie_firstleaf(struct trie *t) 1741static struct leaf *trie_firstleaf(struct trie *t)
1757{ 1742{
1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, 1743 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1761 1744
1762 if (!n) 1745 if (!n)
1763 return NULL; 1746 return NULL;
@@ -1855,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb,
1855 if (!next_fi->fib_nh[0].nh_gw || 1838 if (!next_fi->fib_nh[0].nh_gw ||
1856 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1839 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1857 continue; 1840 continue;
1858 fa->fa_state |= FA_S_ACCESSED; 1841
1842 fib_alias_accessed(fa);
1859 1843
1860 if (fi == NULL) { 1844 if (fi == NULL) {
1861 if (next_fi != res->fi) 1845 if (next_fi != res->fi)
@@ -2043,14 +2027,14 @@ struct fib_trie_iter {
2043 struct seq_net_private p; 2027 struct seq_net_private p;
2044 struct fib_table *tb; 2028 struct fib_table *tb;
2045 struct tnode *tnode; 2029 struct tnode *tnode;
2046 unsigned index; 2030 unsigned int index;
2047 unsigned depth; 2031 unsigned int depth;
2048}; 2032};
2049 2033
2050static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2034static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2051{ 2035{
2052 struct tnode *tn = iter->tnode; 2036 struct tnode *tn = iter->tnode;
2053 unsigned cindex = iter->index; 2037 unsigned int cindex = iter->index;
2054 struct tnode *p; 2038 struct tnode *p;
2055 2039
2056 /* A single entry routing table */ 2040 /* A single entry routing table */
@@ -2159,7 +2143,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2159 */ 2143 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2144static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161{ 2145{
2162 unsigned i, max, pointers, bytes, avdepth; 2146 unsigned int i, max, pointers, bytes, avdepth;
2163 2147
2164 if (stat->leaves) 2148 if (stat->leaves)
2165 avdepth = stat->totdepth*100 / stat->leaves; 2149 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2356,7 +2340,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2356 2340
2357static void seq_indent(struct seq_file *seq, int n) 2341static void seq_indent(struct seq_file *seq, int n)
2358{ 2342{
2359 while (n-- > 0) seq_puts(seq, " "); 2343 while (n-- > 0)
2344 seq_puts(seq, " ");
2360} 2345}
2361 2346
2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2347static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2373,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
2388 [RTN_XRESOLVE] = "XRESOLVE", 2373 [RTN_XRESOLVE] = "XRESOLVE",
2389}; 2374};
2390 2375
2391static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2376static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2392{ 2377{
2393 if (t < __RTN_MAX && rtn_type_names[t]) 2378 if (t < __RTN_MAX && rtn_type_names[t])
2394 return rtn_type_names[t]; 2379 return rtn_type_names[t];
@@ -2544,13 +2529,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2544 rcu_read_unlock(); 2529 rcu_read_unlock();
2545} 2530}
2546 2531
2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2532static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548{ 2533{
2549 static unsigned type2flags[RTN_MAX + 1] = { 2534 unsigned int flags = 0;
2550 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551 };
2552 unsigned flags = type2flags[type];
2553 2535
2536 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2537 flags = RTF_REJECT;
2554 if (fi && fi->fib_nh->nh_gw) 2538 if (fi && fi->fib_nh->nh_gw)
2555 flags |= RTF_GATEWAY; 2539 flags |= RTF_GATEWAY;
2556 if (mask == htonl(0xFFFFFFFF)) 2540 if (mask == htonl(0xFFFFFFFF))
@@ -2562,7 +2546,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2562/* 2546/*
2563 * This outputs /proc/net/route. 2547 * This outputs /proc/net/route.
2564 * The format of the file is not supposed to be changed 2548 * The format of the file is not supposed to be changed
2565 * and needs to be same as fib_hash output to avoid breaking 2549 * and needs to be same as fib_hash output to avoid breaking
2566 * legacy utilities 2550 * legacy utilities
2567 */ 2551 */
2568static int fib_route_seq_show(struct seq_file *seq, void *v) 2552static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2587,7 +2571,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2587 2571
2588 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2572 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589 const struct fib_info *fi = fa->fa_info; 2573 const struct fib_info *fi = fa->fa_info;
2590 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2574 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2591 int len; 2575 int len;
2592 2576
2593 if (fa->fa_type == RTN_BROADCAST 2577 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..caea6885fdbd
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,151 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
/*
 * Registered GRE handlers, indexed by the version value taken from the
 * packet (see gre_rcv()/gre_err()).  Lookups go through rcu_dereference()
 * under rcu_read_lock(); updates are serialized by gre_proto_lock and
 * published with rcu_assign_pointer().
 */
25static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (gre_proto[version] != proto)
55 goto err_out_unlock;
56 rcu_assign_pointer(gre_proto[version], NULL);
57 spin_unlock(&gre_proto_lock);
58 synchronize_rcu();
59 return 0;
60
61err_out_unlock:
62 spin_unlock(&gre_proto_lock);
63err_out:
64 return -1;
65}
66EXPORT_SYMBOL_GPL(gre_del_protocol);
67
68static int gre_rcv(struct sk_buff *skb)
69{
70 const struct gre_protocol *proto;
71 u8 ver;
72 int ret;
73
74 if (!pskb_may_pull(skb, 12))
75 goto drop;
76
77 ver = skb->data[1]&0x7f;
78 if (ver >= GREPROTO_MAX)
79 goto drop;
80
81 rcu_read_lock();
82 proto = rcu_dereference(gre_proto[ver]);
83 if (!proto || !proto->handler)
84 goto drop_unlock;
85 ret = proto->handler(skb);
86 rcu_read_unlock();
87 return ret;
88
89drop_unlock:
90 rcu_read_unlock();
91drop:
92 kfree_skb(skb);
93 return NET_RX_DROP;
94}
95
96static void gre_err(struct sk_buff *skb, u32 info)
97{
98 const struct gre_protocol *proto;
99 u8 ver;
100
101 if (!pskb_may_pull(skb, 12))
102 goto drop;
103
104 ver = skb->data[1]&0x7f;
105 if (ver >= GREPROTO_MAX)
106 goto drop;
107
108 rcu_read_lock();
109 proto = rcu_dereference(gre_proto[ver]);
110 if (!proto || !proto->err_handler)
111 goto drop_unlock;
112 proto->err_handler(skb, info);
113 rcu_read_unlock();
114 return;
115
116drop_unlock:
117 rcu_read_unlock();
118drop:
119 kfree_skb(skb);
120}
121
/*
 * Protocol hooks registered for IPPROTO_GRE by gre_init(): gre_rcv() for
 * incoming packets and gre_err() for ICMP errors.
 * NOTE(review): netns_ok presumably marks the protocol as usable outside
 * the initial network namespace - confirm against struct net_protocol.
 */
122static const struct net_protocol net_gre_protocol = {
123 .handler = gre_rcv,
124 .err_handler = gre_err,
125 .netns_ok = 1,
126};
127
128static int __init gre_init(void)
129{
130 pr_info("GRE over IPv4 demultiplexor driver");
131
132 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
133 pr_err("gre: can't add protocol\n");
134 return -EAGAIN;
135 }
136
137 return 0;
138}
139
/*
 * Module exit: remove the IPPROTO_GRE hooks installed by gre_init().
 * The return value of inet_del_protocol() is not checked.
 */
140static void __exit gre_exit(void)
141{
142 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
143}
144
145module_init(gre_init);
146module_exit(gre_exit);
147
148MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
149MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
150MODULE_LICENSE("GPL");
151
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..96bc7f9475a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
379 inet->tos = ip_hdr(skb)->tos; 379 inet->tos = ip_hdr(skb)->tos;
380 daddr = ipc.addr = rt->rt_src; 380 daddr = ipc.addr = rt->rt_src;
381 ipc.opt = NULL; 381 ipc.opt = NULL;
382 ipc.shtx.flags = 0; 382 ipc.tx_flags = 0;
383 if (icmp_param->replyopts.optlen) { 383 if (icmp_param->replyopts.optlen) {
384 ipc.opt = &icmp_param->replyopts; 384 ipc.opt = &icmp_param->replyopts;
385 if (ipc.opt->srr) 385 if (ipc.opt->srr)
@@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
538 inet_sk(sk)->tos = tos; 538 inet_sk(sk)->tos = tos;
539 ipc.addr = iph->saddr; 539 ipc.addr = iph->saddr;
540 ipc.opt = &icmp_param.replyopts; 540 ipc.opt = &icmp_param.replyopts;
541 ipc.shtx.flags = 0; 541 ipc.tx_flags = 0;
542 542
543 { 543 {
544 struct flowi fl = { 544 struct flowi fl = {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2a4bb76f2132..c8877c6c7216 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1269,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
1269 if (im->multiaddr == IGMP_ALL_HOSTS) 1269 if (im->multiaddr == IGMP_ALL_HOSTS)
1270 return; 1270 return;
1271 1271
1272 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { 1272 /* a failover is happening and switches
1273 igmp_mod_timer(im, IGMP_Initial_Report_Delay); 1273 * must be notified immediately */
1274 return; 1274 if (IGMP_V1_SEEN(in_dev))
1275 } 1275 igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
1276 /* else, v3 */ 1276 else if (IGMP_V2_SEEN(in_dev))
1277 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 1277 igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
1278 IGMP_Unsolicited_Report_Count; 1278 else
1279 igmp_ifc_event(in_dev); 1279 igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
1280#endif 1280#endif
1281} 1281}
1282EXPORT_SYMBOL(ip_mc_rejoin_group); 1282EXPORT_SYMBOL(ip_mc_rejoin_group);
@@ -1418,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1418 write_unlock_bh(&in_dev->mc_list_lock); 1418 write_unlock_bh(&in_dev->mc_list_lock);
1419} 1419}
1420 1420
1421/* RTNL is locked */
1421static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1422static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1422{ 1423{
1423 struct flowi fl = { .nl_u = { .ip4_u = 1424 struct flowi fl = { .nl_u = { .ip4_u =
@@ -1428,15 +1429,12 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1428 1429
1429 if (imr->imr_ifindex) { 1430 if (imr->imr_ifindex) {
1430 idev = inetdev_by_index(net, imr->imr_ifindex); 1431 idev = inetdev_by_index(net, imr->imr_ifindex);
1431 if (idev)
1432 __in_dev_put(idev);
1433 return idev; 1432 return idev;
1434 } 1433 }
1435 if (imr->imr_address.s_addr) { 1434 if (imr->imr_address.s_addr) {
1436 dev = ip_dev_find(net, imr->imr_address.s_addr); 1435 dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
1437 if (!dev) 1436 if (!dev)
1438 return NULL; 1437 return NULL;
1439 dev_put(dev);
1440 } 1438 }
1441 1439
1442 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1440 if (!dev && !ip_route_output_key(net, &rt, &fl)) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..ba8042665849 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
425 bc += op->no; 425 bc += op->no;
426 } 426 }
427 } 427 }
428 return (len == 0); 428 return len == 0;
429} 429}
430 430
431static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff3..1b344f30b463 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk)
101} 101}
102EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
103 103
104void __inet_inherit_port(struct sock *sk, struct sock *child) 104int __inet_inherit_port(struct sock *sk, struct sock *child)
105{ 105{
106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
107 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, 107 unsigned short port = inet_sk(child)->inet_num;
108 const int bhash = inet_bhashfn(sock_net(sk), port,
108 table->bhash_size); 109 table->bhash_size);
109 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
110 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
111 112
112 spin_lock(&head->lock); 113 spin_lock(&head->lock);
113 tb = inet_csk(sk)->icsk_bind_hash; 114 tb = inet_csk(sk)->icsk_bind_hash;
115 if (tb->port != port) {
116 /* NOTE: using tproxy and redirecting skbs to a proxy
117 * on a different listener port breaks the assumption
118 * that the listener socket's icsk_bind_hash is the same
119 * as that of the child socket. We have to look up or
120 * create a new bind bucket for the child here. */
121 struct hlist_node *node;
122 inet_bind_bucket_for_each(tb, node, &head->chain) {
123 if (net_eq(ib_net(tb), sock_net(sk)) &&
124 tb->port == port)
125 break;
126 }
127 if (!node) {
128 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
129 sock_net(sk), head, port);
130 if (!tb) {
131 spin_unlock(&head->lock);
132 return -ENOMEM;
133 }
134 }
135 }
114 sk_add_bind_node(child, &tb->owners); 136 sk_add_bind_node(child, &tb->owners);
115 inet_csk(child)->icsk_bind_hash = tb; 137 inet_csk(child)->icsk_bind_hash = tb;
116 spin_unlock(&head->lock); 138 spin_unlock(&head->lock);
139
140 return 0;
117} 141}
118EXPORT_SYMBOL_GPL(__inet_inherit_port); 142EXPORT_SYMBOL_GPL(__inet_inherit_port);
119 143
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..168440834ade 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
116 struct ip4_create_arg *arg = a; 116 struct ip4_create_arg *arg = a;
117 117
118 qp = container_of(q, struct ipq, q); 118 qp = container_of(q, struct ipq, q);
119 return (qp->id == arg->iph->id && 119 return qp->id == arg->iph->id &&
120 qp->saddr == arg->iph->saddr && 120 qp->saddr == arg->iph->saddr &&
121 qp->daddr == arg->iph->daddr && 121 qp->daddr == arg->iph->daddr &&
122 qp->protocol == arg->iph->protocol && 122 qp->protocol == arg->iph->protocol &&
123 qp->user == arg->user); 123 qp->user == arg->user;
124} 124}
125 125
126/* Memory Tracking Functions. */ 126/* Memory Tracking Functions. */
@@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
542 /* If the first fragment is fragmented itself, we split 542 /* If the first fragment is fragmented itself, we split
543 * it to two chunks: the first with data and paged part 543 * it to two chunks: the first with data and paged part
544 * and the second, holding only fragments. */ 544 * and the second, holding only fragments. */
545 if (skb_has_frags(head)) { 545 if (skb_has_frag_list(head)) {
546 struct sk_buff *clone; 546 struct sk_buff *clone;
547 int i, plen = 0; 547 int i, plen = 0;
548 548
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 35c93e8b6a46..d0ffcbe369b7 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
47 48
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 50#include <net/ipv6.h>
@@ -63,13 +64,13 @@
63 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best 67 and silently drop packet when it expires. It is a good
67 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaing new variable in ALL
68 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
69 70
70 Current solution: HARD_TX_LOCK lock breaks dead loops. 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
71 72 counter, since when we enter the first ndo_xmit(), cpu migration is
72 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
73 74
74 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 129
129static int ipgre_net_id __read_mostly; 130static int ipgre_net_id __read_mostly;
130struct ipgre_net { 131struct ipgre_net {
131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
132 133
133 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
134}; 135};
@@ -158,13 +159,40 @@ struct ipgre_net {
158#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
159#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
160/* 161/*
161 * Locking : hash tables are protected by RCU and a spinlock 162 * Locking : hash tables are protected by RCU and RTNL
162 */ 163 */
163static DEFINE_SPINLOCK(ipgre_lock);
164 164
165#define for_each_ip_tunnel_rcu(start) \ 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168/* often modified stats are per cpu, other are shared (netdev->stats) */
169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
195
168/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
169 197
170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
173{ 201{
174 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
175 int link = dev->ifindex; 203 int link = dev->ifindex;
176 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
177 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
289 return NULL; 317 return NULL;
290} 318}
291 319
292static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
294{ 322{
295 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
298 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
299 int prio = 0; 327 int prio = 0;
300 328
301 if (local) 329 if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
308 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
309} 337}
310 338
311static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312 struct ip_tunnel *t) 340 struct ip_tunnel *t)
313{ 341{
314 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
316 344
317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318{ 346{
319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320 348
321 spin_lock_bh(&ipgre_lock); 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322 t->next = *tp;
323 rcu_assign_pointer(*tp, t); 350 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
325} 351}
326 352
327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328{ 354{
329 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
330 356 struct ip_tunnel *iter;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
332 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
333 spin_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
334 *tp = t->next; 360 tp = &iter->next) {
335 spin_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
336 break; 363 break;
337 } 364 }
338 } 365 }
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
346 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
348 int link = parms->link; 375 int link = parms->link;
349 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 379
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
353 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
360 return t; 390 return t;
361} 391}
362 392
363static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
365{ 395{
366 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 612 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583 iph->saddr, iph->daddr, key, 613 iph->saddr, iph->daddr, key,
584 gre_proto))) { 614 gre_proto))) {
585 struct net_device_stats *stats = &tunnel->dev->stats; 615 struct pcpu_tstats *tstats;
586 616
587 secpath_reset(skb); 617 secpath_reset(skb);
588 618
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
606 /* Looped back packet, drop it! */ 636 /* Looped back packet, drop it! */
607 if (skb_rtable(skb)->fl.iif == 0) 637 if (skb_rtable(skb)->fl.iif == 0)
608 goto drop; 638 goto drop;
609 stats->multicast++; 639 tunnel->dev->stats.multicast++;
610 skb->pkt_type = PACKET_BROADCAST; 640 skb->pkt_type = PACKET_BROADCAST;
611 } 641 }
612#endif 642#endif
613 643
614 if (((flags&GRE_CSUM) && csum) || 644 if (((flags&GRE_CSUM) && csum) ||
615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 645 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616 stats->rx_crc_errors++; 646 tunnel->dev->stats.rx_crc_errors++;
617 stats->rx_errors++; 647 tunnel->dev->stats.rx_errors++;
618 goto drop; 648 goto drop;
619 } 649 }
620 if (tunnel->parms.i_flags&GRE_SEQ) { 650 if (tunnel->parms.i_flags&GRE_SEQ) {
621 if (!(flags&GRE_SEQ) || 651 if (!(flags&GRE_SEQ) ||
622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 652 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623 stats->rx_fifo_errors++; 653 tunnel->dev->stats.rx_fifo_errors++;
624 stats->rx_errors++; 654 tunnel->dev->stats.rx_errors++;
625 goto drop; 655 goto drop;
626 } 656 }
627 tunnel->i_seqno = seqno + 1; 657 tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
630 /* Warning: All skb pointers will be invalidated! */ 660 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 661 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 662 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 663 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 664 tunnel->dev->stats.rx_errors++;
635 goto drop; 665 goto drop;
636 } 666 }
637 667
@@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 670 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 671 }
642 672
643 skb_tunnel_rx(skb, tunnel->dev); 673 tstats = this_cpu_ptr(tunnel->dev->tstats);
674 tstats->rx_packets++;
675 tstats->rx_bytes += skb->len;
676
677 __skb_tunnel_rx(skb, tunnel->dev);
644 678
645 skb_reset_network_header(skb); 679 skb_reset_network_header(skb);
646 ipgre_ecn_decapsulate(iph, skb); 680 ipgre_ecn_decapsulate(iph, skb);
647 681
648 netif_rx(skb); 682 netif_rx(skb);
683
649 rcu_read_unlock(); 684 rcu_read_unlock();
650 return(0); 685 return 0;
651 } 686 }
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 687 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653 688
@@ -655,20 +690,19 @@ drop:
655 rcu_read_unlock(); 690 rcu_read_unlock();
656drop_nolock: 691drop_nolock:
657 kfree_skb(skb); 692 kfree_skb(skb);
658 return(0); 693 return 0;
659} 694}
660 695
661static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 696static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662{ 697{
663 struct ip_tunnel *tunnel = netdev_priv(dev); 698 struct ip_tunnel *tunnel = netdev_priv(dev);
664 struct net_device_stats *stats = &dev->stats; 699 struct pcpu_tstats *tstats;
665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
666 struct iphdr *old_iph = ip_hdr(skb); 700 struct iphdr *old_iph = ip_hdr(skb);
667 struct iphdr *tiph; 701 struct iphdr *tiph;
668 u8 tos; 702 u8 tos;
669 __be16 df; 703 __be16 df;
670 struct rtable *rt; /* Route to the other host */ 704 struct rtable *rt; /* Route to the other host */
671 struct net_device *tdev; /* Device to other host */ 705 struct net_device *tdev; /* Device to other host */
672 struct iphdr *iph; /* Our new IP header */ 706 struct iphdr *iph; /* Our new IP header */
673 unsigned int max_headroom; /* The extra header space needed */ 707 unsigned int max_headroom; /* The extra header space needed */
674 int gre_hlen; 708 int gre_hlen;
@@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
690 /* NBMA tunnel */ 724 /* NBMA tunnel */
691 725
692 if (skb_dst(skb) == NULL) { 726 if (skb_dst(skb) == NULL) {
693 stats->tx_fifo_errors++; 727 dev->stats.tx_fifo_errors++;
694 goto tx_error; 728 goto tx_error;
695 } 729 }
696 730
@@ -736,14 +770,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
736 } 770 }
737 771
738 { 772 {
739 struct flowi fl = { .oif = tunnel->parms.link, 773 struct flowi fl = {
740 .nl_u = { .ip4_u = 774 .oif = tunnel->parms.link,
741 { .daddr = dst, 775 .nl_u = {
742 .saddr = tiph->saddr, 776 .ip4_u = {
743 .tos = RT_TOS(tos) } }, 777 .daddr = dst,
744 .proto = IPPROTO_GRE }; 778 .saddr = tiph->saddr,
779 .tos = RT_TOS(tos)
780 }
781 },
782 .proto = IPPROTO_GRE
783 }
784;
745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 785 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746 stats->tx_carrier_errors++; 786 dev->stats.tx_carrier_errors++;
747 goto tx_error; 787 goto tx_error;
748 } 788 }
749 } 789 }
@@ -751,7 +791,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
751 791
752 if (tdev == dev) { 792 if (tdev == dev) {
753 ip_rt_put(rt); 793 ip_rt_put(rt);
754 stats->collisions++; 794 dev->stats.collisions++;
755 goto tx_error; 795 goto tx_error;
756 } 796 }
757 797
@@ -814,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
814 dev->needed_headroom = max_headroom; 854 dev->needed_headroom = max_headroom;
815 if (!new_skb) { 855 if (!new_skb) {
816 ip_rt_put(rt); 856 ip_rt_put(rt);
817 txq->tx_dropped++; 857 dev->stats.tx_dropped++;
818 dev_kfree_skb(skb); 858 dev_kfree_skb(skb);
819 return NETDEV_TX_OK; 859 return NETDEV_TX_OK;
820 } 860 }
@@ -881,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
881 } 921 }
882 922
883 nf_reset(skb); 923 nf_reset(skb);
884 924 tstats = this_cpu_ptr(dev->tstats);
885 IPTUNNEL_XMIT(); 925 __IPTUNNEL_XMIT(tstats, &dev->stats);
886 return NETDEV_TX_OK; 926 return NETDEV_TX_OK;
887 927
888tx_error_icmp: 928tx_error_icmp:
889 dst_link_failure(skb); 929 dst_link_failure(skb);
890 930
891tx_error: 931tx_error:
892 stats->tx_errors++; 932 dev->stats.tx_errors++;
893 dev_kfree_skb(skb); 933 dev_kfree_skb(skb);
894 return NETDEV_TX_OK; 934 return NETDEV_TX_OK;
895} 935}
@@ -909,13 +949,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
909 /* Guess output device to choose reasonable mtu and needed_headroom */ 949 /* Guess output device to choose reasonable mtu and needed_headroom */
910 950
911 if (iph->daddr) { 951 if (iph->daddr) {
912 struct flowi fl = { .oif = tunnel->parms.link, 952 struct flowi fl = {
913 .nl_u = { .ip4_u = 953 .oif = tunnel->parms.link,
914 { .daddr = iph->daddr, 954 .nl_u = {
915 .saddr = iph->saddr, 955 .ip4_u = {
916 .tos = RT_TOS(iph->tos) } }, 956 .daddr = iph->daddr,
917 .proto = IPPROTO_GRE }; 957 .saddr = iph->saddr,
958 .tos = RT_TOS(iph->tos)
959 }
960 },
961 .proto = IPPROTO_GRE
962 };
918 struct rtable *rt; 963 struct rtable *rt;
964
919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 965 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
920 tdev = rt->dst.dev; 966 tdev = rt->dst.dev;
921 ip_rt_put(rt); 967 ip_rt_put(rt);
@@ -1012,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1012 break; 1058 break;
1013 } 1059 }
1014 } else { 1060 } else {
1015 unsigned nflags = 0; 1061 unsigned int nflags = 0;
1016 1062
1017 t = netdev_priv(dev); 1063 t = netdev_priv(dev);
1018 1064
@@ -1125,7 +1171,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1125 1171
1126static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1172static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127 unsigned short type, 1173 unsigned short type,
1128 const void *daddr, const void *saddr, unsigned len) 1174 const void *daddr, const void *saddr, unsigned int len)
1129{ 1175{
1130 struct ip_tunnel *t = netdev_priv(dev); 1176 struct ip_tunnel *t = netdev_priv(dev);
1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1177 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1167,13 +1213,19 @@ static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev); 1213 struct ip_tunnel *t = netdev_priv(dev);
1168 1214
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1215 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link, 1216 struct flowi fl = {
1171 .nl_u = { .ip4_u = 1217 .oif = t->parms.link,
1172 { .daddr = t->parms.iph.daddr, 1218 .nl_u = {
1173 .saddr = t->parms.iph.saddr, 1219 .ip4_u = {
1174 .tos = RT_TOS(t->parms.iph.tos) } }, 1220 .daddr = t->parms.iph.daddr,
1175 .proto = IPPROTO_GRE }; 1221 .saddr = t->parms.iph.saddr,
1222 .tos = RT_TOS(t->parms.iph.tos)
1223 }
1224 },
1225 .proto = IPPROTO_GRE
1226 };
1176 struct rtable *rt; 1227 struct rtable *rt;
1228
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1229 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1178 return -EADDRNOTAVAIL; 1230 return -EADDRNOTAVAIL;
1179 dev = rt->dst.dev; 1231 dev = rt->dst.dev;
@@ -1193,10 +1245,8 @@ static int ipgre_close(struct net_device *dev)
1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1245 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194 struct in_device *in_dev; 1246 struct in_device *in_dev;
1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1247 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 if (in_dev) { 1248 if (in_dev)
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1249 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1199 }
1200 } 1250 }
1201 return 0; 1251 return 0;
1202} 1252}
@@ -1213,12 +1263,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1263 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1264 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1265 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1266 .ndo_get_stats = ipgre_get_stats,
1216}; 1267};
1217 1268
1269static void ipgre_dev_free(struct net_device *dev)
1270{
1271 free_percpu(dev->tstats);
1272 free_netdev(dev);
1273}
1274
1218static void ipgre_tunnel_setup(struct net_device *dev) 1275static void ipgre_tunnel_setup(struct net_device *dev)
1219{ 1276{
1220 dev->netdev_ops = &ipgre_netdev_ops; 1277 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev; 1278 dev->destructor = ipgre_dev_free;
1222 1279
1223 dev->type = ARPHRD_IPGRE; 1280 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1281 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1313,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1256 } else 1313 } else
1257 dev->header_ops = &ipgre_header_ops; 1314 dev->header_ops = &ipgre_header_ops;
1258 1315
1316 dev->tstats = alloc_percpu(struct pcpu_tstats);
1317 if (!dev->tstats)
1318 return -ENOMEM;
1319
1259 return 0; 1320 return 0;
1260} 1321}
1261 1322
@@ -1274,14 +1335,13 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1335 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 1336
1276 dev_hold(dev); 1337 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel; 1338 rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
1278} 1339}
1279 1340
1280 1341
1281static const struct net_protocol ipgre_protocol = { 1342static const struct gre_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv, 1343 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err, 1344 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1285}; 1345};
1286 1346
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1347static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1351,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) { 1351 for (prio = 0; prio < 4; prio++) {
1292 int h; 1352 int h;
1293 for (h = 0; h < HASH_SIZE; h++) { 1353 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1354 struct ip_tunnel *t;
1355
1356 t = rtnl_dereference(ign->tunnels[prio][h]);
1295 1357
1296 while (t != NULL) { 1358 while (t != NULL) {
1297 unregister_netdevice_queue(t->dev, head); 1359 unregister_netdevice_queue(t->dev, head);
1298 t = t->next; 1360 t = rtnl_dereference(t->next);
1299 } 1361 }
1300 } 1362 }
1301 } 1363 }
@@ -1441,6 +1503,10 @@ static int ipgre_tap_init(struct net_device *dev)
1441 1503
1442 ipgre_tunnel_bind_dev(dev); 1504 ipgre_tunnel_bind_dev(dev);
1443 1505
1506 dev->tstats = alloc_percpu(struct pcpu_tstats);
1507 if (!dev->tstats)
1508 return -ENOMEM;
1509
1444 return 0; 1510 return 0;
1445} 1511}
1446 1512
@@ -1451,6 +1517,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_set_mac_address = eth_mac_addr, 1517 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr, 1518 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1519 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1520 .ndo_get_stats = ipgre_get_stats,
1454}; 1521};
1455 1522
1456static void ipgre_tap_setup(struct net_device *dev) 1523static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1526,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1459 ether_setup(dev); 1526 ether_setup(dev);
1460 1527
1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1528 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev; 1529 dev->destructor = ipgre_dev_free;
1463 1530
1464 dev->iflink = 0; 1531 dev->iflink = 0;
1465 dev->features |= NETIF_F_NETNS_LOCAL; 1532 dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1554,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1487 if (!tb[IFLA_MTU]) 1554 if (!tb[IFLA_MTU])
1488 dev->mtu = mtu; 1555 dev->mtu = mtu;
1489 1556
1557 /* Can use a lockless transmit, unless we generate output sequences */
1558 if (!(nt->parms.o_flags & GRE_SEQ))
1559 dev->features |= NETIF_F_LLTX;
1560
1490 err = register_netdevice(dev); 1561 err = register_netdevice(dev);
1491 if (err) 1562 if (err)
1492 goto out; 1563 goto out;
@@ -1522,7 +1593,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1522 t = nt; 1593 t = nt;
1523 1594
1524 if (dev->type != ARPHRD_ETHER) { 1595 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0; 1596 unsigned int nflags = 0;
1526 1597
1527 if (ipv4_is_multicast(p.iph.daddr)) 1598 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST; 1599 nflags = IFF_BROADCAST;
@@ -1663,7 +1734,7 @@ static int __init ipgre_init(void)
1663 if (err < 0) 1734 if (err < 0)
1664 return err; 1735 return err;
1665 1736
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1737 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1667 if (err < 0) { 1738 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1739 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed; 1740 goto add_proto_failed;
@@ -1683,7 +1754,7 @@ out:
1683tap_ops_failed: 1754tap_ops_failed:
1684 rtnl_link_unregister(&ipgre_link_ops); 1755 rtnl_link_unregister(&ipgre_link_ops);
1685rtnl_link_failed: 1756rtnl_link_failed:
1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1757 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1687add_proto_failed: 1758add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops); 1759 unregister_pernet_device(&ipgre_net_ops);
1689 goto out; 1760 goto out;
@@ -1693,7 +1764,7 @@ static void __exit ipgre_fini(void)
1693{ 1764{
1694 rtnl_link_unregister(&ipgre_tap_ops); 1765 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops); 1766 rtnl_link_unregister(&ipgre_link_ops);
1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1767 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1768 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops); 1769 unregister_pernet_device(&ipgre_net_ops);
1699} 1770}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
466 } 466 }
467 return -EINVAL; 467 return -EINVAL;
468} 468}
469 469EXPORT_SYMBOL(ip_options_compile);
470 470
471/* 471/*
472 * Undo all the changes done by ip_options_compile(). 472 * Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
646 } 646 }
647 return 0; 647 return 0;
648} 648}
649EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7649d7750075..439d2a34ee44 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 487 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
489 */ 489 */
490 if (skb_has_frags(skb)) { 490 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag, *frag2; 491 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
493 493
@@ -844,10 +844,9 @@ int ip_append_data(struct sock *sk,
844 inet->cork.length = 0; 844 inet->cork.length = 0;
845 sk->sk_sndmsg_page = NULL; 845 sk->sk_sndmsg_page = NULL;
846 sk->sk_sndmsg_off = 0; 846 sk->sk_sndmsg_off = 0;
847 if ((exthdrlen = rt->dst.header_len) != 0) { 847 exthdrlen = rt->dst.header_len;
848 length += exthdrlen; 848 length += exthdrlen;
849 transhdrlen += exthdrlen; 849 transhdrlen += exthdrlen;
850 }
851 } else { 850 } else {
852 rt = (struct rtable *)inet->cork.dst; 851 rt = (struct rtable *)inet->cork.dst;
853 if (inet->cork.flags & IPCORK_OPT) 852 if (inet->cork.flags & IPCORK_OPT)
@@ -934,16 +933,19 @@ alloc_new_skb:
934 !(rt->dst.dev->features&NETIF_F_SG)) 933 !(rt->dst.dev->features&NETIF_F_SG))
935 alloclen = mtu; 934 alloclen = mtu;
936 else 935 else
937 alloclen = datalen + fragheaderlen; 936 alloclen = fraglen;
938 937
939 /* The last fragment gets additional space at tail. 938 /* The last fragment gets additional space at tail.
940 * Note, with MSG_MORE we overallocate on fragments, 939 * Note, with MSG_MORE we overallocate on fragments,
941 * because we have no idea what fragment will be 940 * because we have no idea what fragment will be
942 * the last. 941 * the last.
943 */ 942 */
944 if (datalen == length + fraggap) 943 if (datalen == length + fraggap) {
945 alloclen += rt->dst.trailer_len; 944 alloclen += rt->dst.trailer_len;
946 945 /* make sure mtu is not reached */
946 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
947 datalen -= ALIGN(rt->dst.trailer_len, 8);
948 }
947 if (transhdrlen) { 949 if (transhdrlen) {
948 skb = sock_alloc_send_skb(sk, 950 skb = sock_alloc_send_skb(sk,
949 alloclen + hh_len + 15, 951 alloclen + hh_len + 15,
@@ -960,7 +962,7 @@ alloc_new_skb:
960 else 962 else
961 /* only the initial fragment is 963 /* only the initial fragment is
962 time stamped */ 964 time stamped */
963 ipc->shtx.flags = 0; 965 ipc->tx_flags = 0;
964 } 966 }
965 if (skb == NULL) 967 if (skb == NULL)
966 goto error; 968 goto error;
@@ -971,7 +973,7 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 973 skb->ip_summed = csummode;
972 skb->csum = 0; 974 skb->csum = 0;
973 skb_reserve(skb, hh_len); 975 skb_reserve(skb, hh_len);
974 *skb_tx(skb) = ipc->shtx; 976 skb_shinfo(skb)->tx_flags = ipc->tx_flags;
975 977
976 /* 978 /*
977 * Find where to start putting bytes. 979 * Find where to start putting bytes.
@@ -1391,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1391 1393
1392 daddr = ipc.addr = rt->rt_src; 1394 daddr = ipc.addr = rt->rt_src;
1393 ipc.opt = NULL; 1395 ipc.opt = NULL;
1394 ipc.shtx.flags = 0; 1396 ipc.tx_flags = 0;
1395 1397
1396 if (replyopts.opt.optlen) { 1398 if (replyopts.opt.optlen) {
1397 ipc.opt = &replyopts.opt; 1399 ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..e9b816e6cd73 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
122 122
123static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
124struct ipip_net { 124struct ipip_net {
125 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
129 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
130 130
131 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
132}; 132};
133 133
134static void ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137/* 138/*
138 * Locking : hash tables are protected by RCU and a spinlock 139 * Locking : hash tables are protected by RCU and RTNL
139 */ 140 */
140static DEFINE_SPINLOCK(ipip_lock);
141 141
142#define for_each_ip_tunnel_rcu(start) \ 142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
172
145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
146 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
147{ 175{
148 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
149 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
150 struct ip_tunnel *t; 178 struct ip_tunnel *t;
151 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
152 180
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
169 return NULL; 197 return NULL;
170} 198}
171 199
172static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
173 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
174{ 202{
175 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
176 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
177 unsigned h = 0; 205 unsigned int h = 0;
178 int prio = 0; 206 int prio = 0;
179 207
180 if (remote) { 208 if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
188 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
189} 217}
190 218
191static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
192 struct ip_tunnel *t) 220 struct ip_tunnel *t)
193{ 221{
194 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
196 224
197static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
198{ 226{
199 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
200 228 struct ip_tunnel *iter;
201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
202 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
203 spin_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
204 *tp = t->next; 232 tp = &iter->next) {
205 spin_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
206 break; 235 break;
207 } 236 }
208 } 237 }
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
210 239
211static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
212{ 241{
213 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
214 243
215 spin_lock_bh(&ipip_lock); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
216 t->next = *tp;
217 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
218 spin_unlock_bh(&ipip_lock);
219} 246}
220 247
221static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
223{ 250{
224 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
225 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
226 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
227 struct net_device *dev; 255 struct net_device *dev;
228 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
229 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
230 258
231 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
232 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
233 return t; 263 return t;
234 } 264 }
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
238 if (parms->name[0]) 268 if (parms->name[0])
239 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
240 else 270 else
241 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
242 272
243 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
244 if (dev == NULL) 274 if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
254 nt = netdev_priv(dev); 284 nt = netdev_priv(dev);
255 nt->parms = *parms; 285 nt->parms = *parms;
256 286
257 ipip_tunnel_init(dev); 287 if (ipip_tunnel_init(dev) < 0)
288 goto failed_free;
258 289
259 if (register_netdevice(dev) < 0) 290 if (register_netdevice(dev) < 0)
260 goto failed_free; 291 goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
264 return nt; 295 return nt;
265 296
266failed_free: 297failed_free:
267 free_netdev(dev); 298 ipip_dev_free(dev);
268 return NULL; 299 return NULL;
269} 300}
270 301
302/* called with RTNL */
271static void ipip_tunnel_uninit(struct net_device *dev) 303static void ipip_tunnel_uninit(struct net_device *dev)
272{ 304{
273 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
274 struct ipip_net *ipn = net_generic(net, ipip_net_id); 306 struct ipip_net *ipn = net_generic(net, ipip_net_id);
275 307
276 if (dev == ipn->fb_tunnel_dev) { 308 if (dev == ipn->fb_tunnel_dev)
277 spin_lock_bh(&ipip_lock); 309 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
278 ipn->tunnels_wc[0] = NULL; 310 else
279 spin_unlock_bh(&ipip_lock);
280 } else
281 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 311 ipip_tunnel_unlink(ipn, netdev_priv(dev));
282 dev_put(dev); 312 dev_put(dev);
283} 313}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
359 const struct iphdr *iph = ip_hdr(skb); 389 const struct iphdr *iph = ip_hdr(skb);
360 390
361 rcu_read_lock(); 391 rcu_read_lock();
362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 392 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
363 iph->saddr, iph->daddr)) != NULL) { 393 if (tunnel != NULL) {
394 struct pcpu_tstats *tstats;
395
364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 396 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
365 rcu_read_unlock(); 397 rcu_read_unlock();
366 kfree_skb(skb); 398 kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 406 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 407 skb->pkt_type = PACKET_HOST;
376 408
377 skb_tunnel_rx(skb, tunnel->dev); 409 tstats = this_cpu_ptr(tunnel->dev->tstats);
410 tstats->rx_packets++;
411 tstats->rx_bytes += skb->len;
412
413 __skb_tunnel_rx(skb, tunnel->dev);
378 414
379 ipip_ecn_decapsulate(iph, skb); 415 ipip_ecn_decapsulate(iph, skb);
416
380 netif_rx(skb); 417 netif_rx(skb);
418
381 rcu_read_unlock(); 419 rcu_read_unlock();
382 return 0; 420 return 0;
383 } 421 }
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
394static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 432static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
395{ 433{
396 struct ip_tunnel *tunnel = netdev_priv(dev); 434 struct ip_tunnel *tunnel = netdev_priv(dev);
397 struct net_device_stats *stats = &dev->stats; 435 struct pcpu_tstats *tstats;
398 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
399 struct iphdr *tiph = &tunnel->parms.iph; 436 struct iphdr *tiph = &tunnel->parms.iph;
400 u8 tos = tunnel->parms.iph.tos; 437 u8 tos = tunnel->parms.iph.tos;
401 __be16 df = tiph->frag_off; 438 __be16 df = tiph->frag_off;
402 struct rtable *rt; /* Route to the other host */ 439 struct rtable *rt; /* Route to the other host */
403 struct net_device *tdev; /* Device to other host */ 440 struct net_device *tdev; /* Device to other host */
404 struct iphdr *old_iph = ip_hdr(skb); 441 struct iphdr *old_iph = ip_hdr(skb);
405 struct iphdr *iph; /* Our new IP header */ 442 struct iphdr *iph; /* Our new IP header */
406 unsigned int max_headroom; /* The extra header space needed */ 443 unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
410 if (skb->protocol != htons(ETH_P_IP)) 447 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 448 goto tx_error;
412 449
413 if (tos&1) 450 if (tos & 1)
414 tos = old_iph->tos; 451 tos = old_iph->tos;
415 452
416 if (!dst) { 453 if (!dst) {
417 /* NBMA tunnel */ 454 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 455 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 456 dev->stats.tx_fifo_errors++;
420 goto tx_error; 457 goto tx_error;
421 } 458 }
422 if ((dst = rt->rt_gateway) == 0) 459 if ((dst = rt->rt_gateway) == 0)
@@ -424,14 +461,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
424 } 461 }
425 462
426 { 463 {
427 struct flowi fl = { .oif = tunnel->parms.link, 464 struct flowi fl = {
428 .nl_u = { .ip4_u = 465 .oif = tunnel->parms.link,
429 { .daddr = dst, 466 .nl_u = {
430 .saddr = tiph->saddr, 467 .ip4_u = {
431 .tos = RT_TOS(tos) } }, 468 .daddr = dst,
432 .proto = IPPROTO_IPIP }; 469 .saddr = tiph->saddr,
470 .tos = RT_TOS(tos)
471 }
472 },
473 .proto = IPPROTO_IPIP
474 };
475
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 476 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
434 stats->tx_carrier_errors++; 477 dev->stats.tx_carrier_errors++;
435 goto tx_error_icmp; 478 goto tx_error_icmp;
436 } 479 }
437 } 480 }
@@ -439,7 +482,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
439 482
440 if (tdev == dev) { 483 if (tdev == dev) {
441 ip_rt_put(rt); 484 ip_rt_put(rt);
442 stats->collisions++; 485 dev->stats.collisions++;
443 goto tx_error; 486 goto tx_error;
444 } 487 }
445 488
@@ -449,7 +492,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 492 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 493
451 if (mtu < 68) { 494 if (mtu < 68) {
452 stats->collisions++; 495 dev->stats.collisions++;
453 ip_rt_put(rt); 496 ip_rt_put(rt);
454 goto tx_error; 497 goto tx_error;
455 } 498 }
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
485 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 528 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
486 if (!new_skb) { 529 if (!new_skb) {
487 ip_rt_put(rt); 530 ip_rt_put(rt);
488 txq->tx_dropped++; 531 dev->stats.tx_dropped++;
489 dev_kfree_skb(skb); 532 dev_kfree_skb(skb);
490 return NETDEV_TX_OK; 533 return NETDEV_TX_OK;
491 } 534 }
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
522 iph->ttl = old_iph->ttl; 565 iph->ttl = old_iph->ttl;
523 566
524 nf_reset(skb); 567 nf_reset(skb);
525 568 tstats = this_cpu_ptr(dev->tstats);
526 IPTUNNEL_XMIT(); 569 __IPTUNNEL_XMIT(tstats, &dev->stats);
527 return NETDEV_TX_OK; 570 return NETDEV_TX_OK;
528 571
529tx_error_icmp: 572tx_error_icmp:
530 dst_link_failure(skb); 573 dst_link_failure(skb);
531tx_error: 574tx_error:
532 stats->tx_errors++; 575 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 576 dev_kfree_skb(skb);
534 return NETDEV_TX_OK; 577 return NETDEV_TX_OK;
535} 578}
@@ -544,13 +587,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
544 iph = &tunnel->parms.iph; 587 iph = &tunnel->parms.iph;
545 588
546 if (iph->daddr) { 589 if (iph->daddr) {
547 struct flowi fl = { .oif = tunnel->parms.link, 590 struct flowi fl = {
548 .nl_u = { .ip4_u = 591 .oif = tunnel->parms.link,
549 { .daddr = iph->daddr, 592 .nl_u = {
550 .saddr = iph->saddr, 593 .ip4_u = {
551 .tos = RT_TOS(iph->tos) } }, 594 .daddr = iph->daddr,
552 .proto = IPPROTO_IPIP }; 595 .saddr = iph->saddr,
596 .tos = RT_TOS(iph->tos)
597 }
598 },
599 .proto = IPPROTO_IPIP
600 };
553 struct rtable *rt; 601 struct rtable *rt;
602
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 603 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
555 tdev = rt->dst.dev; 604 tdev = rt->dst.dev;
556 ip_rt_put(rt); 605 ip_rt_put(rt);
@@ -696,13 +745,19 @@ static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_start_xmit = ipip_tunnel_xmit, 745 .ndo_start_xmit = ipip_tunnel_xmit,
697 .ndo_do_ioctl = ipip_tunnel_ioctl, 746 .ndo_do_ioctl = ipip_tunnel_ioctl,
698 .ndo_change_mtu = ipip_tunnel_change_mtu, 747 .ndo_change_mtu = ipip_tunnel_change_mtu,
699 748 .ndo_get_stats = ipip_get_stats,
700}; 749};
701 750
751static void ipip_dev_free(struct net_device *dev)
752{
753 free_percpu(dev->tstats);
754 free_netdev(dev);
755}
756
702static void ipip_tunnel_setup(struct net_device *dev) 757static void ipip_tunnel_setup(struct net_device *dev)
703{ 758{
704 dev->netdev_ops = &ipip_netdev_ops; 759 dev->netdev_ops = &ipip_netdev_ops;
705 dev->destructor = free_netdev; 760 dev->destructor = ipip_dev_free;
706 761
707 dev->type = ARPHRD_TUNNEL; 762 dev->type = ARPHRD_TUNNEL;
708 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 763 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +766,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
711 dev->iflink = 0; 766 dev->iflink = 0;
712 dev->addr_len = 4; 767 dev->addr_len = 4;
713 dev->features |= NETIF_F_NETNS_LOCAL; 768 dev->features |= NETIF_F_NETNS_LOCAL;
769 dev->features |= NETIF_F_LLTX;
714 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 770 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
715} 771}
716 772
717static void ipip_tunnel_init(struct net_device *dev) 773static int ipip_tunnel_init(struct net_device *dev)
718{ 774{
719 struct ip_tunnel *tunnel = netdev_priv(dev); 775 struct ip_tunnel *tunnel = netdev_priv(dev);
720 776
@@ -725,9 +781,15 @@ static void ipip_tunnel_init(struct net_device *dev)
725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 781 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
726 782
727 ipip_tunnel_bind_dev(dev); 783 ipip_tunnel_bind_dev(dev);
784
785 dev->tstats = alloc_percpu(struct pcpu_tstats);
786 if (!dev->tstats)
787 return -ENOMEM;
788
789 return 0;
728} 790}
729 791
730static void __net_init ipip_fb_tunnel_init(struct net_device *dev) 792static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
731{ 793{
732 struct ip_tunnel *tunnel = netdev_priv(dev); 794 struct ip_tunnel *tunnel = netdev_priv(dev);
733 struct iphdr *iph = &tunnel->parms.iph; 795 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +802,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
740 iph->protocol = IPPROTO_IPIP; 802 iph->protocol = IPPROTO_IPIP;
741 iph->ihl = 5; 803 iph->ihl = 5;
742 804
805 dev->tstats = alloc_percpu(struct pcpu_tstats);
806 if (!dev->tstats)
807 return -ENOMEM;
808
743 dev_hold(dev); 809 dev_hold(dev);
744 ipn->tunnels_wc[0] = tunnel; 810 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
811 return 0;
745} 812}
746 813
747static struct xfrm_tunnel ipip_handler = { 814static struct xfrm_tunnel ipip_handler __read_mostly = {
748 .handler = ipip_rcv, 815 .handler = ipip_rcv,
749 .err_handler = ipip_err, 816 .err_handler = ipip_err,
750 .priority = 1, 817 .priority = 1,
@@ -760,11 +827,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
760 for (prio = 1; prio < 4; prio++) { 827 for (prio = 1; prio < 4; prio++) {
761 int h; 828 int h;
762 for (h = 0; h < HASH_SIZE; h++) { 829 for (h = 0; h < HASH_SIZE; h++) {
763 struct ip_tunnel *t = ipn->tunnels[prio][h]; 830 struct ip_tunnel *t;
764 831
832 t = rtnl_dereference(ipn->tunnels[prio][h]);
765 while (t != NULL) { 833 while (t != NULL) {
766 unregister_netdevice_queue(t->dev, head); 834 unregister_netdevice_queue(t->dev, head);
767 t = t->next; 835 t = rtnl_dereference(t->next);
768 } 836 }
769 } 837 }
770 } 838 }
@@ -789,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net)
789 } 857 }
790 dev_net_set(ipn->fb_tunnel_dev, net); 858 dev_net_set(ipn->fb_tunnel_dev, net);
791 859
792 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 860 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
861 if (err)
862 goto err_reg_dev;
793 863
794 if ((err = register_netdev(ipn->fb_tunnel_dev))) 864 if ((err = register_netdev(ipn->fb_tunnel_dev)))
795 goto err_reg_dev; 865 goto err_reg_dev;
@@ -797,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net)
797 return 0; 867 return 0;
798 868
799err_reg_dev: 869err_reg_dev:
800 free_netdev(ipn->fb_tunnel_dev); 870 ipip_dev_free(ipn->fb_tunnel_dev);
801err_alloc_dev: 871err_alloc_dev:
802 /* nothing */ 872 /* nothing */
803 return err; 873 return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..86dd5691af46 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
75 struct net *net; 75 struct net *net;
76#endif 76#endif
77 u32 id; 77 u32 id;
78 struct sock *mroute_sk; 78 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer; 79 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue; 80 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES]; 81 struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
98}; 98};
99 99
100/* Big lock, protecting vif table, mrt cache and mroute socket state. 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
101 Note that the changes are semaphored via rtnl_lock. 101 * Note that the changes are semaphored via rtnl_lock.
102 */ 102 */
103 103
104static DEFINE_RWLOCK(mrt_lock); 104static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
113static DEFINE_SPINLOCK(mfc_unres_lock); 113static DEFINE_SPINLOCK(mfc_unres_lock);
114 114
115/* We return to original Alan's scheme. Hash table of resolved 115/* We return to original Alan's scheme. Hash table of resolved
116 entries is changed only in process context and protected 116 * entries is changed only in process context and protected
117 with weak lock mrt_lock. Queue of unresolved entries is protected 117 * with weak lock mrt_lock. Queue of unresolved entries is protected
118 with strong spinlock mfc_unres_lock. 118 * with strong spinlock mfc_unres_lock.
119 119 *
120 In this case data path is free of exclusive locks at all. 120 * In this case data path is free of exclusive locks at all.
121 */ 121 */
122 122
123static struct kmem_cache *mrt_cachep __read_mostly; 123static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
396 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 set_fs(oldfs); 398 set_fs(oldfs);
399 } else 399 } else {
400 err = -EOPNOTSUPP; 400 err = -EOPNOTSUPP;
401 401 }
402 dev = NULL; 402 dev = NULL;
403 403
404 if (err == 0 && 404 if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
495 dev->iflink = 0; 495 dev->iflink = 0;
496 496
497 rcu_read_lock(); 497 rcu_read_lock();
498 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 498 in_dev = __in_dev_get_rcu(dev);
499 if (!in_dev) {
499 rcu_read_unlock(); 500 rcu_read_unlock();
500 goto failure; 501 goto failure;
501 } 502 }
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
552 mrt->mroute_reg_vif_num = -1; 553 mrt->mroute_reg_vif_num = -1;
553#endif 554#endif
554 555
555 if (vifi+1 == mrt->maxvif) { 556 if (vifi + 1 == mrt->maxvif) {
556 int tmp; 557 int tmp;
557 for (tmp=vifi-1; tmp>=0; tmp--) { 558
559 for (tmp = vifi - 1; tmp >= 0; tmp--) {
558 if (VIF_EXISTS(mrt, tmp)) 560 if (VIF_EXISTS(mrt, tmp))
559 break; 561 break;
560 } 562 }
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
565 567
566 dev_set_allmulti(dev, -1); 568 dev_set_allmulti(dev, -1);
567 569
568 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 570 in_dev = __in_dev_get_rtnl(dev);
571 if (in_dev) {
569 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 572 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 ip_rt_multicast_event(in_dev); 573 ip_rt_multicast_event(in_dev);
571 } 574 }
572 575
573 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 576 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
574 unregister_netdevice_queue(dev, head); 577 unregister_netdevice_queue(dev, head);
575 578
576 dev_put(dev); 579 dev_put(dev);
577 return 0; 580 return 0;
578} 581}
579 582
580static inline void ipmr_cache_free(struct mfc_cache *c) 583static void ipmr_cache_free_rcu(struct rcu_head *head)
581{ 584{
585 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
586
582 kmem_cache_free(mrt_cachep, c); 587 kmem_cache_free(mrt_cachep, c);
583} 588}
584 589
590static inline void ipmr_cache_free(struct mfc_cache *c)
591{
592 call_rcu(&c->rcu, ipmr_cache_free_rcu);
593}
594
585/* Destroy an unresolved cache entry, killing queued skbs 595/* Destroy an unresolved cache entry, killing queued skbs
586 and reporting error to netlink readers. 596 * and reporting error to netlink readers.
587 */ 597 */
588 598
589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) 599static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
605 memset(&e->msg, 0, sizeof(e->msg)); 615 memset(&e->msg, 0, sizeof(e->msg));
606 616
607 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 617 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 } else 618 } else {
609 kfree_skb(skb); 619 kfree_skb(skb);
620 }
610 } 621 }
611 622
612 ipmr_cache_free(c); 623 ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
724 case 0: 735 case 0:
725 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 736 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 737 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 if (dev && dev->ip_ptr == NULL) { 738 if (dev && __in_dev_get_rtnl(dev) == NULL) {
728 dev_put(dev); 739 dev_put(dev);
729 return -EADDRNOTAVAIL; 740 return -EADDRNOTAVAIL;
730 } 741 }
731 } else 742 } else {
732 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 743 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 744 }
734 if (!dev) 745 if (!dev)
735 return -EADDRNOTAVAIL; 746 return -EADDRNOTAVAIL;
736 err = dev_set_allmulti(dev, 1); 747 err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
743 return -EINVAL; 754 return -EINVAL;
744 } 755 }
745 756
746 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { 757 in_dev = __in_dev_get_rtnl(dev);
758 if (!in_dev) {
747 dev_put(dev); 759 dev_put(dev);
748 return -EADDRNOTAVAIL; 760 return -EADDRNOTAVAIL;
749 } 761 }
750 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 762 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 ip_rt_multicast_event(in_dev); 763 ip_rt_multicast_event(in_dev);
752 764
753 /* 765 /* Fill in the VIF structures */
754 * Fill in the VIF structures 766
755 */
756 v->rate_limit = vifc->vifc_rate_limit; 767 v->rate_limit = vifc->vifc_rate_limit;
757 v->local = vifc->vifc_lcl_addr.s_addr; 768 v->local = vifc->vifc_lcl_addr.s_addr;
758 v->remote = vifc->vifc_rmt_addr.s_addr; 769 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
765 v->pkt_in = 0; 776 v->pkt_in = 0;
766 v->pkt_out = 0; 777 v->pkt_out = 0;
767 v->link = dev->ifindex; 778 v->link = dev->ifindex;
768 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 779 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
769 v->link = dev->iflink; 780 v->link = dev->iflink;
770 781
771 /* And finish update writing critical data */ 782 /* And finish update writing critical data */
772 write_lock_bh(&mrt_lock); 783 write_lock_bh(&mrt_lock);
773 v->dev = dev; 784 v->dev = dev;
774#ifdef CONFIG_IP_PIMSM 785#ifdef CONFIG_IP_PIMSM
775 if (v->flags&VIFF_REGISTER) 786 if (v->flags & VIFF_REGISTER)
776 mrt->mroute_reg_vif_num = vifi; 787 mrt->mroute_reg_vif_num = vifi;
777#endif 788#endif
778 if (vifi+1 > mrt->maxvif) 789 if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
781 return 0; 792 return 0;
782} 793}
783 794
795/* called with rcu_read_lock() */
784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, 796static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 __be32 origin, 797 __be32 origin,
786 __be32 mcastgrp) 798 __be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
788 int line = MFC_HASH(mcastgrp, origin); 800 int line = MFC_HASH(mcastgrp, origin);
789 struct mfc_cache *c; 801 struct mfc_cache *c;
790 802
791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 803 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 804 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 return c; 805 return c;
794 } 806 }
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
801static struct mfc_cache *ipmr_cache_alloc(void) 813static struct mfc_cache *ipmr_cache_alloc(void)
802{ 814{
803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 815 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 if (c == NULL) 816
805 return NULL; 817 if (c)
806 c->mfc_un.res.minvif = MAXVIFS; 818 c->mfc_un.res.minvif = MAXVIFS;
807 return c; 819 return c;
808} 820}
809 821
810static struct mfc_cache *ipmr_cache_alloc_unres(void) 822static struct mfc_cache *ipmr_cache_alloc_unres(void)
811{ 823{
812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 824 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 if (c == NULL) 825
814 return NULL; 826 if (c) {
815 skb_queue_head_init(&c->mfc_un.unres.unresolved); 827 skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 c->mfc_un.unres.expires = jiffies + 10*HZ; 828 c->mfc_un.unres.expires = jiffies + 10*HZ;
829 }
817 return c; 830 return c;
818} 831}
819 832
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
827 struct sk_buff *skb; 840 struct sk_buff *skb;
828 struct nlmsgerr *e; 841 struct nlmsgerr *e;
829 842
830 /* 843 /* Play the pending entries through our router */
831 * Play the pending entries through our router
832 */
833 844
834 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 845 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 if (ip_hdr(skb)->version == 0) { 846 if (ip_hdr(skb)->version == 0) {
836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 847 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 848
838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { 849 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 nlh->nlmsg_len = (skb_tail_pointer(skb) - 850 nlh->nlmsg_len = skb_tail_pointer(skb) -
840 (u8 *)nlh); 851 (u8 *)nlh;
841 } else { 852 } else {
842 nlh->nlmsg_type = NLMSG_ERROR; 853 nlh->nlmsg_type = NLMSG_ERROR;
843 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 854 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
848 } 859 }
849 860
850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 861 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 } else 862 } else {
852 ip_mr_forward(net, mrt, skb, c, 0); 863 ip_mr_forward(net, mrt, skb, c, 0);
864 }
853 } 865 }
854} 866}
855 867
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
867 const int ihl = ip_hdrlen(pkt); 879 const int ihl = ip_hdrlen(pkt);
868 struct igmphdr *igmp; 880 struct igmphdr *igmp;
869 struct igmpmsg *msg; 881 struct igmpmsg *msg;
882 struct sock *mroute_sk;
870 int ret; 883 int ret;
871 884
872#ifdef CONFIG_IP_PIMSM 885#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
882#ifdef CONFIG_IP_PIMSM 895#ifdef CONFIG_IP_PIMSM
883 if (assert == IGMPMSG_WHOLEPKT) { 896 if (assert == IGMPMSG_WHOLEPKT) {
884 /* Ugly, but we have no choice with this interface. 897 /* Ugly, but we have no choice with this interface.
885 Duplicate old header, fix ihl, length etc. 898 * Duplicate old header, fix ihl, length etc.
886 And all this only to mangle msg->im_msgtype and 899 * And all this only to mangle msg->im_msgtype and
887 to set msg->im_mbz to "mbz" :-) 900 * to set msg->im_mbz to "mbz" :-)
888 */ 901 */
889 skb_push(skb, sizeof(struct iphdr)); 902 skb_push(skb, sizeof(struct iphdr));
890 skb_reset_network_header(skb); 903 skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
901#endif 914#endif
902 { 915 {
903 916
904 /* 917 /* Copy the IP header */
905 * Copy the IP header
906 */
907 918
908 skb->network_header = skb->tail; 919 skb->network_header = skb->tail;
909 skb_put(skb, ihl); 920 skb_put(skb, ihl);
910 skb_copy_to_linear_data(skb, pkt->data, ihl); 921 skb_copy_to_linear_data(skb, pkt->data, ihl);
911 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 922 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
912 msg = (struct igmpmsg *)skb_network_header(skb); 923 msg = (struct igmpmsg *)skb_network_header(skb);
913 msg->im_vif = vifi; 924 msg->im_vif = vifi;
914 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 925 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 926
916 /* 927 /* Add our header */
917 * Add our header
918 */
919 928
920 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 929 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 igmp->type = 930 igmp->type =
922 msg->im_msgtype = assert; 931 msg->im_msgtype = assert;
923 igmp->code = 0; 932 igmp->code = 0;
924 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 933 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
925 skb->transport_header = skb->network_header; 934 skb->transport_header = skb->network_header;
926 } 935 }
927 936
928 if (mrt->mroute_sk == NULL) { 937 rcu_read_lock();
938 mroute_sk = rcu_dereference(mrt->mroute_sk);
939 if (mroute_sk == NULL) {
940 rcu_read_unlock();
929 kfree_skb(skb); 941 kfree_skb(skb);
930 return -EINVAL; 942 return -EINVAL;
931 } 943 }
932 944
933 /* 945 /* Deliver to mrouted */
934 * Deliver to mrouted 946
935 */ 947 ret = sock_queue_rcv_skb(mroute_sk, skb);
936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); 948 rcu_read_unlock();
937 if (ret < 0) { 949 if (ret < 0) {
938 if (net_ratelimit()) 950 if (net_ratelimit())
939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 951 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
965 } 977 }
966 978
967 if (!found) { 979 if (!found) {
968 /* 980 /* Create a new entry if allowable */
969 * Create a new entry if allowable
970 */
971 981
972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || 982 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 (c = ipmr_cache_alloc_unres()) == NULL) { 983 (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
977 return -ENOBUFS; 987 return -ENOBUFS;
978 } 988 }
979 989
980 /* 990 /* Fill in the new cache entry */
981 * Fill in the new cache entry 991
982 */
983 c->mfc_parent = -1; 992 c->mfc_parent = -1;
984 c->mfc_origin = iph->saddr; 993 c->mfc_origin = iph->saddr;
985 c->mfc_mcastgrp = iph->daddr; 994 c->mfc_mcastgrp = iph->daddr;
986 995
987 /* 996 /* Reflect first query at mrouted. */
988 * Reflect first query at mrouted. 997
989 */
990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); 998 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 if (err < 0) { 999 if (err < 0) {
992 /* If the report failed throw the cache entry 1000 /* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1014 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 } 1015 }
1008 1016
1009 /* 1017 /* See if we can append the packet */
1010 * See if we can append the packet 1018
1011 */ 1019 if (c->mfc_un.unres.unresolved.qlen > 3) {
1012 if (c->mfc_un.unres.unresolved.qlen>3) {
1013 kfree_skb(skb); 1020 kfree_skb(skb);
1014 err = -ENOBUFS; 1021 err = -ENOBUFS;
1015 } else { 1022 } else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1042 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1043 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1044 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 write_lock_bh(&mrt_lock); 1045 list_del_rcu(&c->list);
1039 list_del(&c->list);
1040 write_unlock_bh(&mrt_lock);
1041 1046
1042 ipmr_cache_free(c); 1047 ipmr_cache_free(c);
1043 return 0; 1048 return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1090 if (!mrtsock) 1095 if (!mrtsock)
1091 c->mfc_flags |= MFC_STATIC; 1096 c->mfc_flags |= MFC_STATIC;
1092 1097
1093 write_lock_bh(&mrt_lock); 1098 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 write_unlock_bh(&mrt_lock);
1096 1099
1097 /* 1100 /*
1098 * Check to see if we resolved a queued list. If so we 1101 * Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
1130 LIST_HEAD(list); 1133 LIST_HEAD(list);
1131 struct mfc_cache *c, *next; 1134 struct mfc_cache *c, *next;
1132 1135
1133 /* 1136 /* Shut down all active vif entries */
1134 * Shut down all active vif entries 1137
1135 */
1136 for (i = 0; i < mrt->maxvif; i++) { 1138 for (i = 0; i < mrt->maxvif; i++) {
1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC)) 1139 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
1138 vif_delete(mrt, i, 0, &list); 1140 vif_delete(mrt, i, 0, &list);
1139 } 1141 }
1140 unregister_netdevice_many(&list); 1142 unregister_netdevice_many(&list);
1141 1143
1142 /* 1144 /* Wipe the cache */
1143 * Wipe the cache 1145
1144 */
1145 for (i = 0; i < MFC_LINES; i++) { 1146 for (i = 0; i < MFC_LINES; i++) {
1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1147 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 if (c->mfc_flags&MFC_STATIC) 1148 if (c->mfc_flags & MFC_STATIC)
1148 continue; 1149 continue;
1149 write_lock_bh(&mrt_lock); 1150 list_del_rcu(&c->list);
1150 list_del(&c->list);
1151 write_unlock_bh(&mrt_lock);
1152
1153 ipmr_cache_free(c); 1151 ipmr_cache_free(c);
1154 } 1152 }
1155 } 1153 }
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
1164 } 1162 }
1165} 1163}
1166 1164
1165/* called from ip_ra_control(), before an RCU grace period,
1166 * we dont need to call synchronize_rcu() here
1167 */
1167static void mrtsock_destruct(struct sock *sk) 1168static void mrtsock_destruct(struct sock *sk)
1168{ 1169{
1169 struct net *net = sock_net(sk); 1170 struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
1171 1172
1172 rtnl_lock(); 1173 rtnl_lock();
1173 ipmr_for_each_table(mrt, net) { 1174 ipmr_for_each_table(mrt, net) {
1174 if (sk == mrt->mroute_sk) { 1175 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1176 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 1177 rcu_assign_pointer(mrt->mroute_sk, NULL);
1177 write_lock_bh(&mrt_lock);
1178 mrt->mroute_sk = NULL;
1179 write_unlock_bh(&mrt_lock);
1180
1181 mroute_clean_tables(mrt); 1178 mroute_clean_tables(mrt);
1182 } 1179 }
1183 } 1180 }
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1204 return -ENOENT; 1201 return -ENOENT;
1205 1202
1206 if (optname != MRT_INIT) { 1203 if (optname != MRT_INIT) {
1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) 1204 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1205 !capable(CAP_NET_ADMIN))
1208 return -EACCES; 1206 return -EACCES;
1209 } 1207 }
1210 1208
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1217 return -ENOPROTOOPT; 1215 return -ENOPROTOOPT;
1218 1216
1219 rtnl_lock(); 1217 rtnl_lock();
1220 if (mrt->mroute_sk) { 1218 if (rtnl_dereference(mrt->mroute_sk)) {
1221 rtnl_unlock(); 1219 rtnl_unlock();
1222 return -EADDRINUSE; 1220 return -EADDRINUSE;
1223 } 1221 }
1224 1222
1225 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1223 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 if (ret == 0) { 1224 if (ret == 0) {
1227 write_lock_bh(&mrt_lock); 1225 rcu_assign_pointer(mrt->mroute_sk, sk);
1228 mrt->mroute_sk = sk;
1229 write_unlock_bh(&mrt_lock);
1230
1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1226 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 } 1227 }
1233 rtnl_unlock(); 1228 rtnl_unlock();
1234 return ret; 1229 return ret;
1235 case MRT_DONE: 1230 case MRT_DONE:
1236 if (sk != mrt->mroute_sk) 1231 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1237 return -EACCES; 1232 return -EACCES;
1238 return ip_ra_control(sk, 0, NULL); 1233 return ip_ra_control(sk, 0, NULL);
1239 case MRT_ADD_VIF: 1234 case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1246 return -ENFILE; 1241 return -ENFILE;
1247 rtnl_lock(); 1242 rtnl_lock();
1248 if (optname == MRT_ADD_VIF) { 1243 if (optname == MRT_ADD_VIF) {
1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); 1244 ret = vif_add(net, mrt, &vif,
1245 sk == rtnl_dereference(mrt->mroute_sk));
1250 } else { 1246 } else {
1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); 1247 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 } 1248 }
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1267 if (optname == MRT_DEL_MFC) 1263 if (optname == MRT_DEL_MFC)
1268 ret = ipmr_mfc_delete(mrt, &mfc); 1264 ret = ipmr_mfc_delete(mrt, &mfc);
1269 else 1265 else
1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); 1266 ret = ipmr_mfc_add(net, mrt, &mfc,
1267 sk == rtnl_dereference(mrt->mroute_sk));
1271 rtnl_unlock(); 1268 rtnl_unlock();
1272 return ret; 1269 return ret;
1273 /* 1270 /*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1276 case MRT_ASSERT: 1273 case MRT_ASSERT:
1277 { 1274 {
1278 int v; 1275 int v;
1279 if (get_user(v,(int __user *)optval)) 1276 if (get_user(v, (int __user *)optval))
1280 return -EFAULT; 1277 return -EFAULT;
1281 mrt->mroute_do_assert = (v) ? 1 : 0; 1278 mrt->mroute_do_assert = (v) ? 1 : 0;
1282 return 0; 1279 return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1286 { 1283 {
1287 int v; 1284 int v;
1288 1285
1289 if (get_user(v,(int __user *)optval)) 1286 if (get_user(v, (int __user *)optval))
1290 return -EFAULT; 1287 return -EFAULT;
1291 v = (v) ? 1 : 0; 1288 v = (v) ? 1 : 0;
1292 1289
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1309 return -EINVAL; 1306 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval)) 1307 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT; 1308 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314 1309
1315 rtnl_lock(); 1310 rtnl_lock();
1316 ret = 0; 1311 ret = 0;
1317 if (!ipmr_new_table(net, v)) 1312 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1318 ret = -ENOMEM; 1313 ret = -EBUSY;
1319 raw_sk(sk)->ipmr_table = v; 1314 } else {
1315 if (!ipmr_new_table(net, v))
1316 ret = -ENOMEM;
1317 raw_sk(sk)->ipmr_table = v;
1318 }
1320 rtnl_unlock(); 1319 rtnl_unlock();
1321 return ret; 1320 return ret;
1322 } 1321 }
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1347 1346
1348 if (optname != MRT_VERSION && 1347 if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM 1348#ifdef CONFIG_IP_PIMSM
1350 optname!=MRT_PIM && 1349 optname != MRT_PIM &&
1351#endif 1350#endif
1352 optname!=MRT_ASSERT) 1351 optname != MRT_ASSERT)
1353 return -ENOPROTOOPT; 1352 return -ENOPROTOOPT;
1354 1353
1355 if (get_user(olr, optlen)) 1354 if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1416 if (copy_from_user(&sr, arg, sizeof(sr))) 1415 if (copy_from_user(&sr, arg, sizeof(sr)))
1417 return -EFAULT; 1416 return -EFAULT;
1418 1417
1419 read_lock(&mrt_lock); 1418 rcu_read_lock();
1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); 1419 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 if (c) { 1420 if (c) {
1422 sr.pktcnt = c->mfc_un.res.pkt; 1421 sr.pktcnt = c->mfc_un.res.pkt;
1423 sr.bytecnt = c->mfc_un.res.bytes; 1422 sr.bytecnt = c->mfc_un.res.bytes;
1424 sr.wrong_if = c->mfc_un.res.wrong_if; 1423 sr.wrong_if = c->mfc_un.res.wrong_if;
1425 read_unlock(&mrt_lock); 1424 rcu_read_unlock();
1426 1425
1427 if (copy_to_user(arg, &sr, sizeof(sr))) 1426 if (copy_to_user(arg, &sr, sizeof(sr)))
1428 return -EFAULT; 1427 return -EFAULT;
1429 return 0; 1428 return 0;
1430 } 1429 }
1431 read_unlock(&mrt_lock); 1430 rcu_read_unlock();
1432 return -EADDRNOTAVAIL; 1431 return -EADDRNOTAVAIL;
1433 default: 1432 default:
1434 return -ENOIOCTLCMD; 1433 return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
1465}; 1464};
1466 1465
1467/* 1466/*
1468 * Encapsulate a packet by attaching a valid IPIP header to it. 1467 * Encapsulate a packet by attaching a valid IPIP header to it.
1469 * This avoids tunnel drivers and other mess and gives us the speed so 1468 * This avoids tunnel drivers and other mess and gives us the speed so
1470 * important for multicast video. 1469 * important for multicast video.
1471 */ 1470 */
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1480 skb_reset_network_header(skb); 1479 skb_reset_network_header(skb);
1481 iph = ip_hdr(skb); 1480 iph = ip_hdr(skb);
1482 1481
1483 iph->version = 4; 1482 iph->version = 4;
1484 iph->tos = old_iph->tos; 1483 iph->tos = old_iph->tos;
1485 iph->ttl = old_iph->ttl; 1484 iph->ttl = old_iph->ttl;
1486 iph->frag_off = 0; 1485 iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1498 1497
1499static inline int ipmr_forward_finish(struct sk_buff *skb) 1498static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{ 1499{
1501 struct ip_options * opt = &(IPCB(skb)->opt); 1500 struct ip_options *opt = &(IPCB(skb)->opt);
1502 1501
1503 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1502 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 1503
@@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535 } 1534 }
1536#endif 1535#endif
1537 1536
1538 if (vif->flags&VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1539 struct flowi fl = { .oif = vif->link, 1538 struct flowi fl = {
1540 .nl_u = { .ip4_u = 1539 .oif = vif->link,
1541 { .daddr = vif->remote, 1540 .nl_u = {
1542 .saddr = vif->local, 1541 .ip4_u = {
1543 .tos = RT_TOS(iph->tos) } }, 1542 .daddr = vif->remote,
1544 .proto = IPPROTO_IPIP }; 1543 .saddr = vif->local,
1544 .tos = RT_TOS(iph->tos)
1545 }
1546 },
1547 .proto = IPPROTO_IPIP
1548 };
1549
1545 if (ip_route_output_key(net, &rt, &fl)) 1550 if (ip_route_output_key(net, &rt, &fl))
1546 goto out_free; 1551 goto out_free;
1547 encap = sizeof(struct iphdr); 1552 encap = sizeof(struct iphdr);
1548 } else { 1553 } else {
1549 struct flowi fl = { .oif = vif->link, 1554 struct flowi fl = {
1550 .nl_u = { .ip4_u = 1555 .oif = vif->link,
1551 { .daddr = iph->daddr, 1556 .nl_u = {
1552 .tos = RT_TOS(iph->tos) } }, 1557 .ip4_u = {
1553 .proto = IPPROTO_IPIP }; 1558 .daddr = iph->daddr,
1559 .tos = RT_TOS(iph->tos)
1560 }
1561 },
1562 .proto = IPPROTO_IPIP
1563 };
1564
1554 if (ip_route_output_key(net, &rt, &fl)) 1565 if (ip_route_output_key(net, &rt, &fl))
1555 goto out_free; 1566 goto out_free;
1556 } 1567 }
@@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1559 1570
1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { 1571 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 /* Do not fragment multicasts. Alas, IPv4 does not 1572 /* Do not fragment multicasts. Alas, IPv4 does not
1562 allow to send ICMP, so that packets will disappear 1573 * allow to send ICMP, so that packets will disappear
1563 to blackhole. 1574 * to blackhole.
1564 */ 1575 */
1565 1576
1566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1577 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1583 ip_decrease_ttl(ip_hdr(skb)); 1594 ip_decrease_ttl(ip_hdr(skb));
1584 1595
1585 /* FIXME: forward and output firewalls used to be called here. 1596 /* FIXME: forward and output firewalls used to be called here.
1586 * What do we do with netfilter? -- RR */ 1597 * What do we do with netfilter? -- RR
1598 */
1587 if (vif->flags & VIFF_TUNNEL) { 1599 if (vif->flags & VIFF_TUNNEL) {
1588 ip_encap(skb, vif->local, vif->remote); 1600 ip_encap(skb, vif->local, vif->remote);
1589 /* FIXME: extra output firewall step used to be here. --RR */ 1601 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1644 1656
1645 if (skb_rtable(skb)->fl.iif == 0) { 1657 if (skb_rtable(skb)->fl.iif == 0) {
1646 /* It is our own packet, looped back. 1658 /* It is our own packet, looped back.
1647 Very complicated situation... 1659 * Very complicated situation...
1648 1660 *
1649 The best workaround until routing daemons will be 1661 * The best workaround until routing daemons will be
1650 fixed is not to redistribute packet, if it was 1662 * fixed is not to redistribute packet, if it was
1651 send through wrong interface. It means, that 1663 * send through wrong interface. It means, that
1652 multicast applications WILL NOT work for 1664 * multicast applications WILL NOT work for
1653 (S,G), which have default multicast route pointing 1665 * (S,G), which have default multicast route pointing
1654 to wrong oif. In any case, it is not a good 1666 * to wrong oif. In any case, it is not a good
1655 idea to use multicasting applications on router. 1667 * idea to use multicasting applications on router.
1656 */ 1668 */
1657 goto dont_forward; 1669 goto dont_forward;
1658 } 1670 }
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1662 1674
1663 if (true_vifi >= 0 && mrt->mroute_do_assert && 1675 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 /* pimsm uses asserts, when switching from RPT to SPT, 1676 /* pimsm uses asserts, when switching from RPT to SPT,
1665 so that we cannot check that packet arrived on an oif. 1677 * so that we cannot check that packet arrived on an oif.
1666 It is bad, but otherwise we would need to move pretty 1678 * It is bad, but otherwise we would need to move pretty
1667 large chunk of pimd to kernel. Ough... --ANK 1679 * large chunk of pimd to kernel. Ough... --ANK
1668 */ 1680 */
1669 (mrt->mroute_do_pim || 1681 (mrt->mroute_do_pim ||
1670 cache->mfc_un.res.ttls[true_vifi] < 255) && 1682 cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1682 /* 1694 /*
1683 * Forward the frame 1695 * Forward the frame
1684 */ 1696 */
1685 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1697 for (ct = cache->mfc_un.res.maxvif - 1;
1698 ct >= cache->mfc_un.res.minvif; ct--) {
1686 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1699 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 if (psend != -1) { 1700 if (psend != -1) {
1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1701 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1702
1689 if (skb2) 1703 if (skb2)
1690 ipmr_queue_xmit(net, mrt, skb2, cache, 1704 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend); 1705 psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1696 if (psend != -1) { 1710 if (psend != -1) {
1697 if (local) { 1711 if (local) {
1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1712 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1713
1699 if (skb2) 1714 if (skb2)
1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend); 1715 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 } else { 1716 } else {
@@ -1713,6 +1728,7 @@ dont_forward:
1713 1728
1714/* 1729/*
1715 * Multicast packets for forwarding arrive here 1730 * Multicast packets for forwarding arrive here
1731 * Called with rcu_read_lock();
1716 */ 1732 */
1717 1733
1718int ip_mr_input(struct sk_buff *skb) 1734int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
1724 int err; 1740 int err;
1725 1741
1726 /* Packet is looped back after forward, it should not be 1742 /* Packet is looped back after forward, it should not be
1727 forwarded second time, but still can be delivered locally. 1743 * forwarded second time, but still can be delivered locally.
1728 */ 1744 */
1729 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1745 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730 goto dont_forward; 1746 goto dont_forward;
1731 1747
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1748 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
1736 } 1752 }
1737 1753
1738 if (!local) { 1754 if (!local) {
1739 if (IPCB(skb)->opt.router_alert) { 1755 if (IPCB(skb)->opt.router_alert) {
1740 if (ip_call_ra_chain(skb)) 1756 if (ip_call_ra_chain(skb))
1741 return 0; 1757 return 0;
1742 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1758 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743 /* IGMPv1 (and broken IGMPv2 implementations sort of 1759 /* IGMPv1 (and broken IGMPv2 implementations sort of
1744 Cisco IOS <= 11.2(8)) do not put router alert 1760 * Cisco IOS <= 11.2(8)) do not put router alert
1745 option to IGMP packets destined to routable 1761 * option to IGMP packets destined to routable
1746 groups. It is very bad, because it means 1762 * groups. It is very bad, because it means
1747 that we can forward NO IGMP messages. 1763 * that we can forward NO IGMP messages.
1748 */ 1764 */
1749 read_lock(&mrt_lock); 1765 struct sock *mroute_sk;
1750 if (mrt->mroute_sk) { 1766
1751 nf_reset(skb); 1767 mroute_sk = rcu_dereference(mrt->mroute_sk);
1752 raw_rcv(mrt->mroute_sk, skb); 1768 if (mroute_sk) {
1753 read_unlock(&mrt_lock); 1769 nf_reset(skb);
1754 return 0; 1770 raw_rcv(mroute_sk, skb);
1755 } 1771 return 0;
1756 read_unlock(&mrt_lock); 1772 }
1757 } 1773 }
1758 } 1774 }
1759 1775
1760 read_lock(&mrt_lock); 1776 /* already under rcu_read_lock() */
1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1777 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 1778
1763 /* 1779 /*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
1769 if (local) { 1785 if (local) {
1770 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1786 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 ip_local_deliver(skb); 1787 ip_local_deliver(skb);
1772 if (skb2 == NULL) { 1788 if (skb2 == NULL)
1773 read_unlock(&mrt_lock);
1774 return -ENOBUFS; 1789 return -ENOBUFS;
1775 }
1776 skb = skb2; 1790 skb = skb2;
1777 } 1791 }
1778 1792
1793 read_lock(&mrt_lock);
1779 vif = ipmr_find_vif(mrt, skb->dev); 1794 vif = ipmr_find_vif(mrt, skb->dev);
1780 if (vif >= 0) { 1795 if (vif >= 0) {
1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb); 1796 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
1788 return -ENODEV; 1803 return -ENODEV;
1789 } 1804 }
1790 1805
1806 read_lock(&mrt_lock);
1791 ip_mr_forward(net, mrt, skb, cache, local); 1807 ip_mr_forward(net, mrt, skb, cache, local);
1792
1793 read_unlock(&mrt_lock); 1808 read_unlock(&mrt_lock);
1794 1809
1795 if (local) 1810 if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
1805} 1820}
1806 1821
1807#ifdef CONFIG_IP_PIMSM 1822#ifdef CONFIG_IP_PIMSM
1823/* called with rcu_read_lock() */
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, 1824static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen) 1825 unsigned int pimlen)
1810{ 1826{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1813 1829
1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1830 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 /* 1831 /*
1816 Check that: 1832 * Check that:
1817 a. packet is really destinted to a multicast group 1833 * a. packet is really sent to a multicast group
1818 b. packet is not a NULL-REGISTER 1834 * b. packet is not a NULL-REGISTER
1819 c. packet is not truncated 1835 * c. packet is not truncated
1820 */ 1836 */
1821 if (!ipv4_is_multicast(encap->daddr) || 1837 if (!ipv4_is_multicast(encap->daddr) ||
1822 encap->tot_len == 0 || 1838 encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1826 read_lock(&mrt_lock); 1842 read_lock(&mrt_lock);
1827 if (mrt->mroute_reg_vif_num >= 0) 1843 if (mrt->mroute_reg_vif_num >= 0)
1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 1844 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 if (reg_dev)
1830 dev_hold(reg_dev);
1831 read_unlock(&mrt_lock); 1845 read_unlock(&mrt_lock);
1832 1846
1833 if (reg_dev == NULL) 1847 if (reg_dev == NULL)
1834 return 1; 1848 return 1;
1835 1849
1836 skb->mac_header = skb->network_header; 1850 skb->mac_header = skb->network_header;
1837 skb_pull(skb, (u8*)encap - skb->data); 1851 skb_pull(skb, (u8 *)encap - skb->data);
1838 skb_reset_network_header(skb); 1852 skb_reset_network_header(skb);
1839 skb->protocol = htons(ETH_P_IP); 1853 skb->protocol = htons(ETH_P_IP);
1840 skb->ip_summed = 0; 1854 skb->ip_summed = CHECKSUM_NONE;
1841 skb->pkt_type = PACKET_HOST; 1855 skb->pkt_type = PACKET_HOST;
1842 1856
1843 skb_tunnel_rx(skb, reg_dev); 1857 skb_tunnel_rx(skb, reg_dev);
1844 1858
1845 netif_rx(skb); 1859 netif_rx(skb);
1846 dev_put(reg_dev);
1847 1860
1848 return 0; 1861 return NET_RX_SUCCESS;
1849} 1862}
1850#endif 1863#endif
1851 1864
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1854 * Handle IGMP messages of PIMv1 1867 * Handle IGMP messages of PIMv1
1855 */ 1868 */
1856 1869
1857int pim_rcv_v1(struct sk_buff * skb) 1870int pim_rcv_v1(struct sk_buff *skb)
1858{ 1871{
1859 struct igmphdr *pim; 1872 struct igmphdr *pim;
1860 struct net *net = dev_net(skb->dev); 1873 struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
1881#endif 1894#endif
1882 1895
1883#ifdef CONFIG_IP_PIMSM_V2 1896#ifdef CONFIG_IP_PIMSM_V2
1884static int pim_rcv(struct sk_buff * skb) 1897static int pim_rcv(struct sk_buff *skb)
1885{ 1898{
1886 struct pimreghdr *pim; 1899 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev); 1900 struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
1891 goto drop; 1904 goto drop;
1892 1905
1893 pim = (struct pimreghdr *)skb_transport_header(skb); 1906 pim = (struct pimreghdr *)skb_transport_header(skb);
1894 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1907 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1895 (pim->flags&PIM_NULL_REGISTER) || 1908 (pim->flags & PIM_NULL_REGISTER) ||
1896 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1909 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1910 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 goto drop; 1911 goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
1958 if (mrt == NULL) 1971 if (mrt == NULL)
1959 return -ENOENT; 1972 return -ENOENT;
1960 1973
1961 read_lock(&mrt_lock); 1974 rcu_read_lock();
1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); 1975 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1963 1976
1964 if (cache == NULL) { 1977 if (cache == NULL) {
1965 struct sk_buff *skb2; 1978 struct sk_buff *skb2;
1966 struct iphdr *iph; 1979 struct iphdr *iph;
1967 struct net_device *dev; 1980 struct net_device *dev;
1968 int vif; 1981 int vif = -1;
1969 1982
1970 if (nowait) { 1983 if (nowait) {
1971 read_unlock(&mrt_lock); 1984 rcu_read_unlock();
1972 return -EAGAIN; 1985 return -EAGAIN;
1973 } 1986 }
1974 1987
1975 dev = skb->dev; 1988 dev = skb->dev;
1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { 1989 read_lock(&mrt_lock);
1990 if (dev)
1991 vif = ipmr_find_vif(mrt, dev);
1992 if (vif < 0) {
1977 read_unlock(&mrt_lock); 1993 read_unlock(&mrt_lock);
1994 rcu_read_unlock();
1978 return -ENODEV; 1995 return -ENODEV;
1979 } 1996 }
1980 skb2 = skb_clone(skb, GFP_ATOMIC); 1997 skb2 = skb_clone(skb, GFP_ATOMIC);
1981 if (!skb2) { 1998 if (!skb2) {
1982 read_unlock(&mrt_lock); 1999 read_unlock(&mrt_lock);
2000 rcu_read_unlock();
1983 return -ENOMEM; 2001 return -ENOMEM;
1984 } 2002 }
1985 2003
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
1992 iph->version = 0; 2010 iph->version = 0;
1993 err = ipmr_cache_unresolved(mrt, vif, skb2); 2011 err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 read_unlock(&mrt_lock); 2012 read_unlock(&mrt_lock);
2013 rcu_read_unlock();
1995 return err; 2014 return err;
1996 } 2015 }
1997 2016
1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2017 read_lock(&mrt_lock);
2018 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1999 cache->mfc_flags |= MFC_NOTIFY; 2019 cache->mfc_flags |= MFC_NOTIFY;
2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm); 2020 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 read_unlock(&mrt_lock); 2021 read_unlock(&mrt_lock);
2022 rcu_read_unlock();
2002 return err; 2023 return err;
2003} 2024}
2004 2025
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2050 s_h = cb->args[1]; 2071 s_h = cb->args[1];
2051 s_e = cb->args[2]; 2072 s_e = cb->args[2];
2052 2073
2053 read_lock(&mrt_lock); 2074 rcu_read_lock();
2054 ipmr_for_each_table(mrt, net) { 2075 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t) 2076 if (t < s_t)
2056 goto next_table; 2077 goto next_table;
2057 if (t > s_t) 2078 if (t > s_t)
2058 s_h = 0; 2079 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) { 2080 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { 2081 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e) 2082 if (e < s_e)
2062 goto next_entry; 2083 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb, 2084 if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
2075 t++; 2096 t++;
2076 } 2097 }
2077done: 2098done:
2078 read_unlock(&mrt_lock); 2099 rcu_read_unlock();
2079 2100
2080 cb->args[2] = e; 2101 cb->args[2] = e;
2081 cb->args[1] = h; 2102 cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
2086 2107
2087#ifdef CONFIG_PROC_FS 2108#ifdef CONFIG_PROC_FS
2088/* 2109/*
2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2110 * The /proc interfaces to multicast routing :
2111 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2090 */ 2112 */
2091struct ipmr_vif_iter { 2113struct ipmr_vif_iter {
2092 struct seq_net_private p; 2114 struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2208 struct mr_table *mrt = it->mrt; 2230 struct mr_table *mrt = it->mrt;
2209 struct mfc_cache *mfc; 2231 struct mfc_cache *mfc;
2210 2232
2211 read_lock(&mrt_lock); 2233 rcu_read_lock();
2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2234 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 it->cache = &mrt->mfc_cache_array[it->ct]; 2235 it->cache = &mrt->mfc_cache_array[it->ct];
2214 list_for_each_entry(mfc, it->cache, list) 2236 list_for_each_entry_rcu(mfc, it->cache, list)
2215 if (pos-- == 0) 2237 if (pos-- == 0)
2216 return mfc; 2238 return mfc;
2217 } 2239 }
2218 read_unlock(&mrt_lock); 2240 rcu_read_unlock();
2219 2241
2220 spin_lock_bh(&mfc_unres_lock); 2242 spin_lock_bh(&mfc_unres_lock);
2221 it->cache = &mrt->mfc_unres_queue; 2243 it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 } 2296 }
2275 2297
2276 /* exhausted cache_array, show unresolved */ 2298 /* exhausted cache_array, show unresolved */
2277 read_unlock(&mrt_lock); 2299 rcu_read_unlock();
2278 it->cache = &mrt->mfc_unres_queue; 2300 it->cache = &mrt->mfc_unres_queue;
2279 it->ct = 0; 2301 it->ct = 0;
2280 2302
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 if (!list_empty(it->cache)) 2304 if (!list_empty(it->cache))
2283 return list_first_entry(it->cache, struct mfc_cache, list); 2305 return list_first_entry(it->cache, struct mfc_cache, list);
2284 2306
2285 end_of_list: 2307end_of_list:
2286 spin_unlock_bh(&mfc_unres_lock); 2308 spin_unlock_bh(&mfc_unres_lock);
2287 it->cache = NULL; 2309 it->cache = NULL;
2288 2310
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2297 if (it->cache == &mrt->mfc_unres_queue) 2319 if (it->cache == &mrt->mfc_unres_queue)
2298 spin_unlock_bh(&mfc_unres_lock); 2320 spin_unlock_bh(&mfc_unres_lock);
2299 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2321 else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 read_unlock(&mrt_lock); 2322 rcu_read_unlock();
2301} 2323}
2302 2324
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2325static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2323 mfc->mfc_un.res.bytes, 2345 mfc->mfc_un.res.bytes,
2324 mfc->mfc_un.res.wrong_if); 2346 mfc->mfc_un.res.wrong_if);
2325 for (n = mfc->mfc_un.res.minvif; 2347 for (n = mfc->mfc_un.res.minvif;
2326 n < mfc->mfc_un.res.maxvif; n++ ) { 2348 n < mfc->mfc_un.res.maxvif; n++) {
2327 if (VIF_EXISTS(mrt, n) && 2349 if (VIF_EXISTS(mrt, n) &&
2328 mfc->mfc_un.res.ttls[n] < 255) 2350 mfc->mfc_un.res.ttls[n] < 255)
2329 seq_printf(seq, 2351 seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
2421 2443
2422 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2444 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 sizeof(struct mfc_cache), 2445 sizeof(struct mfc_cache),
2424 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2446 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2425 NULL); 2447 NULL);
2426 if (!mrt_cachep) 2448 if (!mrt_cachep)
2427 return -ENOMEM; 2449 return -ENOMEM;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..8e3350643b63 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
324 324
325config IP_NF_TARGET_TTL 325config IP_NF_TARGET_TTL
326 tristate '"TTL" target support' 326 tristate '"TTL" target support'
327 depends on NETFILTER_ADVANCED 327 depends on NETFILTER_ADVANCED && IP_NF_MANGLE
328 select NETFILTER_XT_TARGET_HL 328 select NETFILTER_XT_TARGET_HL
329 ---help--- 329 ---help---
330 This is a backwards-compat option for the user's convenience 330 This is a backwards-compatible option for the user's convenience
331 (e.g. when running oldconfig). It selects 331 (e.g. when running oldconfig). It selects
332 CONFIG_NETFILTER_XT_TARGET_HL. 332 CONFIG_NETFILTER_XT_TARGET_HL.
333 333
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..3cad2591ace0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
72 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
74 74
75 return (ret != 0); 75 return ret != 0;
76} 76}
77 77
78/* 78/*
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
228 return NF_DROP; 228 return NF_DROP;
229} 229}
230 230
231static inline const struct arpt_entry_target * 231static inline const struct xt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e) 232arpt_get_target_c(const struct arpt_entry *e)
233{ 233{
234 return arpt_get_target((struct arpt_entry *)e); 234 return arpt_get_target((struct arpt_entry *)e);
@@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
282 282
283 arp = arp_hdr(skb); 283 arp = arp_hdr(skb);
284 do { 284 do {
285 const struct arpt_entry_target *t; 285 const struct xt_entry_target *t;
286 286
287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
288 e = arpt_next_entry(e); 288 e = arpt_next_entry(e);
@@ -297,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
297 if (!t->u.kernel.target->target) { 297 if (!t->u.kernel.target->target) {
298 int v; 298 int v;
299 299
300 v = ((struct arpt_standard_target *)t)->verdict; 300 v = ((struct xt_standard_target *)t)->verdict;
301 if (v < 0) { 301 if (v < 0) {
302 /* Pop from stack? */ 302 /* Pop from stack? */
303 if (v != ARPT_RETURN) { 303 if (v != XT_RETURN) {
304 verdict = (unsigned)(-v) - 1; 304 verdict = (unsigned)(-v) - 1;
305 break; 305 break;
306 } 306 }
@@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
332 /* Target might have changed stuff. */ 332 /* Target might have changed stuff. */
333 arp = arp_hdr(skb); 333 arp = arp_hdr(skb);
334 334
335 if (verdict == ARPT_CONTINUE) 335 if (verdict == XT_CONTINUE)
336 e = arpt_next_entry(e); 336 e = arpt_next_entry(e);
337 else 337 else
338 /* Verdict */ 338 /* Verdict */
@@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
377 e->counters.pcnt = pos; 377 e->counters.pcnt = pos;
378 378
379 for (;;) { 379 for (;;) {
380 const struct arpt_standard_target *t 380 const struct xt_standard_target *t
381 = (void *)arpt_get_target_c(e); 381 = (void *)arpt_get_target_c(e);
382 int visited = e->comefrom & (1 << hook); 382 int visited = e->comefrom & (1 << hook);
383 383
@@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
392 /* Unconditional return/END. */ 392 /* Unconditional return/END. */
393 if ((e->target_offset == sizeof(struct arpt_entry) && 393 if ((e->target_offset == sizeof(struct arpt_entry) &&
394 (strcmp(t->target.u.user.name, 394 (strcmp(t->target.u.user.name,
395 ARPT_STANDARD_TARGET) == 0) && 395 XT_STANDARD_TARGET) == 0) &&
396 t->verdict < 0 && unconditional(&e->arp)) || 396 t->verdict < 0 && unconditional(&e->arp)) ||
397 visited) { 397 visited) {
398 unsigned int oldpos, size; 398 unsigned int oldpos, size;
399 399
400 if ((strcmp(t->target.u.user.name, 400 if ((strcmp(t->target.u.user.name,
401 ARPT_STANDARD_TARGET) == 0) && 401 XT_STANDARD_TARGET) == 0) &&
402 t->verdict < -NF_MAX_VERDICT - 1) { 402 t->verdict < -NF_MAX_VERDICT - 1) {
403 duprintf("mark_source_chains: bad " 403 duprintf("mark_source_chains: bad "
404 "negative verdict (%i)\n", 404 "negative verdict (%i)\n",
@@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
433 int newpos = t->verdict; 433 int newpos = t->verdict;
434 434
435 if (strcmp(t->target.u.user.name, 435 if (strcmp(t->target.u.user.name,
436 ARPT_STANDARD_TARGET) == 0 && 436 XT_STANDARD_TARGET) == 0 &&
437 newpos >= 0) { 437 newpos >= 0) {
438 if (newpos > newinfo->size - 438 if (newpos > newinfo->size -
439 sizeof(struct arpt_entry)) { 439 sizeof(struct arpt_entry)) {
@@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
464 464
465static inline int check_entry(const struct arpt_entry *e, const char *name) 465static inline int check_entry(const struct arpt_entry *e, const char *name)
466{ 466{
467 const struct arpt_entry_target *t; 467 const struct xt_entry_target *t;
468 468
469 if (!arp_checkentry(&e->arp)) { 469 if (!arp_checkentry(&e->arp)) {
470 duprintf("arp_tables: arp check failed %p %s.\n", e, name); 470 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
471 return -EINVAL; 471 return -EINVAL;
472 } 472 }
473 473
474 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 474 if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
475 return -EINVAL; 475 return -EINVAL;
476 476
477 t = arpt_get_target_c(e); 477 t = arpt_get_target_c(e);
@@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
483 483
484static inline int check_target(struct arpt_entry *e, const char *name) 484static inline int check_target(struct arpt_entry *e, const char *name)
485{ 485{
486 struct arpt_entry_target *t = arpt_get_target(e); 486 struct xt_entry_target *t = arpt_get_target(e);
487 int ret; 487 int ret;
488 struct xt_tgchk_param par = { 488 struct xt_tgchk_param par = {
489 .table = name, 489 .table = name,
@@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
506static inline int 506static inline int
507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) 507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
508{ 508{
509 struct arpt_entry_target *t; 509 struct xt_entry_target *t;
510 struct xt_target *target; 510 struct xt_target *target;
511 int ret; 511 int ret;
512 512
@@ -536,7 +536,7 @@ out:
536 536
537static bool check_underflow(const struct arpt_entry *e) 537static bool check_underflow(const struct arpt_entry *e)
538{ 538{
539 const struct arpt_entry_target *t; 539 const struct xt_entry_target *t;
540 unsigned int verdict; 540 unsigned int verdict;
541 541
542 if (!unconditional(&e->arp)) 542 if (!unconditional(&e->arp))
@@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
544 t = arpt_get_target_c(e); 544 t = arpt_get_target_c(e);
545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
546 return false; 546 return false;
547 verdict = ((struct arpt_standard_target *)t)->verdict; 547 verdict = ((struct xt_standard_target *)t)->verdict;
548 verdict = -verdict - 1; 548 verdict = -verdict - 1;
549 return verdict == NF_DROP || verdict == NF_ACCEPT; 549 return verdict == NF_DROP || verdict == NF_ACCEPT;
550} 550}
@@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
566 } 566 }
567 567
568 if (e->next_offset 568 if (e->next_offset
569 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { 569 < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
570 duprintf("checking: element %p size %u\n", 570 duprintf("checking: element %p size %u\n",
571 e, e->next_offset); 571 e, e->next_offset);
572 return -EINVAL; 572 return -EINVAL;
@@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
598static inline void cleanup_entry(struct arpt_entry *e) 598static inline void cleanup_entry(struct arpt_entry *e)
599{ 599{
600 struct xt_tgdtor_param par; 600 struct xt_tgdtor_param par;
601 struct arpt_entry_target *t; 601 struct xt_entry_target *t;
602 602
603 t = arpt_get_target(e); 603 t = arpt_get_target(e);
604 par.target = t->u.kernel.target; 604 par.target = t->u.kernel.target;
@@ -794,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
794 /* FIXME: use iterator macros --RR */ 794 /* FIXME: use iterator macros --RR */
795 /* ... then go back and fix counters and names */ 795 /* ... then go back and fix counters and names */
796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
797 const struct arpt_entry_target *t; 797 const struct xt_entry_target *t;
798 798
799 e = (struct arpt_entry *)(loc_cpu_entry + off); 799 e = (struct arpt_entry *)(loc_cpu_entry + off);
800 if (copy_to_user(userptr + off 800 if (copy_to_user(userptr + off
@@ -807,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
807 807
808 t = arpt_get_target_c(e); 808 t = arpt_get_target_c(e);
809 if (copy_to_user(userptr + off + e->target_offset 809 if (copy_to_user(userptr + off + e->target_offset
810 + offsetof(struct arpt_entry_target, 810 + offsetof(struct xt_entry_target,
811 u.user.name), 811 u.user.name),
812 t->u.kernel.target->name, 812 t->u.kernel.target->name,
813 strlen(t->u.kernel.target->name)+1) != 0) { 813 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -844,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
844 const struct xt_table_info *info, 844 const struct xt_table_info *info,
845 const void *base, struct xt_table_info *newinfo) 845 const void *base, struct xt_table_info *newinfo)
846{ 846{
847 const struct arpt_entry_target *t; 847 const struct xt_entry_target *t;
848 unsigned int entry_offset; 848 unsigned int entry_offset;
849 int off, i, ret; 849 int off, i, ret;
850 850
@@ -895,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
895static int get_info(struct net *net, void __user *user, 895static int get_info(struct net *net, void __user *user,
896 const int *len, int compat) 896 const int *len, int compat)
897{ 897{
898 char name[ARPT_TABLE_MAXNAMELEN]; 898 char name[XT_TABLE_MAXNAMELEN];
899 struct xt_table *t; 899 struct xt_table *t;
900 int ret; 900 int ret;
901 901
@@ -908,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
908 if (copy_from_user(name, user, sizeof(name)) != 0) 908 if (copy_from_user(name, user, sizeof(name)) != 0)
909 return -EFAULT; 909 return -EFAULT;
910 910
911 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 911 name[XT_TABLE_MAXNAMELEN-1] = '\0';
912#ifdef CONFIG_COMPAT 912#ifdef CONFIG_COMPAT
913 if (compat) 913 if (compat)
914 xt_compat_lock(NFPROTO_ARP); 914 xt_compat_lock(NFPROTO_ARP);
@@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1204#ifdef CONFIG_COMPAT 1204#ifdef CONFIG_COMPAT
1205static inline void compat_release_entry(struct compat_arpt_entry *e) 1205static inline void compat_release_entry(struct compat_arpt_entry *e)
1206{ 1206{
1207 struct arpt_entry_target *t; 1207 struct xt_entry_target *t;
1208 1208
1209 t = compat_arpt_get_target(e); 1209 t = compat_arpt_get_target(e);
1210 module_put(t->u.kernel.target->me); 1210 module_put(t->u.kernel.target->me);
@@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1220 const unsigned int *underflows, 1220 const unsigned int *underflows,
1221 const char *name) 1221 const char *name)
1222{ 1222{
1223 struct arpt_entry_target *t; 1223 struct xt_entry_target *t;
1224 struct xt_target *target; 1224 struct xt_target *target;
1225 unsigned int entry_offset; 1225 unsigned int entry_offset;
1226 int ret, off, h; 1226 int ret, off, h;
@@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1288 unsigned int *size, const char *name, 1288 unsigned int *size, const char *name,
1289 struct xt_table_info *newinfo, unsigned char *base) 1289 struct xt_table_info *newinfo, unsigned char *base)
1290{ 1290{
1291 struct arpt_entry_target *t; 1291 struct xt_entry_target *t;
1292 struct xt_target *target; 1292 struct xt_target *target;
1293 struct arpt_entry *de; 1293 struct arpt_entry *de;
1294 unsigned int origsize; 1294 unsigned int origsize;
@@ -1474,7 +1474,7 @@ out_unlock:
1474} 1474}
1475 1475
1476struct compat_arpt_replace { 1476struct compat_arpt_replace {
1477 char name[ARPT_TABLE_MAXNAMELEN]; 1477 char name[XT_TABLE_MAXNAMELEN];
1478 u32 valid_hooks; 1478 u32 valid_hooks;
1479 u32 num_entries; 1479 u32 num_entries;
1480 u32 size; 1480 u32 size;
@@ -1567,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1567 struct xt_counters *counters, 1567 struct xt_counters *counters,
1568 unsigned int i) 1568 unsigned int i)
1569{ 1569{
1570 struct arpt_entry_target *t; 1570 struct xt_entry_target *t;
1571 struct compat_arpt_entry __user *ce; 1571 struct compat_arpt_entry __user *ce;
1572 u_int16_t target_offset, next_offset; 1572 u_int16_t target_offset, next_offset;
1573 compat_uint_t origsize; 1573 compat_uint_t origsize;
@@ -1628,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1628} 1628}
1629 1629
1630struct compat_arpt_get_entries { 1630struct compat_arpt_get_entries {
1631 char name[ARPT_TABLE_MAXNAMELEN]; 1631 char name[XT_TABLE_MAXNAMELEN];
1632 compat_uint_t size; 1632 compat_uint_t size;
1633 struct compat_arpt_entry entrytable[0]; 1633 struct compat_arpt_entry entrytable[0];
1634}; 1634};
@@ -1828,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
1828/* The built-in targets: standard (NULL) and error. */ 1828/* The built-in targets: standard (NULL) and error. */
1829static struct xt_target arpt_builtin_tg[] __read_mostly = { 1829static struct xt_target arpt_builtin_tg[] __read_mostly = {
1830 { 1830 {
1831 .name = ARPT_STANDARD_TARGET, 1831 .name = XT_STANDARD_TARGET,
1832 .targetsize = sizeof(int), 1832 .targetsize = sizeof(int),
1833 .family = NFPROTO_ARP, 1833 .family = NFPROTO_ARP,
1834#ifdef CONFIG_COMPAT 1834#ifdef CONFIG_COMPAT
@@ -1838,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
1838#endif 1838#endif
1839 }, 1839 },
1840 { 1840 {
1841 .name = ARPT_ERROR_TARGET, 1841 .name = XT_ERROR_TARGET,
1842 .target = arpt_error, 1842 .target = arpt_error,
1843 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1843 .targetsize = XT_FUNCTION_MAXNAMELEN,
1844 .family = NFPROTO_ARP, 1844 .family = NFPROTO_ARP,
1845 }, 1845 },
1846}; 1846};
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..b8ddcc480ed9 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
63 return false; 63 return false;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != ARPT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return false;
68 return true; 68 return true;
69} 69}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d163f2e3b2e9..d31b007a6d80 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
186} 186}
187 187
188/* for const-correctness */ 188/* for const-correctness */
189static inline const struct ipt_entry_target * 189static inline const struct xt_entry_target *
190ipt_get_target_c(const struct ipt_entry *e) 190ipt_get_target_c(const struct ipt_entry *e)
191{ 191{
192 return ipt_get_target((struct ipt_entry *)e); 192 return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
230 const char *hookname, const char **chainname, 230 const char *hookname, const char **chainname,
231 const char **comment, unsigned int *rulenum) 231 const char **comment, unsigned int *rulenum)
232{ 232{
233 const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); 233 const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
234 234
235 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 235 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
236 /* Head of user chain: ERROR target with chainname */ 236 /* Head of user chain: ERROR target with chainname */
237 *chainname = t->target.data; 237 *chainname = t->target.data;
238 (*rulenum) = 0; 238 (*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
241 241
242 if (s->target_offset == sizeof(struct ipt_entry) && 242 if (s->target_offset == sizeof(struct ipt_entry) &&
243 strcmp(t->target.u.kernel.target->name, 243 strcmp(t->target.u.kernel.target->name,
244 IPT_STANDARD_TARGET) == 0 && 244 XT_STANDARD_TARGET) == 0 &&
245 t->verdict < 0 && 245 t->verdict < 0 &&
246 unconditional(&s->ip)) { 246 unconditional(&s->ip)) {
247 /* Tail of chains: STANDARD target (return/policy) */ 247 /* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
346 get_entry(table_base, private->underflow[hook])); 346 get_entry(table_base, private->underflow[hook]));
347 347
348 do { 348 do {
349 const struct ipt_entry_target *t; 349 const struct xt_entry_target *t;
350 const struct xt_entry_match *ematch; 350 const struct xt_entry_match *ematch;
351 351
352 IP_NF_ASSERT(e); 352 IP_NF_ASSERT(e);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
380 if (!t->u.kernel.target->target) { 380 if (!t->u.kernel.target->target) {
381 int v; 381 int v;
382 382
383 v = ((struct ipt_standard_target *)t)->verdict; 383 v = ((struct xt_standard_target *)t)->verdict;
384 if (v < 0) { 384 if (v < 0) {
385 /* Pop from stack? */ 385 /* Pop from stack? */
386 if (v != IPT_RETURN) { 386 if (v != XT_RETURN) {
387 verdict = (unsigned)(-v) - 1; 387 verdict = (unsigned)(-v) - 1;
388 break; 388 break;
389 } 389 }
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
421 verdict = t->u.kernel.target->target(skb, &acpar); 421 verdict = t->u.kernel.target->target(skb, &acpar);
422 /* Target might have changed stuff. */ 422 /* Target might have changed stuff. */
423 ip = ip_hdr(skb); 423 ip = ip_hdr(skb);
424 if (verdict == IPT_CONTINUE) 424 if (verdict == XT_CONTINUE)
425 e = ipt_next_entry(e); 425 e = ipt_next_entry(e);
426 else 426 else
427 /* Verdict */ 427 /* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
461 e->counters.pcnt = pos; 461 e->counters.pcnt = pos;
462 462
463 for (;;) { 463 for (;;) {
464 const struct ipt_standard_target *t 464 const struct xt_standard_target *t
465 = (void *)ipt_get_target_c(e); 465 = (void *)ipt_get_target_c(e);
466 int visited = e->comefrom & (1 << hook); 466 int visited = e->comefrom & (1 << hook);
467 467
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
475 /* Unconditional return/END. */ 475 /* Unconditional return/END. */
476 if ((e->target_offset == sizeof(struct ipt_entry) && 476 if ((e->target_offset == sizeof(struct ipt_entry) &&
477 (strcmp(t->target.u.user.name, 477 (strcmp(t->target.u.user.name,
478 IPT_STANDARD_TARGET) == 0) && 478 XT_STANDARD_TARGET) == 0) &&
479 t->verdict < 0 && unconditional(&e->ip)) || 479 t->verdict < 0 && unconditional(&e->ip)) ||
480 visited) { 480 visited) {
481 unsigned int oldpos, size; 481 unsigned int oldpos, size;
482 482
483 if ((strcmp(t->target.u.user.name, 483 if ((strcmp(t->target.u.user.name,
484 IPT_STANDARD_TARGET) == 0) && 484 XT_STANDARD_TARGET) == 0) &&
485 t->verdict < -NF_MAX_VERDICT - 1) { 485 t->verdict < -NF_MAX_VERDICT - 1) {
486 duprintf("mark_source_chains: bad " 486 duprintf("mark_source_chains: bad "
487 "negative verdict (%i)\n", 487 "negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
524 int newpos = t->verdict; 524 int newpos = t->verdict;
525 525
526 if (strcmp(t->target.u.user.name, 526 if (strcmp(t->target.u.user.name,
527 IPT_STANDARD_TARGET) == 0 && 527 XT_STANDARD_TARGET) == 0 &&
528 newpos >= 0) { 528 newpos >= 0) {
529 if (newpos > newinfo->size - 529 if (newpos > newinfo->size -
530 sizeof(struct ipt_entry)) { 530 sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
552 return 1; 552 return 1;
553} 553}
554 554
555static void cleanup_match(struct ipt_entry_match *m, struct net *net) 555static void cleanup_match(struct xt_entry_match *m, struct net *net)
556{ 556{
557 struct xt_mtdtor_param par; 557 struct xt_mtdtor_param par;
558 558
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
568static int 568static int
569check_entry(const struct ipt_entry *e, const char *name) 569check_entry(const struct ipt_entry *e, const char *name)
570{ 570{
571 const struct ipt_entry_target *t; 571 const struct xt_entry_target *t;
572 572
573 if (!ip_checkentry(&e->ip)) { 573 if (!ip_checkentry(&e->ip)) {
574 duprintf("ip check failed %p %s.\n", e, par->match->name); 574 duprintf("ip check failed %p %s.\n", e, par->match->name);
575 return -EINVAL; 575 return -EINVAL;
576 } 576 }
577 577
578 if (e->target_offset + sizeof(struct ipt_entry_target) > 578 if (e->target_offset + sizeof(struct xt_entry_target) >
579 e->next_offset) 579 e->next_offset)
580 return -EINVAL; 580 return -EINVAL;
581 581
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
587} 587}
588 588
589static int 589static int
590check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 590check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
591{ 591{
592 const struct ipt_ip *ip = par->entryinfo; 592 const struct ipt_ip *ip = par->entryinfo;
593 int ret; 593 int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
605} 605}
606 606
607static int 607static int
608find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 608find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
609{ 609{
610 struct xt_match *match; 610 struct xt_match *match;
611 int ret; 611 int ret;
@@ -630,7 +630,7 @@ err:
630 630
631static int check_target(struct ipt_entry *e, struct net *net, const char *name) 631static int check_target(struct ipt_entry *e, struct net *net, const char *name)
632{ 632{
633 struct ipt_entry_target *t = ipt_get_target(e); 633 struct xt_entry_target *t = ipt_get_target(e);
634 struct xt_tgchk_param par = { 634 struct xt_tgchk_param par = {
635 .net = net, 635 .net = net,
636 .table = name, 636 .table = name,
@@ -656,7 +656,7 @@ static int
656find_check_entry(struct ipt_entry *e, struct net *net, const char *name, 656find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
657 unsigned int size) 657 unsigned int size)
658{ 658{
659 struct ipt_entry_target *t; 659 struct xt_entry_target *t;
660 struct xt_target *target; 660 struct xt_target *target;
661 int ret; 661 int ret;
662 unsigned int j; 662 unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
707 707
708static bool check_underflow(const struct ipt_entry *e) 708static bool check_underflow(const struct ipt_entry *e)
709{ 709{
710 const struct ipt_entry_target *t; 710 const struct xt_entry_target *t;
711 unsigned int verdict; 711 unsigned int verdict;
712 712
713 if (!unconditional(&e->ip)) 713 if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
715 t = ipt_get_target_c(e); 715 t = ipt_get_target_c(e);
716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
717 return false; 717 return false;
718 verdict = ((struct ipt_standard_target *)t)->verdict; 718 verdict = ((struct xt_standard_target *)t)->verdict;
719 verdict = -verdict - 1; 719 verdict = -verdict - 1;
720 return verdict == NF_DROP || verdict == NF_ACCEPT; 720 return verdict == NF_DROP || verdict == NF_ACCEPT;
721} 721}
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
738 } 738 }
739 739
740 if (e->next_offset 740 if (e->next_offset
741 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { 741 < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
742 duprintf("checking: element %p size %u\n", 742 duprintf("checking: element %p size %u\n",
743 e, e->next_offset); 743 e, e->next_offset);
744 return -EINVAL; 744 return -EINVAL;
@@ -771,7 +771,7 @@ static void
771cleanup_entry(struct ipt_entry *e, struct net *net) 771cleanup_entry(struct ipt_entry *e, struct net *net)
772{ 772{
773 struct xt_tgdtor_param par; 773 struct xt_tgdtor_param par;
774 struct ipt_entry_target *t; 774 struct xt_entry_target *t;
775 struct xt_entry_match *ematch; 775 struct xt_entry_match *ematch;
776 776
777 /* Cleanup all matches */ 777 /* Cleanup all matches */
@@ -972,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
972 /* ... then go back and fix counters and names */ 972 /* ... then go back and fix counters and names */
973 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 973 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
974 unsigned int i; 974 unsigned int i;
975 const struct ipt_entry_match *m; 975 const struct xt_entry_match *m;
976 const struct ipt_entry_target *t; 976 const struct xt_entry_target *t;
977 977
978 e = (struct ipt_entry *)(loc_cpu_entry + off); 978 e = (struct ipt_entry *)(loc_cpu_entry + off);
979 if (copy_to_user(userptr + off 979 if (copy_to_user(userptr + off
@@ -990,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
990 m = (void *)e + i; 990 m = (void *)e + i;
991 991
992 if (copy_to_user(userptr + off + i 992 if (copy_to_user(userptr + off + i
993 + offsetof(struct ipt_entry_match, 993 + offsetof(struct xt_entry_match,
994 u.user.name), 994 u.user.name),
995 m->u.kernel.match->name, 995 m->u.kernel.match->name,
996 strlen(m->u.kernel.match->name)+1) 996 strlen(m->u.kernel.match->name)+1)
@@ -1002,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
1002 1002
1003 t = ipt_get_target_c(e); 1003 t = ipt_get_target_c(e);
1004 if (copy_to_user(userptr + off + e->target_offset 1004 if (copy_to_user(userptr + off + e->target_offset
1005 + offsetof(struct ipt_entry_target, 1005 + offsetof(struct xt_entry_target,
1006 u.user.name), 1006 u.user.name),
1007 t->u.kernel.target->name, 1007 t->u.kernel.target->name,
1008 strlen(t->u.kernel.target->name)+1) != 0) { 1008 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1040,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
1040 const void *base, struct xt_table_info *newinfo) 1040 const void *base, struct xt_table_info *newinfo)
1041{ 1041{
1042 const struct xt_entry_match *ematch; 1042 const struct xt_entry_match *ematch;
1043 const struct ipt_entry_target *t; 1043 const struct xt_entry_target *t;
1044 unsigned int entry_offset; 1044 unsigned int entry_offset;
1045 int off, i, ret; 1045 int off, i, ret;
1046 1046
@@ -1092,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
1092static int get_info(struct net *net, void __user *user, 1092static int get_info(struct net *net, void __user *user,
1093 const int *len, int compat) 1093 const int *len, int compat)
1094{ 1094{
1095 char name[IPT_TABLE_MAXNAMELEN]; 1095 char name[XT_TABLE_MAXNAMELEN];
1096 struct xt_table *t; 1096 struct xt_table *t;
1097 int ret; 1097 int ret;
1098 1098
@@ -1105,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
1105 if (copy_from_user(name, user, sizeof(name)) != 0) 1105 if (copy_from_user(name, user, sizeof(name)) != 0)
1106 return -EFAULT; 1106 return -EFAULT;
1107 1107
1108 name[IPT_TABLE_MAXNAMELEN-1] = '\0'; 1108 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1109#ifdef CONFIG_COMPAT 1109#ifdef CONFIG_COMPAT
1110 if (compat) 1110 if (compat)
1111 xt_compat_lock(AF_INET); 1111 xt_compat_lock(AF_INET);
@@ -1400,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
1400 1400
1401#ifdef CONFIG_COMPAT 1401#ifdef CONFIG_COMPAT
1402struct compat_ipt_replace { 1402struct compat_ipt_replace {
1403 char name[IPT_TABLE_MAXNAMELEN]; 1403 char name[XT_TABLE_MAXNAMELEN];
1404 u32 valid_hooks; 1404 u32 valid_hooks;
1405 u32 num_entries; 1405 u32 num_entries;
1406 u32 size; 1406 u32 size;
1407 u32 hook_entry[NF_INET_NUMHOOKS]; 1407 u32 hook_entry[NF_INET_NUMHOOKS];
1408 u32 underflow[NF_INET_NUMHOOKS]; 1408 u32 underflow[NF_INET_NUMHOOKS];
1409 u32 num_counters; 1409 u32 num_counters;
1410 compat_uptr_t counters; /* struct ipt_counters * */ 1410 compat_uptr_t counters; /* struct xt_counters * */
1411 struct compat_ipt_entry entries[0]; 1411 struct compat_ipt_entry entries[0];
1412}; 1412};
1413 1413
@@ -1416,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1416 unsigned int *size, struct xt_counters *counters, 1416 unsigned int *size, struct xt_counters *counters,
1417 unsigned int i) 1417 unsigned int i)
1418{ 1418{
1419 struct ipt_entry_target *t; 1419 struct xt_entry_target *t;
1420 struct compat_ipt_entry __user *ce; 1420 struct compat_ipt_entry __user *ce;
1421 u_int16_t target_offset, next_offset; 1421 u_int16_t target_offset, next_offset;
1422 compat_uint_t origsize; 1422 compat_uint_t origsize;
@@ -1451,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1451} 1451}
1452 1452
1453static int 1453static int
1454compat_find_calc_match(struct ipt_entry_match *m, 1454compat_find_calc_match(struct xt_entry_match *m,
1455 const char *name, 1455 const char *name,
1456 const struct ipt_ip *ip, 1456 const struct ipt_ip *ip,
1457 unsigned int hookmask, 1457 unsigned int hookmask,
@@ -1473,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
1473 1473
1474static void compat_release_entry(struct compat_ipt_entry *e) 1474static void compat_release_entry(struct compat_ipt_entry *e)
1475{ 1475{
1476 struct ipt_entry_target *t; 1476 struct xt_entry_target *t;
1477 struct xt_entry_match *ematch; 1477 struct xt_entry_match *ematch;
1478 1478
1479 /* Cleanup all matches */ 1479 /* Cleanup all matches */
@@ -1494,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1494 const char *name) 1494 const char *name)
1495{ 1495{
1496 struct xt_entry_match *ematch; 1496 struct xt_entry_match *ematch;
1497 struct ipt_entry_target *t; 1497 struct xt_entry_target *t;
1498 struct xt_target *target; 1498 struct xt_target *target;
1499 unsigned int entry_offset; 1499 unsigned int entry_offset;
1500 unsigned int j; 1500 unsigned int j;
@@ -1576,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1576 unsigned int *size, const char *name, 1576 unsigned int *size, const char *name,
1577 struct xt_table_info *newinfo, unsigned char *base) 1577 struct xt_table_info *newinfo, unsigned char *base)
1578{ 1578{
1579 struct ipt_entry_target *t; 1579 struct xt_entry_target *t;
1580 struct xt_target *target; 1580 struct xt_target *target;
1581 struct ipt_entry *de; 1581 struct ipt_entry *de;
1582 unsigned int origsize; 1582 unsigned int origsize;
@@ -1884,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1884} 1884}
1885 1885
1886struct compat_ipt_get_entries { 1886struct compat_ipt_get_entries {
1887 char name[IPT_TABLE_MAXNAMELEN]; 1887 char name[XT_TABLE_MAXNAMELEN];
1888 compat_uint_t size; 1888 compat_uint_t size;
1889 struct compat_ipt_entry entrytable[0]; 1889 struct compat_ipt_entry entrytable[0];
1890}; 1890};
@@ -2039,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2039 2039
2040 case IPT_SO_GET_REVISION_MATCH: 2040 case IPT_SO_GET_REVISION_MATCH:
2041 case IPT_SO_GET_REVISION_TARGET: { 2041 case IPT_SO_GET_REVISION_TARGET: {
2042 struct ipt_get_revision rev; 2042 struct xt_get_revision rev;
2043 int target; 2043 int target;
2044 2044
2045 if (*len != sizeof(rev)) { 2045 if (*len != sizeof(rev)) {
@@ -2176,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
2176 2176
2177static struct xt_target ipt_builtin_tg[] __read_mostly = { 2177static struct xt_target ipt_builtin_tg[] __read_mostly = {
2178 { 2178 {
2179 .name = IPT_STANDARD_TARGET, 2179 .name = XT_STANDARD_TARGET,
2180 .targetsize = sizeof(int), 2180 .targetsize = sizeof(int),
2181 .family = NFPROTO_IPV4, 2181 .family = NFPROTO_IPV4,
2182#ifdef CONFIG_COMPAT 2182#ifdef CONFIG_COMPAT
@@ -2186,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
2186#endif 2186#endif
2187 }, 2187 },
2188 { 2188 {
2189 .name = IPT_ERROR_TARGET, 2189 .name = XT_ERROR_TARGET,
2190 .target = ipt_error, 2190 .target = ipt_error,
2191 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2191 .targetsize = XT_FUNCTION_MAXNAMELEN,
2192 .family = NFPROTO_IPV4, 2192 .family = NFPROTO_IPV4,
2193 }, 2193 },
2194}; 2194};
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
30#include <net/net_namespace.h> 30#include <net/net_namespace.h>
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
32 33
33#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
34 35
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
231{ 232{
232 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
233 unsigned long hashval; 234 unsigned long hashval;
234 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
235 const u_int16_t *ports; 236 int poff;
236 237
237 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
238 case IPPROTO_TCP: 239 if (poff >= 0) {
239 case IPPROTO_UDP: 240 const u_int16_t *ports;
240 case IPPROTO_UDPLITE: 241 u16 _ports[2];
241 case IPPROTO_SCTP: 242
242 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
243 case IPPROTO_ICMP: 244 if (ports) {
244 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
245 sport = ports[0]; 246 dport = ports[1];
246 dport = ports[1]; 247 }
247 break; 248 } else {
248 default:
249 if (net_ratelimit()) 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol); 250 pr_info("unknown protocol %u\n", iph->protocol);
251 sport = dport = 0;
252 } 251 }
253 252
254 switch (config->hash_mode) { 253 switch (config->hash_mode) {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce2..72ffc8fda2e9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
24#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
25#include <linux/netfilter_ipv4/ipt_LOG.h> 25#include <linux/netfilter_ipv4/ipt_LOG.h>
26#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
27#include <net/netfilter/xt_log.h>
27 28
28MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 30MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
30MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); 31MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
31 32
32/* Use lock to serialize, so printks don't overlap */
33static DEFINE_SPINLOCK(log_lock);
34
35/* One level of recursion won't kill us */ 33/* One level of recursion won't kill us */
36static void dump_packet(const struct nf_loginfo *info, 34static void dump_packet(struct sbuff *m,
35 const struct nf_loginfo *info,
37 const struct sk_buff *skb, 36 const struct sk_buff *skb,
38 unsigned int iphoff) 37 unsigned int iphoff)
39{ 38{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
48 47
49 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
50 if (ih == NULL) { 49 if (ih == NULL) {
51 printk("TRUNCATED"); 50 sb_add(m, "TRUNCATED");
52 return; 51 return;
53 } 52 }
54 53
55 /* Important fields: 54 /* Important fields:
56 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
57 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
58 printk("SRC=%pI4 DST=%pI4 ", 57 sb_add(m, "SRC=%pI4 DST=%pI4 ",
59 &ih->saddr, &ih->daddr); 58 &ih->saddr, &ih->daddr);
60 59
61 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
62 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
63 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, 62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
64 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); 63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
65 64
66 /* Max length: 6 "CE DF MF " */ 65 /* Max length: 6 "CE DF MF " */
67 if (ntohs(ih->frag_off) & IP_CE) 66 if (ntohs(ih->frag_off) & IP_CE)
68 printk("CE "); 67 sb_add(m, "CE ");
69 if (ntohs(ih->frag_off) & IP_DF) 68 if (ntohs(ih->frag_off) & IP_DF)
70 printk("DF "); 69 sb_add(m, "DF ");
71 if (ntohs(ih->frag_off) & IP_MF) 70 if (ntohs(ih->frag_off) & IP_MF)
72 printk("MF "); 71 sb_add(m, "MF ");
73 72
74 /* Max length: 11 "FRAG:65535 " */ 73 /* Max length: 11 "FRAG:65535 " */
75 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
76 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
77 76
78 if ((logflags & IPT_LOG_IPOPT) && 77 if ((logflags & IPT_LOG_IPOPT) &&
79 ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph), 84 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt); 85 optsize, _opt);
87 if (op == NULL) { 86 if (op == NULL) {
88 printk("TRUNCATED"); 87 sb_add(m, "TRUNCATED");
89 return; 88 return;
90 } 89 }
91 90
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 91 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT ("); 92 sb_add(m, "OPT (");
94 for (i = 0; i < optsize; i++) 93 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]); 94 sb_add(m, "%02X", op[i]);
96 printk(") "); 95 sb_add(m, ") ");
97 } 96 }
98 97
99 switch (ih->protocol) { 98 switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
102 const struct tcphdr *th; 101 const struct tcphdr *th;
103 102
104 /* Max length: 10 "PROTO=TCP " */ 103 /* Max length: 10 "PROTO=TCP " */
105 printk("PROTO=TCP "); 104 sb_add(m, "PROTO=TCP ");
106 105
107 if (ntohs(ih->frag_off) & IP_OFFSET) 106 if (ntohs(ih->frag_off) & IP_OFFSET)
108 break; 107 break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
111 th = skb_header_pointer(skb, iphoff + ih->ihl * 4, 110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
112 sizeof(_tcph), &_tcph); 111 sizeof(_tcph), &_tcph);
113 if (th == NULL) { 112 if (th == NULL) {
114 printk("INCOMPLETE [%u bytes] ", 113 sb_add(m, "INCOMPLETE [%u bytes] ",
115 skb->len - iphoff - ih->ihl*4); 114 skb->len - iphoff - ih->ihl*4);
116 break; 115 break;
117 } 116 }
118 117
119 /* Max length: 20 "SPT=65535 DPT=65535 " */ 118 /* Max length: 20 "SPT=65535 DPT=65535 " */
120 printk("SPT=%u DPT=%u ", 119 sb_add(m, "SPT=%u DPT=%u ",
121 ntohs(th->source), ntohs(th->dest)); 120 ntohs(th->source), ntohs(th->dest));
122 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
123 if (logflags & IPT_LOG_TCPSEQ) 122 if (logflags & IPT_LOG_TCPSEQ)
124 printk("SEQ=%u ACK=%u ", 123 sb_add(m, "SEQ=%u ACK=%u ",
125 ntohl(th->seq), ntohl(th->ack_seq)); 124 ntohl(th->seq), ntohl(th->ack_seq));
126 /* Max length: 13 "WINDOW=65535 " */ 125 /* Max length: 13 "WINDOW=65535 " */
127 printk("WINDOW=%u ", ntohs(th->window)); 126 sb_add(m, "WINDOW=%u ", ntohs(th->window));
128 /* Max length: 9 "RES=0x3F " */ 127 /* Max length: 9 "RES=0x3F " */
129 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 128 sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
130 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
131 if (th->cwr) 130 if (th->cwr)
132 printk("CWR "); 131 sb_add(m, "CWR ");
133 if (th->ece) 132 if (th->ece)
134 printk("ECE "); 133 sb_add(m, "ECE ");
135 if (th->urg) 134 if (th->urg)
136 printk("URG "); 135 sb_add(m, "URG ");
137 if (th->ack) 136 if (th->ack)
138 printk("ACK "); 137 sb_add(m, "ACK ");
139 if (th->psh) 138 if (th->psh)
140 printk("PSH "); 139 sb_add(m, "PSH ");
141 if (th->rst) 140 if (th->rst)
142 printk("RST "); 141 sb_add(m, "RST ");
143 if (th->syn) 142 if (th->syn)
144 printk("SYN "); 143 sb_add(m, "SYN ");
145 if (th->fin) 144 if (th->fin)
146 printk("FIN "); 145 sb_add(m, "FIN ");
147 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
148 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
149 148
150 if ((logflags & IPT_LOG_TCPOPT) && 149 if ((logflags & IPT_LOG_TCPOPT) &&
151 th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
158 iphoff+ih->ihl*4+sizeof(_tcph), 157 iphoff+ih->ihl*4+sizeof(_tcph),
159 optsize, _opt); 158 optsize, _opt);
160 if (op == NULL) { 159 if (op == NULL) {
161 printk("TRUNCATED"); 160 sb_add(m, "TRUNCATED");
162 return; 161 return;
163 } 162 }
164 163
165 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
166 printk("OPT ("); 165 sb_add(m, "OPT (");
167 for (i = 0; i < optsize; i++) 166 for (i = 0; i < optsize; i++)
168 printk("%02X", op[i]); 167 sb_add(m, "%02X", op[i]);
169 printk(") "); 168 sb_add(m, ") ");
170 } 169 }
171 break; 170 break;
172 } 171 }
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
177 176
178 if (ih->protocol == IPPROTO_UDP) 177 if (ih->protocol == IPPROTO_UDP)
179 /* Max length: 10 "PROTO=UDP " */ 178 /* Max length: 10 "PROTO=UDP " */
180 printk("PROTO=UDP " ); 179 sb_add(m, "PROTO=UDP " );
181 else /* Max length: 14 "PROTO=UDPLITE " */ 180 else /* Max length: 14 "PROTO=UDPLITE " */
182 printk("PROTO=UDPLITE "); 181 sb_add(m, "PROTO=UDPLITE ");
183 182
184 if (ntohs(ih->frag_off) & IP_OFFSET) 183 if (ntohs(ih->frag_off) & IP_OFFSET)
185 break; 184 break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
188 uh = skb_header_pointer(skb, iphoff+ih->ihl*4, 187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
189 sizeof(_udph), &_udph); 188 sizeof(_udph), &_udph);
190 if (uh == NULL) { 189 if (uh == NULL) {
191 printk("INCOMPLETE [%u bytes] ", 190 sb_add(m, "INCOMPLETE [%u bytes] ",
192 skb->len - iphoff - ih->ihl*4); 191 skb->len - iphoff - ih->ihl*4);
193 break; 192 break;
194 } 193 }
195 194
196 /* Max length: 20 "SPT=65535 DPT=65535 " */ 195 /* Max length: 20 "SPT=65535 DPT=65535 " */
197 printk("SPT=%u DPT=%u LEN=%u ", 196 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
198 ntohs(uh->source), ntohs(uh->dest), 197 ntohs(uh->source), ntohs(uh->dest),
199 ntohs(uh->len)); 198 ntohs(uh->len));
200 break; 199 break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
221 [ICMP_ADDRESSREPLY] = 12 }; 220 [ICMP_ADDRESSREPLY] = 12 };
222 221
223 /* Max length: 11 "PROTO=ICMP " */ 222 /* Max length: 11 "PROTO=ICMP " */
224 printk("PROTO=ICMP "); 223 sb_add(m, "PROTO=ICMP ");
225 224
226 if (ntohs(ih->frag_off) & IP_OFFSET) 225 if (ntohs(ih->frag_off) & IP_OFFSET)
227 break; 226 break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
230 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, 229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
231 sizeof(_icmph), &_icmph); 230 sizeof(_icmph), &_icmph);
232 if (ich == NULL) { 231 if (ich == NULL) {
233 printk("INCOMPLETE [%u bytes] ", 232 sb_add(m, "INCOMPLETE [%u bytes] ",
234 skb->len - iphoff - ih->ihl*4); 233 skb->len - iphoff - ih->ihl*4);
235 break; 234 break;
236 } 235 }
237 236
238 /* Max length: 18 "TYPE=255 CODE=255 " */ 237 /* Max length: 18 "TYPE=255 CODE=255 " */
239 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
240 239
241 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
242 if (ich->type <= NR_ICMP_TYPES && 241 if (ich->type <= NR_ICMP_TYPES &&
243 required_len[ich->type] && 242 required_len[ich->type] &&
244 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
245 printk("INCOMPLETE [%u bytes] ", 244 sb_add(m, "INCOMPLETE [%u bytes] ",
246 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
247 break; 246 break;
248 } 247 }
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
251 case ICMP_ECHOREPLY: 250 case ICMP_ECHOREPLY:
252 case ICMP_ECHO: 251 case ICMP_ECHO:
253 /* Max length: 19 "ID=65535 SEQ=65535 " */ 252 /* Max length: 19 "ID=65535 SEQ=65535 " */
254 printk("ID=%u SEQ=%u ", 253 sb_add(m, "ID=%u SEQ=%u ",
255 ntohs(ich->un.echo.id), 254 ntohs(ich->un.echo.id),
256 ntohs(ich->un.echo.sequence)); 255 ntohs(ich->un.echo.sequence));
257 break; 256 break;
258 257
259 case ICMP_PARAMETERPROB: 258 case ICMP_PARAMETERPROB:
260 /* Max length: 14 "PARAMETER=255 " */ 259 /* Max length: 14 "PARAMETER=255 " */
261 printk("PARAMETER=%u ", 260 sb_add(m, "PARAMETER=%u ",
262 ntohl(ich->un.gateway) >> 24); 261 ntohl(ich->un.gateway) >> 24);
263 break; 262 break;
264 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
265 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
266 printk("GATEWAY=%pI4 ", &ich->un.gateway); 265 sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
267 /* Fall through */ 266 /* Fall through */
268 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
269 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
270 case ICMP_TIME_EXCEEDED: 269 case ICMP_TIME_EXCEEDED:
271 /* Max length: 3+maxlen */ 270 /* Max length: 3+maxlen */
272 if (!iphoff) { /* Only recurse once. */ 271 if (!iphoff) { /* Only recurse once. */
273 printk("["); 272 sb_add(m, "[");
274 dump_packet(info, skb, 273 dump_packet(m, info, skb,
275 iphoff + ih->ihl*4+sizeof(_icmph)); 274 iphoff + ih->ihl*4+sizeof(_icmph));
276 printk("] "); 275 sb_add(m, "] ");
277 } 276 }
278 277
279 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
280 if (ich->type == ICMP_DEST_UNREACH && 279 if (ich->type == ICMP_DEST_UNREACH &&
281 ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
282 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
283 } 282 }
284 break; 283 break;
285 } 284 }
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
292 break; 291 break;
293 292
294 /* Max length: 9 "PROTO=AH " */ 293 /* Max length: 9 "PROTO=AH " */
295 printk("PROTO=AH "); 294 sb_add(m, "PROTO=AH ");
296 295
297 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
298 ah = skb_header_pointer(skb, iphoff+ih->ihl*4, 297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
299 sizeof(_ahdr), &_ahdr); 298 sizeof(_ahdr), &_ahdr);
300 if (ah == NULL) { 299 if (ah == NULL) {
301 printk("INCOMPLETE [%u bytes] ", 300 sb_add(m, "INCOMPLETE [%u bytes] ",
302 skb->len - iphoff - ih->ihl*4); 301 skb->len - iphoff - ih->ihl*4);
303 break; 302 break;
304 } 303 }
305 304
306 /* Length: 15 "SPI=0xF1234567 " */ 305 /* Length: 15 "SPI=0xF1234567 " */
307 printk("SPI=0x%x ", ntohl(ah->spi)); 306 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
308 break; 307 break;
309 } 308 }
310 case IPPROTO_ESP: { 309 case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
312 const struct ip_esp_hdr *eh; 311 const struct ip_esp_hdr *eh;
313 312
314 /* Max length: 10 "PROTO=ESP " */ 313 /* Max length: 10 "PROTO=ESP " */
315 printk("PROTO=ESP "); 314 sb_add(m, "PROTO=ESP ");
316 315
317 if (ntohs(ih->frag_off) & IP_OFFSET) 316 if (ntohs(ih->frag_off) & IP_OFFSET)
318 break; 317 break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
321 eh = skb_header_pointer(skb, iphoff+ih->ihl*4, 320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
322 sizeof(_esph), &_esph); 321 sizeof(_esph), &_esph);
323 if (eh == NULL) { 322 if (eh == NULL) {
324 printk("INCOMPLETE [%u bytes] ", 323 sb_add(m, "INCOMPLETE [%u bytes] ",
325 skb->len - iphoff - ih->ihl*4); 324 skb->len - iphoff - ih->ihl*4);
326 break; 325 break;
327 } 326 }
328 327
329 /* Length: 15 "SPI=0xF1234567 " */ 328 /* Length: 15 "SPI=0xF1234567 " */
330 printk("SPI=0x%x ", ntohl(eh->spi)); 329 sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
331 break; 330 break;
332 } 331 }
333 /* Max length: 10 "PROTO 255 " */ 332 /* Max length: 10 "PROTO 255 " */
334 default: 333 default:
335 printk("PROTO=%u ", ih->protocol); 334 sb_add(m, "PROTO=%u ", ih->protocol);
336 } 335 }
337 336
338 /* Max length: 15 "UID=4294967295 " */ 337 /* Max length: 15 "UID=4294967295 " */
339 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
340 read_lock_bh(&skb->sk->sk_callback_lock); 339 read_lock_bh(&skb->sk->sk_callback_lock);
341 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 340 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
342 printk("UID=%u GID=%u ", 341 sb_add(m, "UID=%u GID=%u ",
343 skb->sk->sk_socket->file->f_cred->fsuid, 342 skb->sk->sk_socket->file->f_cred->fsuid,
344 skb->sk->sk_socket->file->f_cred->fsgid); 343 skb->sk->sk_socket->file->f_cred->fsgid);
345 read_unlock_bh(&skb->sk->sk_callback_lock); 344 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
347 346
348 /* Max length: 16 "MARK=0xFFFFFFFF " */ 347 /* Max length: 16 "MARK=0xFFFFFFFF " */
349 if (!iphoff && skb->mark) 348 if (!iphoff && skb->mark)
350 printk("MARK=0x%x ", skb->mark); 349 sb_add(m, "MARK=0x%x ", skb->mark);
351 350
352 /* Proto Max log string length */ 351 /* Proto Max log string length */
353 /* IP: 40+46+6+11+127 = 230 */ 352 /* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
364 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 363 /* maxlen = 230+ 91 + 230 + 252 = 803 */
365} 364}
366 365
367static void dump_mac_header(const struct nf_loginfo *info, 366static void dump_mac_header(struct sbuff *m,
367 const struct nf_loginfo *info,
368 const struct sk_buff *skb) 368 const struct sk_buff *skb)
369{ 369{
370 struct net_device *dev = skb->dev; 370 struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
378 378
379 switch (dev->type) { 379 switch (dev->type) {
380 case ARPHRD_ETHER: 380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", 381 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto)); 383 ntohs(eth_hdr(skb)->h_proto));
384 return; 384 return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
387 } 387 }
388 388
389fallback: 389fallback:
390 printk("MAC="); 390 sb_add(m, "MAC=");
391 if (dev->hard_header_len && 391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) { 392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb); 393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i; 394 unsigned int i;
395 395
396 printk("%02x", *p++); 396 sb_add(m, "%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++) 397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p); 398 sb_add(m, ":%02x", *p);
399 } 399 }
400 printk(" "); 400 sb_add(m, " ");
401} 401}
402 402
403static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
419 const struct nf_loginfo *loginfo, 419 const struct nf_loginfo *loginfo,
420 const char *prefix) 420 const char *prefix)
421{ 421{
422 struct sbuff *m = sb_open();
423
422 if (!loginfo) 424 if (!loginfo)
423 loginfo = &default_loginfo; 425 loginfo = &default_loginfo;
424 426
425 spin_lock_bh(&log_lock); 427 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
426 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
427 prefix, 428 prefix,
428 in ? in->name : "", 429 in ? in->name : "",
429 out ? out->name : ""); 430 out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
434 435
435 physindev = skb->nf_bridge->physindev; 436 physindev = skb->nf_bridge->physindev;
436 if (physindev && in != physindev) 437 if (physindev && in != physindev)
437 printk("PHYSIN=%s ", physindev->name); 438 sb_add(m, "PHYSIN=%s ", physindev->name);
438 physoutdev = skb->nf_bridge->physoutdev; 439 physoutdev = skb->nf_bridge->physoutdev;
439 if (physoutdev && out != physoutdev) 440 if (physoutdev && out != physoutdev)
440 printk("PHYSOUT=%s ", physoutdev->name); 441 sb_add(m, "PHYSOUT=%s ", physoutdev->name);
441 } 442 }
442#endif 443#endif
443 444
444 /* MAC logging for input path only. */ 445 /* MAC logging for input path only. */
445 if (in && !out) 446 if (in && !out)
446 dump_mac_header(loginfo, skb); 447 dump_mac_header(m, loginfo, skb);
448
449 dump_packet(m, loginfo, skb, 0);
447 450
448 dump_packet(loginfo, skb, 0); 451 sb_close(m);
449 printk("\n");
450 spin_unlock_bh(&log_lock);
451} 452}
452 453
453static unsigned int 454static unsigned int
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..0f23b3f06df0 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret;
48
47 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
48 if (nf_ct_expect_related(exp) == 0) 50 ret = nf_ct_expect_related(exp);
51 if (ret == 0)
52 break;
53 else if (ret != -EBUSY) {
54 port = 0;
49 break; 55 break;
56 }
50 } 57 }
51 58
52 if (port == 0) 59 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 957c9241fb0c..295c97431e43 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
47 return rcu_dereference(nf_nat_protos[protonum]); 47 return rcu_dereference(nf_nat_protos[protonum]);
48} 48}
49 49
50const struct nf_nat_protocol * 50static const struct nf_nat_protocol *
51nf_nat_proto_find_get(u_int8_t protonum) 51nf_nat_proto_find_get(u_int8_t protonum)
52{ 52{
53 const struct nf_nat_protocol *p; 53 const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
60 60
61 return p; 61 return p;
62} 62}
63EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
64 63
65void 64static void
66nf_nat_proto_put(const struct nf_nat_protocol *p) 65nf_nat_proto_put(const struct nf_nat_protocol *p)
67{ 66{
68 module_put(p->me); 67 module_put(p->me);
69} 68}
70EXPORT_SYMBOL_GPL(nf_nat_proto_put);
71 69
72/* We keep an extra hash for each conntrack, for fast searching. */ 70/* We keep an extra hash for each conntrack, for fast searching. */
73static inline unsigned int 71static inline unsigned int
@@ -262,11 +260,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
262 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 260 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
263 261
264 /* Only bother mapping if it's not already in range and unique */ 262 /* Only bother mapping if it's not already in range and unique */
265 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && 263 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
266 (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 264 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
267 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 265 if (proto->in_range(tuple, maniptype, &range->min,
268 !nf_nat_used_tuple(tuple, ct)) 266 &range->max) &&
269 goto out; 267 (range->min.all == range->max.all ||
268 !nf_nat_used_tuple(tuple, ct)))
269 goto out;
270 } else if (!nf_nat_used_tuple(tuple, ct)) {
271 goto out;
272 }
273 }
270 274
271 /* Last change: get protocol to try to obtain unique tuple. */ 275 /* Last change: get protocol to try to obtain unique tuple. */
272 proto->unique_tuple(tuple, range, maniptype, ct); 276 proto->unique_tuple(tuple, range, maniptype, ct);
@@ -458,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
458 return 0; 462 return 0;
459 } 463 }
460 464
465 if (manip == IP_NAT_MANIP_SRC)
466 statusbit = IPS_SRC_NAT;
467 else
468 statusbit = IPS_DST_NAT;
469
470 /* Invert if this is reply dir. */
471 if (dir == IP_CT_DIR_REPLY)
472 statusbit ^= IPS_NAT_MASK;
473
474 if (!(ct->status & statusbit))
475 return 1;
476
461 pr_debug("icmp_reply_translation: translating error %p manip %u " 477 pr_debug("icmp_reply_translation: translating error %p manip %u "
462 "dir %s\n", skb, manip, 478 "dir %s\n", skb, manip,
463 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 479 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
492 508
493 /* Change outer to look the reply to an incoming packet 509 /* Change outer to look the reply to an incoming packet
494 * (proto 0 means don't invert per-proto part). */ 510 * (proto 0 means don't invert per-proto part). */
495 if (manip == IP_NAT_MANIP_SRC) 511 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
496 statusbit = IPS_SRC_NAT; 512 if (!manip_pkt(0, skb, 0, &target, manip))
497 else 513 return 0;
498 statusbit = IPS_DST_NAT;
499
500 /* Invert if this is reply dir. */
501 if (dir == IP_CT_DIR_REPLY)
502 statusbit ^= IPS_NAT_MASK;
503
504 if (ct->status & statusbit) {
505 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506 if (!manip_pkt(0, skb, 0, &target, manip))
507 return 0;
508 }
509 514
510 return 1; 515 return 1;
511} 516}
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
79 79
80 /* Try to get same port: if not, try to change it. */ 80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
82 exp->tuple.dst.u.tcp.port = htons(port); 84 exp->tuple.dst.u.tcp.port = htons(port);
83 if (nf_ct_expect_related(exp) == 0) 85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
84 break; 90 break;
91 }
85 } 92 }
86 93
87 if (port == 0) 94 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
222 /* Try to get a pair of ports. */ 222 /* Try to get a pair of ports. */
223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); 223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
224 nated_port != 0; nated_port += 2) { 224 nated_port != 0; nated_port += 2) {
225 int ret;
226
225 rtp_exp->tuple.dst.u.udp.port = htons(nated_port); 227 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
226 if (nf_ct_expect_related(rtp_exp) == 0) { 228 ret = nf_ct_expect_related(rtp_exp);
229 if (ret == 0) {
227 rtcp_exp->tuple.dst.u.udp.port = 230 rtcp_exp->tuple.dst.u.udp.port =
228 htons(nated_port + 1); 231 htons(nated_port + 1);
229 if (nf_ct_expect_related(rtcp_exp) == 0) 232 ret = nf_ct_expect_related(rtcp_exp);
233 if (ret == 0)
234 break;
235 else if (ret != -EBUSY) {
236 nf_ct_unexpect_related(rtp_exp);
237 nated_port = 0;
230 break; 238 break;
231 nf_ct_unexpect_related(rtp_exp); 239 }
240 } else if (ret != -EBUSY) {
241 nated_port = 0;
242 break;
232 } 243 }
233 } 244 }
234 245
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
284 295
285 /* Try to get same port: if not, try to change it. */ 296 /* Try to get same port: if not, try to change it. */
286 for (; nated_port != 0; nated_port++) { 297 for (; nated_port != 0; nated_port++) {
298 int ret;
299
287 exp->tuple.dst.u.tcp.port = htons(nated_port); 300 exp->tuple.dst.u.tcp.port = htons(nated_port);
288 if (nf_ct_expect_related(exp) == 0) 301 ret = nf_ct_expect_related(exp);
302 if (ret == 0)
303 break;
304 else if (ret != -EBUSY) {
305 nated_port = 0;
289 break; 306 break;
307 }
290 } 308 }
291 309
292 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
334 352
335 /* Try to get same port: if not, try to change it. */ 353 /* Try to get same port: if not, try to change it. */
336 for (; nated_port != 0; nated_port++) { 354 for (; nated_port != 0; nated_port++) {
355 int ret;
356
337 exp->tuple.dst.u.tcp.port = htons(nated_port); 357 exp->tuple.dst.u.tcp.port = htons(nated_port);
338 if (nf_ct_expect_related(exp) == 0) 358 ret = nf_ct_expect_related(exp);
359 if (ret == 0)
339 break; 360 break;
361 else if (ret != -EBUSY) {
362 nated_port = 0;
363 break;
364 }
340 } 365 }
341 366
342 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 443
419 /* Try to get same port: if not, try to change it. */ 444 /* Try to get same port: if not, try to change it. */
420 for (; nated_port != 0; nated_port++) { 445 for (; nated_port != 0; nated_port++) {
446 int ret;
447
421 exp->tuple.dst.u.tcp.port = htons(nated_port); 448 exp->tuple.dst.u.tcp.port = htons(nated_port);
422 if (nf_ct_expect_related(exp) == 0) 449 ret = nf_ct_expect_related(exp);
450 if (ret == 0)
451 break;
452 else if (ret != -EBUSY) {
453 nated_port = 0;
423 break; 454 break;
455 }
424 } 456 }
425 457
426 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
500 532
501 /* Try to get same port: if not, try to change it. */ 533 /* Try to get same port: if not, try to change it. */
502 for (nated_port = ntohs(port); nated_port != 0; nated_port++) { 534 for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
535 int ret;
536
503 exp->tuple.dst.u.tcp.port = htons(nated_port); 537 exp->tuple.dst.u.tcp.port = htons(nated_port);
504 if (nf_ct_expect_related(exp) == 0) 538 ret = nf_ct_expect_related(exp);
539 if (ret == 0)
505 break; 540 break;
541 else if (ret != -EBUSY) {
542 nated_port = 0;
543 break;
544 }
506 } 545 }
507 546
508 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..31427fb57aa8 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen)
158{
159 struct rtable *rt = skb_rtable(skb);
160
161 if (skb->ip_summed != CHECKSUM_PARTIAL) {
162 if (!(rt->rt_flags & RTCF_LOCAL) &&
163 skb->dev->features & NETIF_F_V4_CSUM) {
164 skb->ip_summed = CHECKSUM_PARTIAL;
165 skb->csum_start = skb_headroom(skb) +
166 skb_network_offset(skb) +
167 iph->ihl * 4;
168 skb->csum_offset = (void *)check - data;
169 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
170 datalen, iph->protocol, 0);
171 } else {
172 *check = 0;
173 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
174 datalen, iph->protocol,
175 csum_partial(data, datalen,
176 0));
177 if (iph->protocol == IPPROTO_UDP && !*check)
178 *check = CSUM_MANGLED_0;
179 }
180 } else
181 inet_proto_csum_replace2(check, skb,
182 htons(oldlen), htons(datalen), 1);
183}
184
156/* Generic function for mangling variable-length address changes inside 185/* Generic function for mangling variable-length address changes inside
157 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 186 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
158 * command in FTP). 187 * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
169 const char *rep_buffer, 198 const char *rep_buffer,
170 unsigned int rep_len, bool adjust) 199 unsigned int rep_len, bool adjust)
171{ 200{
172 struct rtable *rt = skb_rtable(skb);
173 struct iphdr *iph; 201 struct iphdr *iph;
174 struct tcphdr *tcph; 202 struct tcphdr *tcph;
175 int oldlen, datalen; 203 int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
192 match_offset, match_len, rep_buffer, rep_len); 220 match_offset, match_len, rep_buffer, rep_len);
193 221
194 datalen = skb->len - iph->ihl*4; 222 datalen = skb->len - iph->ihl*4;
195 if (skb->ip_summed != CHECKSUM_PARTIAL) { 223 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
196 if (!(rt->rt_flags & RTCF_LOCAL) &&
197 skb->dev->features & NETIF_F_V4_CSUM) {
198 skb->ip_summed = CHECKSUM_PARTIAL;
199 skb->csum_start = skb_headroom(skb) +
200 skb_network_offset(skb) +
201 iph->ihl * 4;
202 skb->csum_offset = offsetof(struct tcphdr, check);
203 tcph->check = ~tcp_v4_check(datalen,
204 iph->saddr, iph->daddr, 0);
205 } else {
206 tcph->check = 0;
207 tcph->check = tcp_v4_check(datalen,
208 iph->saddr, iph->daddr,
209 csum_partial(tcph,
210 datalen, 0));
211 }
212 } else
213 inet_proto_csum_replace2(&tcph->check, skb,
214 htons(oldlen), htons(datalen), 1);
215 224
216 if (adjust && rep_len != match_len) 225 if (adjust && rep_len != match_len)
217 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, 226 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
240 const char *rep_buffer, 249 const char *rep_buffer,
241 unsigned int rep_len) 250 unsigned int rep_len)
242{ 251{
243 struct rtable *rt = skb_rtable(skb);
244 struct iphdr *iph; 252 struct iphdr *iph;
245 struct udphdr *udph; 253 struct udphdr *udph;
246 int datalen, oldlen; 254 int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
274 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) 282 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
275 return 1; 283 return 1;
276 284
277 if (skb->ip_summed != CHECKSUM_PARTIAL) { 285 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
278 if (!(rt->rt_flags & RTCF_LOCAL) &&
279 skb->dev->features & NETIF_F_V4_CSUM) {
280 skb->ip_summed = CHECKSUM_PARTIAL;
281 skb->csum_start = skb_headroom(skb) +
282 skb_network_offset(skb) +
283 iph->ihl * 4;
284 skb->csum_offset = offsetof(struct udphdr, check);
285 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
286 datalen, IPPROTO_UDP,
287 0);
288 } else {
289 udph->check = 0;
290 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
291 datalen, IPPROTO_UDP,
292 csum_partial(udph,
293 datalen, 0));
294 if (!udph->check)
295 udph->check = CSUM_MANGLED_0;
296 }
297 } else
298 inet_proto_csum_replace2(&udph->check, skb,
299 htons(oldlen), htons(datalen), 1);
300 286
301 return 1; 287 return 1;
302} 288}
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
45 45
46 /* Try to get same port: if not, try to change it. */ 46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
48 exp->tuple.dst.u.tcp.port = htons(port); 50 exp->tuple.dst.u.tcp.port = htons(port);
49 if (nf_ct_expect_related(exp) == 0) 51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
50 break; 56 break;
57 }
51 } 58 }
52 59
53 if (port == 0) 60 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f5..21c30426480b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{ 106{
107 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
109 Use reply in case it's already been mangled (eg local packet).
110 */ 109 */
111 __be32 ip 110 struct nf_nat_range range;
112 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC 111
113 ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip 112 range.flags = 0;
114 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); 113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
115 struct nf_nat_range range 114 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
116 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; 115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
117 116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
118 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); 117
119 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); 118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
120} 119}
121 120
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
307 exp->expectfn = ip_nat_sip_expected; 307 exp->expectfn = ip_nat_sip_expected;
308 308
309 for (; port != 0; port++) { 309 for (; port != 0; port++) {
310 int ret;
311
310 exp->tuple.dst.u.udp.port = htons(port); 312 exp->tuple.dst.u.udp.port = htons(port);
311 if (nf_ct_expect_related(exp) == 0) 313 ret = nf_ct_expect_related(exp);
314 if (ret == 0)
315 break;
316 else if (ret != -EBUSY) {
317 port = 0;
312 break; 318 break;
319 }
313 } 320 }
314 321
315 if (port == 0) 322 if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
480 /* Try to get same pair of ports: if not, try to change them. */ 487 /* Try to get same pair of ports: if not, try to change them. */
481 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); 488 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
482 port != 0; port += 2) { 489 port != 0; port += 2) {
490 int ret;
491
483 rtp_exp->tuple.dst.u.udp.port = htons(port); 492 rtp_exp->tuple.dst.u.udp.port = htons(port);
484 if (nf_ct_expect_related(rtp_exp) != 0) 493 ret = nf_ct_expect_related(rtp_exp);
494 if (ret == -EBUSY)
485 continue; 495 continue;
496 else if (ret < 0) {
497 port = 0;
498 break;
499 }
486 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); 500 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
487 if (nf_ct_expect_related(rtcp_exp) == 0) 501 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0)
488 break; 503 break;
489 nf_ct_unexpect_related(rtp_exp); 504 else if (ret != -EBUSY) {
505 nf_ct_unexpect_related(rtp_exp);
506 port = 0;
507 break;
508 }
490 } 509 }
491 510
492 if (port == 0) 511 if (port == 0)
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..65699c24411c 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; 31const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly;
32static DEFINE_SPINLOCK(inet_proto_lock);
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock);
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 int hash, ret; 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 40
42 hash = protocol & (MAX_INET_PROTOS - 1); 41 return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1;
43
44 spin_lock_bh(&inet_proto_lock);
45 if (inet_protos[hash]) {
46 ret = -1;
47 } else {
48 inet_protos[hash] = prot;
49 ret = 0;
50 }
51 spin_unlock_bh(&inet_proto_lock);
52
53 return ret;
54} 42}
55EXPORT_SYMBOL(inet_add_protocol); 43EXPORT_SYMBOL(inet_add_protocol);
56 44
@@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol);
60 48
61int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 49int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
62{ 50{
63 int hash, ret; 51 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
64
65 hash = protocol & (MAX_INET_PROTOS - 1);
66 52
67 spin_lock_bh(&inet_proto_lock); 53 ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1;
68 if (inet_protos[hash] == prot) {
69 inet_protos[hash] = NULL;
70 ret = 0;
71 } else {
72 ret = -1;
73 }
74 spin_unlock_bh(&inet_proto_lock);
75 54
76 synchronize_net(); 55 synchronize_net();
77 56
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..1f85ef289895 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
505 505
506 ipc.addr = inet->inet_saddr; 506 ipc.addr = inet->inet_saddr;
507 ipc.opt = NULL; 507 ipc.opt = NULL;
508 ipc.shtx.flags = 0; 508 ipc.tx_flags = 0;
509 ipc.oif = sk->sk_bound_dev_if; 509 ipc.oif = sk->sk_bound_dev_if;
510 510
511 if (msg->msg_controllen) { 511 if (msg->msg_controllen) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ac6559cb54f9..d6cb2bfcd8e1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
159 .link_failure = ipv4_link_failure, 159 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu, 160 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out, 161 .local_out = __ip_local_out,
162 .entries = ATOMIC_INIT(0),
163}; 162};
164 163
165#define ECN_OR_COST(class) TC_PRIO_##class 164#define ECN_OR_COST(class) TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
466 465
467 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 466 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
468 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 467 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 atomic_read(&ipv4_dst_ops.entries), 468 dst_entries_get_slow(&ipv4_dst_ops),
470 st->in_hit, 469 st->in_hit,
471 st->in_slow_tot, 470 st->in_slow_tot,
472 st->in_slow_mc, 471 st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
945 struct rtable *rth, **rthp; 944 struct rtable *rth, **rthp;
946 unsigned long now = jiffies; 945 unsigned long now = jiffies;
947 int goal; 946 int goal;
947 int entries = dst_entries_get_fast(&ipv4_dst_ops);
948 948
949 /* 949 /*
950 * Garbage collection is pretty expensive, 950 * Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
954 RT_CACHE_STAT_INC(gc_total); 954 RT_CACHE_STAT_INC(gc_total);
955 955
956 if (now - last_gc < ip_rt_gc_min_interval && 956 if (now - last_gc < ip_rt_gc_min_interval &&
957 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 957 entries < ip_rt_max_size) {
958 RT_CACHE_STAT_INC(gc_ignored); 958 RT_CACHE_STAT_INC(gc_ignored);
959 goto out; 959 goto out;
960 } 960 }
961 961
962 entries = dst_entries_get_slow(&ipv4_dst_ops);
962 /* Calculate number of entries, which we want to expire now. */ 963 /* Calculate number of entries, which we want to expire now. */
963 goal = atomic_read(&ipv4_dst_ops.entries) - 964 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
964 (ip_rt_gc_elasticity << rt_hash_log);
965 if (goal <= 0) { 965 if (goal <= 0) {
966 if (equilibrium < ipv4_dst_ops.gc_thresh) 966 if (equilibrium < ipv4_dst_ops.gc_thresh)
967 equilibrium = ipv4_dst_ops.gc_thresh; 967 equilibrium = ipv4_dst_ops.gc_thresh;
968 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 968 goal = entries - equilibrium;
969 if (goal > 0) { 969 if (goal > 0) {
970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 971 goal = entries - equilibrium;
972 } 972 }
973 } else { 973 } else {
974 /* We are in dangerous area. Try to reduce cache really 974 /* We are in dangerous area. Try to reduce cache really
975 * aggressively. 975 * aggressively.
976 */ 976 */
977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 978 equilibrium = entries - goal;
979 } 979 }
980 980
981 if (now - last_gc >= ip_rt_gc_min_interval) 981 if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
1032 expire >>= 1; 1032 expire >>= 1;
1033#if RT_CACHE_DEBUG >= 2 1033#if RT_CACHE_DEBUG >= 2
1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, 1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 atomic_read(&ipv4_dst_ops.entries), goal, i); 1035 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036#endif 1036#endif
1037 1037
1038 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1038 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out; 1039 goto out;
1040 } while (!in_softirq() && time_before_eq(jiffies, now)); 1040 } while (!in_softirq() && time_before_eq(jiffies, now));
1041 1041
1042 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1042 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1043 goto out;
1044 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1043 goto out; 1045 goto out;
1044 if (net_ratelimit()) 1046 if (net_ratelimit())
1045 printk(KERN_WARNING "dst cache overflow\n"); 1047 printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
1049work_done: 1051work_done:
1050 expire += ip_rt_gc_min_interval; 1052 expire += ip_rt_gc_min_interval;
1051 if (expire > ip_rt_gc_timeout || 1053 if (expire > ip_rt_gc_timeout ||
1052 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 1054 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1055 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1053 expire = ip_rt_gc_timeout; 1056 expire = ip_rt_gc_timeout;
1054#if RT_CACHE_DEBUG >= 2 1057#if RT_CACHE_DEBUG >= 2
1055 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, 1058 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 atomic_read(&ipv4_dst_ops.entries), goal, rover); 1059 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1057#endif 1060#endif
1058out: return 0; 1061out: return 0;
1059} 1062}
@@ -1102,23 +1105,23 @@ restart:
1102 * Note that we do rt_free on this new route entry, so that 1105 * Note that we do rt_free on this new route entry, so that
1103 * once its refcount hits zero, we are still able to reap it 1106 * once its refcount hits zero, we are still able to reap it
1104 * (Thanks Alexey) 1107 * (Thanks Alexey)
1105 * Note also the rt_free uses call_rcu. We don't actually 1108 * Note: To avoid expensive rcu stuff for this uncached dst,
1106 * need rcu protection here, this is just our path to get 1109 * we set DST_NOCACHE so that dst_release() can free dst without
1107 * on the route gc list. 1110 * waiting a grace period.
1108 */ 1111 */
1109 1112
1113 rt->dst.flags |= DST_NOCACHE;
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1114 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 int err = arp_bind_neighbour(&rt->dst); 1115 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1116 if (err) {
1113 if (net_ratelimit()) 1117 if (net_ratelimit())
1114 printk(KERN_WARNING 1118 printk(KERN_WARNING
1115 "Neighbour table failure & not caching routes.\n"); 1119 "Neighbour table failure & not caching routes.\n");
1116 rt_drop(rt); 1120 ip_rt_put(rt);
1117 return err; 1121 return err;
1118 } 1122 }
1119 } 1123 }
1120 1124
1121 rt_free(rt);
1122 goto skip_hashing; 1125 goto skip_hashing;
1123 } 1126 }
1124 1127
@@ -1268,18 +1271,11 @@ skip_hashing:
1268 1271
1269void rt_bind_peer(struct rtable *rt, int create) 1272void rt_bind_peer(struct rtable *rt, int create)
1270{ 1273{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer; 1274 struct inet_peer *peer;
1273 1275
1274 peer = inet_getpeer(rt->rt_dst, create); 1276 peer = inet_getpeer(rt->rt_dst, create);
1275 1277
1276 spin_lock_bh(&rt_peer_lock); 1278 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer); 1279 inet_putpeer(peer);
1284} 1280}
1285 1281
@@ -1779,12 +1775,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1779 1775
1780 if (rt->fl.iif == 0) 1776 if (rt->fl.iif == 0)
1781 src = rt->rt_src; 1777 src = rt->rt_src;
1782 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { 1778 else {
1783 src = FIB_RES_PREFSRC(res); 1779 rcu_read_lock();
1784 fib_res_put(&res); 1780 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1785 } else 1781 src = FIB_RES_PREFSRC(res);
1786 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1782 else
1783 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1787 RT_SCOPE_UNIVERSE); 1784 RT_SCOPE_UNIVERSE);
1785 rcu_read_unlock();
1786 }
1788 memcpy(addr, &src, 4); 1787 memcpy(addr, &src, 4);
1789} 1788}
1790 1789
@@ -2087,6 +2086,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2087 * Such approach solves two big problems: 2086 * Such approach solves two big problems:
2088 * 1. Not simplex devices are handled properly. 2087 * 1. Not simplex devices are handled properly.
2089 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2088 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2089 * called with rcu_read_lock()
2090 */ 2090 */
2091 2091
2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2108,7 +2108,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2108 unsigned hash; 2108 unsigned hash;
2109 __be32 spec_dst; 2109 __be32 spec_dst;
2110 int err = -EINVAL; 2110 int err = -EINVAL;
2111 int free_res = 0;
2112 struct net * net = dev_net(dev); 2111 struct net * net = dev_net(dev);
2113 2112
2114 /* IP on this device is disabled. */ 2113 /* IP on this device is disabled. */
@@ -2124,7 +2123,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 ipv4_is_loopback(saddr)) 2123 ipv4_is_loopback(saddr))
2125 goto martian_source; 2124 goto martian_source;
2126 2125
2127 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2126 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128 goto brd_input; 2127 goto brd_input;
2129 2128
2130 /* Accept zero addresses only to limited broadcast; 2129 /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2132,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 if (ipv4_is_zeronet(saddr)) 2132 if (ipv4_is_zeronet(saddr))
2134 goto martian_source; 2133 goto martian_source;
2135 2134
2136 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2135 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137 ipv4_is_loopback(daddr))
2138 goto martian_destination; 2136 goto martian_destination;
2139 2137
2140 /* 2138 /*
2141 * Now we are ready to route packet. 2139 * Now we are ready to route packet.
2142 */ 2140 */
2143 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2141 err = fib_lookup(net, &fl, &res);
2142 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2143 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2144 goto e_hostunreach;
2146 goto no_route; 2145 goto no_route;
2147 } 2146 }
2148 free_res = 1;
2149 2147
2150 RT_CACHE_STAT_INC(in_slow_tot); 2148 RT_CACHE_STAT_INC(in_slow_tot);
2151 2149
@@ -2154,8 +2152,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2154 2152
2155 if (res.type == RTN_LOCAL) { 2153 if (res.type == RTN_LOCAL) {
2156 err = fib_validate_source(saddr, daddr, tos, 2154 err = fib_validate_source(saddr, daddr, tos,
2157 net->loopback_dev->ifindex, 2155 net->loopback_dev->ifindex,
2158 dev, &spec_dst, &itag, skb->mark); 2156 dev, &spec_dst, &itag, skb->mark);
2159 if (err < 0) 2157 if (err < 0)
2160 goto martian_source_keep_err; 2158 goto martian_source_keep_err;
2161 if (err) 2159 if (err)
@@ -2170,9 +2168,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2170 goto martian_destination; 2168 goto martian_destination;
2171 2169
2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2170 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2173done:
2174 if (free_res)
2175 fib_res_put(&res);
2176out: return err; 2171out: return err;
2177 2172
2178brd_input: 2173brd_input:
@@ -2232,7 +2227,7 @@ local_input:
2232 rth->rt_type = res.type; 2227 rth->rt_type = res.type;
2233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2228 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2229 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2235 goto done; 2230 goto out;
2236 2231
2237no_route: 2232no_route:
2238 RT_CACHE_STAT_INC(in_no_route); 2233 RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2250,21 @@ martian_destination:
2255 2250
2256e_hostunreach: 2251e_hostunreach:
2257 err = -EHOSTUNREACH; 2252 err = -EHOSTUNREACH;
2258 goto done; 2253 goto out;
2259 2254
2260e_inval: 2255e_inval:
2261 err = -EINVAL; 2256 err = -EINVAL;
2262 goto done; 2257 goto out;
2263 2258
2264e_nobufs: 2259e_nobufs:
2265 err = -ENOBUFS; 2260 err = -ENOBUFS;
2266 goto done; 2261 goto out;
2267 2262
2268martian_source: 2263martian_source:
2269 err = -EINVAL; 2264 err = -EINVAL;
2270martian_source_keep_err: 2265martian_source_keep_err:
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2266 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 goto done; 2267 goto out;
2273} 2268}
2274 2269
2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2270int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2355,6 +2350,7 @@ skip_cache:
2355} 2350}
2356EXPORT_SYMBOL(ip_route_input_common); 2351EXPORT_SYMBOL(ip_route_input_common);
2357 2352
2353/* called with rcu_read_lock() */
2358static int __mkroute_output(struct rtable **result, 2354static int __mkroute_output(struct rtable **result,
2359 struct fib_result *res, 2355 struct fib_result *res,
2360 const struct flowi *fl, 2356 const struct flowi *fl,
@@ -2365,53 +2361,47 @@ static int __mkroute_output(struct rtable **result,
2365 struct rtable *rth; 2361 struct rtable *rth;
2366 struct in_device *in_dev; 2362 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2363 u32 tos = RT_FL_TOS(oldflp);
2368 int err = 0;
2369 2364
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2366 return -EINVAL;
2372 2367
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2368 if (ipv4_is_lbcast(fl->fl4_dst))
2374 res->type = RTN_BROADCAST; 2369 res->type = RTN_BROADCAST;
2375 else if (ipv4_is_multicast(fl->fl4_dst)) 2370 else if (ipv4_is_multicast(fl->fl4_dst))
2376 res->type = RTN_MULTICAST; 2371 res->type = RTN_MULTICAST;
2377 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2372 else if (ipv4_is_zeronet(fl->fl4_dst))
2378 return -EINVAL; 2373 return -EINVAL;
2379 2374
2380 if (dev_out->flags & IFF_LOOPBACK) 2375 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2376 flags |= RTCF_LOCAL;
2382 2377
2383 /* get work reference to inet device */ 2378 in_dev = __in_dev_get_rcu(dev_out);
2384 in_dev = in_dev_get(dev_out);
2385 if (!in_dev) 2379 if (!in_dev)
2386 return -EINVAL; 2380 return -EINVAL;
2387 2381
2388 if (res->type == RTN_BROADCAST) { 2382 if (res->type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2383 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2384 res->fi = NULL;
2391 fib_info_put(res->fi);
2392 res->fi = NULL;
2393 }
2394 } else if (res->type == RTN_MULTICAST) { 2385 } else if (res->type == RTN_MULTICAST) {
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2386 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto)) 2388 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2389 flags &= ~RTCF_LOCAL;
2399 /* If multicast route do not exist use 2390 /* If multicast route do not exist use
2400 default one, but do not gateway in this case. 2391 * default one, but do not gateway in this case.
2401 Yes, it is hack. 2392 * Yes, it is hack.
2402 */ 2393 */
2403 if (res->fi && res->prefixlen < 4) { 2394 if (res->fi && res->prefixlen < 4)
2404 fib_info_put(res->fi);
2405 res->fi = NULL; 2395 res->fi = NULL;
2406 }
2407 } 2396 }
2408 2397
2409 2398
2410 rth = dst_alloc(&ipv4_dst_ops); 2399 rth = dst_alloc(&ipv4_dst_ops);
2411 if (!rth) { 2400 if (!rth)
2412 err = -ENOBUFS; 2401 return -ENOBUFS;
2413 goto cleanup; 2402
2414 } 2403 in_dev_hold(in_dev);
2404 rth->idev = in_dev;
2415 2405
2416 atomic_set(&rth->dst.__refcnt, 1); 2406 atomic_set(&rth->dst.__refcnt, 1);
2417 rth->dst.flags= DST_HOST; 2407 rth->dst.flags= DST_HOST;
@@ -2432,7 +2422,6 @@ static int __mkroute_output(struct rtable **result,
2432 cache entry */ 2422 cache entry */
2433 rth->dst.dev = dev_out; 2423 rth->dst.dev = dev_out;
2434 dev_hold(dev_out); 2424 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst; 2425 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src; 2426 rth->rt_spec_dst= fl->fl4_src;
2438 2427
@@ -2467,15 +2456,11 @@ static int __mkroute_output(struct rtable **result,
2467 rt_set_nexthop(rth, res, 0); 2456 rt_set_nexthop(rth, res, 0);
2468 2457
2469 rth->rt_flags = flags; 2458 rth->rt_flags = flags;
2470
2471 *result = rth; 2459 *result = rth;
2472 cleanup: 2460 return 0;
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477} 2461}
2478 2462
2463/* called with rcu_read_lock() */
2479static int ip_mkroute_output(struct rtable **rp, 2464static int ip_mkroute_output(struct rtable **rp,
2480 struct fib_result *res, 2465 struct fib_result *res,
2481 const struct flowi *fl, 2466 const struct flowi *fl,
@@ -2497,6 +2482,7 @@ static int ip_mkroute_output(struct rtable **rp,
2497 2482
2498/* 2483/*
2499 * Major route resolver routine. 2484 * Major route resolver routine.
2485 * called with rcu_read_lock();
2500 */ 2486 */
2501 2487
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2488static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2515,9 +2501,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2515 .iif = net->loopback_dev->ifindex, 2501 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif }; 2502 .oif = oldflp->oif };
2517 struct fib_result res; 2503 struct fib_result res;
2518 unsigned flags = 0; 2504 unsigned int flags = 0;
2519 struct net_device *dev_out = NULL; 2505 struct net_device *dev_out = NULL;
2520 int free_res = 0;
2521 int err; 2506 int err;
2522 2507
2523 2508
@@ -2543,9 +2528,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2543 2528
2544 if (oldflp->oif == 0 && 2529 if (oldflp->oif == 0 &&
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2530 (ipv4_is_multicast(oldflp->fl4_dst) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2531 ipv4_is_lbcast(oldflp->fl4_dst))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2532 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2533 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2549 if (dev_out == NULL) 2534 if (dev_out == NULL)
2550 goto out; 2535 goto out;
2551 2536
@@ -2570,29 +2555,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2570 2555
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2556 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2557 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2558 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2574 if (dev_out == NULL)
2575 goto out; 2559 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2560 }
2579 } 2561 }
2580 2562
2581 2563
2582 if (oldflp->oif) { 2564 if (oldflp->oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2565 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2584 err = -ENODEV; 2566 err = -ENODEV;
2585 if (dev_out == NULL) 2567 if (dev_out == NULL)
2586 goto out; 2568 goto out;
2587 2569
2588 /* RACE: Check return value of inet_select_addr instead. */ 2570 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2571 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2590 dev_put(dev_out);
2591 goto out; /* Wrong error code */ 2572 goto out; /* Wrong error code */
2592 }
2593 2573
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2574 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2575 ipv4_is_lbcast(oldflp->fl4_dst)) {
2596 if (!fl.fl4_src) 2576 if (!fl.fl4_src)
2597 fl.fl4_src = inet_select_addr(dev_out, 0, 2577 fl.fl4_src = inet_select_addr(dev_out, 0,
2598 RT_SCOPE_LINK); 2578 RT_SCOPE_LINK);
@@ -2612,10 +2592,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2612 fl.fl4_dst = fl.fl4_src; 2592 fl.fl4_dst = fl.fl4_src;
2613 if (!fl.fl4_dst) 2593 if (!fl.fl4_dst)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2594 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2595 dev_out = net->loopback_dev;
2618 dev_hold(dev_out);
2619 fl.oif = net->loopback_dev->ifindex; 2596 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2597 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2598 flags |= RTCF_LOCAL;
@@ -2649,23 +2626,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2649 res.type = RTN_UNICAST; 2626 res.type = RTN_UNICAST;
2650 goto make_route; 2627 goto make_route;
2651 } 2628 }
2652 if (dev_out)
2653 dev_put(dev_out);
2654 err = -ENETUNREACH; 2629 err = -ENETUNREACH;
2655 goto out; 2630 goto out;
2656 } 2631 }
2657 free_res = 1;
2658 2632
2659 if (res.type == RTN_LOCAL) { 2633 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2634 if (!fl.fl4_src)
2661 fl.fl4_src = fl.fl4_dst; 2635 fl.fl4_src = fl.fl4_dst;
2662 if (dev_out)
2663 dev_put(dev_out);
2664 dev_out = net->loopback_dev; 2636 dev_out = net->loopback_dev;
2665 dev_hold(dev_out);
2666 fl.oif = dev_out->ifindex; 2637 fl.oif = dev_out->ifindex;
2667 if (res.fi)
2668 fib_info_put(res.fi);
2669 res.fi = NULL; 2638 res.fi = NULL;
2670 flags |= RTCF_LOCAL; 2639 flags |= RTCF_LOCAL;
2671 goto make_route; 2640 goto make_route;
@@ -2682,28 +2651,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2682 if (!fl.fl4_src) 2651 if (!fl.fl4_src)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2652 fl.fl4_src = FIB_RES_PREFSRC(res);
2684 2653
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2654 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out);
2689 fl.oif = dev_out->ifindex; 2655 fl.oif = dev_out->ifindex;
2690 2656
2691 2657
2692make_route: 2658make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2659 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2694 2660
2695
2696 if (free_res)
2697 fib_res_put(&res);
2698 if (dev_out)
2699 dev_put(dev_out);
2700out: return err; 2661out: return err;
2701} 2662}
2702 2663
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2664int __ip_route_output_key(struct net *net, struct rtable **rp,
2704 const struct flowi *flp) 2665 const struct flowi *flp)
2705{ 2666{
2706 unsigned hash; 2667 unsigned int hash;
2668 int res;
2707 struct rtable *rth; 2669 struct rtable *rth;
2708 2670
2709 if (!rt_caching(net)) 2671 if (!rt_caching(net))
@@ -2734,7 +2696,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2734 rcu_read_unlock_bh(); 2696 rcu_read_unlock_bh();
2735 2697
2736slow_output: 2698slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2699 rcu_read_lock();
2700 res = ip_route_output_slow(net, rp, flp);
2701 rcu_read_unlock();
2702 return res;
2738} 2703}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2704EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2705
@@ -2753,7 +2718,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2753 .destroy = ipv4_dst_destroy, 2718 .destroy = ipv4_dst_destroy,
2754 .check = ipv4_blackhole_dst_check, 2719 .check = ipv4_blackhole_dst_check,
2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2720 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2756 .entries = ATOMIC_INIT(0),
2757}; 2721};
2758 2722
2759 2723
@@ -2798,7 +2762,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2798 2762
2799 dst_release(&(*rp)->dst); 2763 dst_release(&(*rp)->dst);
2800 *rp = rt; 2764 *rp = rt;
2801 return (rt ? 0 : -ENOMEM); 2765 return rt ? 0 : -ENOMEM;
2802} 2766}
2803 2767
2804int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2768int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -3323,6 +3287,12 @@ int __init ip_rt_init(void)
3323 3287
3324 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3288 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3325 3289
3290 if (dst_entries_init(&ipv4_dst_ops) < 0)
3291 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3292
3293 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3294 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3295
3326 rt_hash_table = (struct rt_hash_bucket *) 3296 rt_hash_table = (struct rt_hash_bucket *)
3327 alloc_large_system_hash("IP route cache", 3297 alloc_large_system_hash("IP route cache",
3328 sizeof(struct rt_hash_bucket), 3298 sizeof(struct rt_hash_bucket),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f115ea68a4ef..1664a0590bb8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2392,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2392 err = tp->af_specific->md5_parse(sk, optval, optlen); 2392 err = tp->af_specific->md5_parse(sk, optval, optlen);
2393 break; 2393 break;
2394#endif 2394#endif
2395 2395 case TCP_USER_TIMEOUT:
2396 /* Cap the max timeout in ms TCP will retry/retrans
2397 * before giving up and aborting (ETIMEDOUT) a connection.
2398 */
2399 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2400 break;
2396 default: 2401 default:
2397 err = -ENOPROTOOPT; 2402 err = -ENOPROTOOPT;
2398 break; 2403 break;
@@ -2611,6 +2616,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2611 case TCP_THIN_DUPACK: 2616 case TCP_THIN_DUPACK:
2612 val = tp->thin_dupack; 2617 val = tp->thin_dupack;
2613 break; 2618 break;
2619
2620 case TCP_USER_TIMEOUT:
2621 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2622 break;
2614 default: 2623 default:
2615 return -ENOPROTOOPT; 2624 return -ENOPROTOOPT;
2616 } 2625 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b55f60f6fcbe..ee0df4817498 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
183} 183}
184 184
185void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
186{ 186{
187 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
188 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
805 } 805 }
806} 806}
807 807
808/* Numbers are taken from RFC3390.
809 *
810 * John Heffner states:
811 *
812 * The RFC specifies a window of no more than 4380 bytes
813 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
814 * is a bit misleading because they use a clamp at 4380 bytes
815 * rather than use a multiplier in the relevant range.
816 */
817__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 808__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
818{ 809{
819 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 810 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
820 811
821 if (!cwnd) { 812 if (!cwnd)
822 if (tp->mss_cache > 1460) 813 cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
823 cwnd = 2;
824 else
825 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
826 }
827 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 814 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
828} 815}
829 816
@@ -2314,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2314 2301
2315static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2302static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2316{ 2303{
2317 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2304 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2318} 2305}
2319 2306
2320static inline int tcp_head_timedout(struct sock *sk) 2307static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
2508/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2495/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2509 * is against sacked "cnt", otherwise it's against facked "cnt" 2496 * is against sacked "cnt", otherwise it's against facked "cnt"
2510 */ 2497 */
2511static void tcp_mark_head_lost(struct sock *sk, int packets) 2498static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2512{ 2499{
2513 struct tcp_sock *tp = tcp_sk(sk); 2500 struct tcp_sock *tp = tcp_sk(sk);
2514 struct sk_buff *skb; 2501 struct sk_buff *skb;
@@ -2516,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2516 int err; 2503 int err;
2517 unsigned int mss; 2504 unsigned int mss;
2518 2505
2519 if (packets == 0)
2520 return;
2521
2522 WARN_ON(packets > tp->packets_out); 2506 WARN_ON(packets > tp->packets_out);
2523 if (tp->lost_skb_hint) { 2507 if (tp->lost_skb_hint) {
2524 skb = tp->lost_skb_hint; 2508 skb = tp->lost_skb_hint;
2525 cnt = tp->lost_cnt_hint; 2509 cnt = tp->lost_cnt_hint;
2510 /* Head already handled? */
2511 if (mark_head && skb != tcp_write_queue_head(sk))
2512 return;
2526 } else { 2513 } else {
2527 skb = tcp_write_queue_head(sk); 2514 skb = tcp_write_queue_head(sk);
2528 cnt = 0; 2515 cnt = 0;
@@ -2557,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2557 } 2544 }
2558 2545
2559 tcp_skb_mark_lost(tp, skb); 2546 tcp_skb_mark_lost(tp, skb);
2547
2548 if (mark_head)
2549 break;
2560 } 2550 }
2561 tcp_verify_left_out(tp); 2551 tcp_verify_left_out(tp);
2562} 2552}
@@ -2568,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2568 struct tcp_sock *tp = tcp_sk(sk); 2558 struct tcp_sock *tp = tcp_sk(sk);
2569 2559
2570 if (tcp_is_reno(tp)) { 2560 if (tcp_is_reno(tp)) {
2571 tcp_mark_head_lost(sk, 1); 2561 tcp_mark_head_lost(sk, 1, 1);
2572 } else if (tcp_is_fack(tp)) { 2562 } else if (tcp_is_fack(tp)) {
2573 int lost = tp->fackets_out - tp->reordering; 2563 int lost = tp->fackets_out - tp->reordering;
2574 if (lost <= 0) 2564 if (lost <= 0)
2575 lost = 1; 2565 lost = 1;
2576 tcp_mark_head_lost(sk, lost); 2566 tcp_mark_head_lost(sk, lost, 0);
2577 } else { 2567 } else {
2578 int sacked_upto = tp->sacked_out - tp->reordering; 2568 int sacked_upto = tp->sacked_out - tp->reordering;
2579 if (sacked_upto < fast_rexmit) 2569 if (sacked_upto >= 0)
2580 sacked_upto = fast_rexmit; 2570 tcp_mark_head_lost(sk, sacked_upto, 0);
2581 tcp_mark_head_lost(sk, sacked_upto); 2571 else if (fast_rexmit)
2572 tcp_mark_head_lost(sk, 1, 1);
2582 } 2573 }
2583 2574
2584 tcp_timeout_skbs(sk); 2575 tcp_timeout_skbs(sk);
@@ -2887,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
2887 icsk->icsk_mtup.probe_size; 2878 icsk->icsk_mtup.probe_size;
2888 tp->snd_cwnd_cnt = 0; 2879 tp->snd_cwnd_cnt = 0;
2889 tp->snd_cwnd_stamp = tcp_time_stamp; 2880 tp->snd_cwnd_stamp = tcp_time_stamp;
2890 tp->rcv_ssthresh = tcp_current_ssthresh(sk); 2881 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2891 2882
2892 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 2883 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2893 icsk->icsk_mtup.probe_size = 0; 2884 icsk->icsk_mtup.probe_size = 0;
@@ -2984,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2984 before(tp->snd_una, tp->high_seq) && 2975 before(tp->snd_una, tp->high_seq) &&
2985 icsk->icsk_ca_state != TCP_CA_Open && 2976 icsk->icsk_ca_state != TCP_CA_Open &&
2986 tp->fackets_out > tp->reordering) { 2977 tp->fackets_out > tp->reordering) {
2987 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); 2978 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2988 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); 2979 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2989 } 2980 }
2990 2981
@@ -3412,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
3412 3403
3413static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3404static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3414{ 3405{
3415 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3406 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3416 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3407 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3417} 3408}
3418 3409
3419static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3410static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3430,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3430 const u32 ack, const u32 ack_seq, 3421 const u32 ack, const u32 ack_seq,
3431 const u32 nwin) 3422 const u32 nwin)
3432{ 3423{
3433 return (after(ack, tp->snd_una) || 3424 return after(ack, tp->snd_una) ||
3434 after(ack_seq, tp->snd_wl1) || 3425 after(ack_seq, tp->snd_wl1) ||
3435 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3426 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3436} 3427}
3437 3428
3438/* Update our send window. 3429/* Update our send window.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..8f8527d41682 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1422 1422
1423 newsk = tcp_create_openreq_child(sk, req, skb); 1423 newsk = tcp_create_openreq_child(sk, req, skb);
1424 if (!newsk) 1424 if (!newsk)
1425 goto exit; 1425 goto exit_nonewsk;
1426 1426
1427 newsk->sk_gso_type = SKB_GSO_TCPV4; 1427 newsk->sk_gso_type = SKB_GSO_TCPV4;
1428 sk_setup_caps(newsk, dst); 1428 sk_setup_caps(newsk, dst);
@@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1469 } 1469 }
1470#endif 1470#endif
1471 1471
1472 if (__inet_inherit_port(sk, newsk) < 0) {
1473 sock_put(newsk);
1474 goto exit;
1475 }
1472 __inet_hash_nolisten(newsk, NULL); 1476 __inet_hash_nolisten(newsk, NULL);
1473 __inet_inherit_port(sk, newsk);
1474 1477
1475 return newsk; 1478 return newsk;
1476 1479
1477exit_overflow: 1480exit_overflow:
1478 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1481 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1482exit_nonewsk:
1483 dst_release(dst);
1479exit: 1484exit:
1480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1485 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481 dst_release(dst);
1482 return NULL; 1486 return NULL;
1483} 1487}
1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1488EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -2571,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2571 2575
2572 return tcp_gro_receive(head, skb); 2576 return tcp_gro_receive(head, skb);
2573} 2577}
2574EXPORT_SYMBOL(tcp4_gro_receive);
2575 2578
2576int tcp4_gro_complete(struct sk_buff *skb) 2579int tcp4_gro_complete(struct sk_buff *skb)
2577{ 2580{
@@ -2584,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2584 2587
2585 return tcp_gro_complete(skb); 2588 return tcp_gro_complete(skb);
2586} 2589}
2587EXPORT_SYMBOL(tcp4_gro_complete);
2588 2590
2589struct proto tcp_prot = { 2591struct proto tcp_prot = {
2590 .name = "TCP", 2592 .name = "TCP",
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..43cf901d7659 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
55 return 1; 55 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 56 if (after(end_seq, s_win) && before(seq, e_win))
57 return 1; 57 return 1;
58 return (seq == e_win && seq == end_seq); 58 return seq == e_win && seq == end_seq;
59} 59}
60 60
61/* 61/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..05b1ecf36763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 224 }
225 } 225 }
226 226
227 /* Set initial window to value enough for senders, 227 /* Set initial window to value enough for senders, following RFC5681. */
228 * following RFC2414. Senders, not following this RFC,
229 * will be satisfied with 2.
230 */
231 if (mss > (1 << *rcv_wscale)) { 228 if (mss > (1 << *rcv_wscale)) {
232 int init_cwnd = 4; 229 int init_cwnd = rfc3390_bytes_to_packets(mss);
233 if (mss > 1460 * 3) 230
234 init_cwnd = 2;
235 else if (mss > 1460)
236 init_cwnd = 3;
237 /* when initializing use the value from init_rcv_wnd 231 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above 232 * rather than the default from above
239 */ 233 */
@@ -1376,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1376 const struct sk_buff *skb, 1370 const struct sk_buff *skb,
1377 unsigned mss_now, int nonagle) 1371 unsigned mss_now, int nonagle)
1378{ 1372{
1379 return (skb->len < mss_now && 1373 return skb->len < mss_now &&
1380 ((nonagle & TCP_NAGLE_CORK) || 1374 ((nonagle & TCP_NAGLE_CORK) ||
1381 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1375 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1382} 1376}
1383 1377
1384/* Return non-zero if the Nagle test allows this packet to be 1378/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
1449 struct tcp_sock *tp = tcp_sk(sk); 1443 struct tcp_sock *tp = tcp_sk(sk);
1450 struct sk_buff *skb = tcp_send_head(sk); 1444 struct sk_buff *skb = tcp_send_head(sk);
1451 1445
1452 return (skb && 1446 return skb &&
1453 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1447 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1454 (tcp_skb_is_last(sk, skb) ? 1448 (tcp_skb_is_last(sk, skb) ?
1455 tp->nonagle : TCP_NAGLE_PUSH))); 1449 tp->nonagle : TCP_NAGLE_PUSH));
1456} 1450}
1457 1451
1458/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1452/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2429 __u8 rcv_wscale; 2423 __u8 rcv_wscale;
2430 /* Set this up on the first call only */ 2424 /* Set this up on the first call only */
2431 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2425 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2426
2427 /* limit the window selection if the user enforce a smaller rx buffer */
2428 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2429 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2430 req->window_clamp = tcp_full_space(sk);
2431
2432 /* tcp_full_space because it is guaranteed to be the first packet */ 2432 /* tcp_full_space because it is guaranteed to be the first packet */
2433 tcp_select_initial_window(tcp_full_space(sk), 2433 tcp_select_initial_window(tcp_full_space(sk),
2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
2555 2555
2556 tcp_initialize_rcv_mss(sk); 2556 tcp_initialize_rcv_mss(sk);
2557 2557
2558 /* limit the window selection if the user enforce a smaller rx buffer */
2559 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2560 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2561 tp->window_clamp = tcp_full_space(sk);
2562
2558 tcp_select_initial_window(tcp_full_space(sk), 2563 tcp_select_initial_window(tcp_full_space(sk),
2559 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2564 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2560 &tp->rcv_wnd, 2565 &tp->rcv_wnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74c54b30600f..74a6aa003657 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
140 */ 140 */
141static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary, 142 unsigned int boundary,
143 unsigned int timeout,
143 bool syn_set) 144 bool syn_set)
144{ 145{
145 unsigned int timeout, linear_backoff_thresh; 146 unsigned int linear_backoff_thresh, start_ts;
146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; 147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
148 148
149 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
@@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
154 else 154 else
155 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
156 156
157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 157 if (likely(timeout == 0)) {
158 158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
159 if (boundary <= linear_backoff_thresh)
160 timeout = ((2 << boundary) - 1) * rto_base;
161 else
162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
164 159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
165 return (tcp_time_stamp - start_ts) >= timeout; 166 return (tcp_time_stamp - start_ts) >= timeout;
166} 167}
167 168
@@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1; 180 syn_set = 1;
180 } else { 181 } else {
181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
182 /* Black hole detection */ 183 /* Black hole detection */
183 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
184 185
@@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
191 192
192 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
193 do_reset = alive || 194 do_reset = alive ||
194 !retransmits_timed_out(sk, retry_until, 0); 195 !retransmits_timed_out(sk, retry_until, 0, 0);
195 196
196 if (tcp_out_of_resources(sk, do_reset)) 197 if (tcp_out_of_resources(sk, do_reset))
197 return 1; 198 return 1;
198 } 199 }
199 } 200 }
200 201
201 if (retransmits_timed_out(sk, retry_until, syn_set)) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
202 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
203 tcp_write_err(sk); 205 tcp_write_err(sk);
204 return 1; 206 return 1;
@@ -365,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
365 if (icsk->icsk_retransmits == 0) { 367 if (icsk->icsk_retransmits == 0) {
366 int mib_idx; 368 int mib_idx;
367 369
368 if (icsk->icsk_ca_state == TCP_CA_Disorder) { 370 if (icsk->icsk_ca_state == TCP_CA_Recovery) {
369 if (tcp_is_sack(tp))
370 mib_idx = LINUX_MIB_TCPSACKFAILURES;
371 else
372 mib_idx = LINUX_MIB_TCPRENOFAILURES;
373 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
374 if (tcp_is_sack(tp)) 371 if (tcp_is_sack(tp))
375 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 372 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
376 else 373 else
377 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 374 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
378 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 375 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
379 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 376 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
377 } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
378 tp->sacked_out) {
379 if (tcp_is_sack(tp))
380 mib_idx = LINUX_MIB_TCPSACKFAILURES;
381 else
382 mib_idx = LINUX_MIB_TCPRENOFAILURES;
380 } else { 383 } else {
381 mib_idx = LINUX_MIB_TCPTIMEOUTS; 384 mib_idx = LINUX_MIB_TCPTIMEOUTS;
382 } 385 }
@@ -440,7 +443,7 @@ out_reset_timer:
440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 443 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
441 } 444 }
442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 445 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) 446 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
444 __sk_dst_reset(sk); 447 __sk_dst_reset(sk);
445 448
446out:; 449out:;
@@ -560,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data)
560 elapsed = keepalive_time_elapsed(tp); 563 elapsed = keepalive_time_elapsed(tp);
561 564
562 if (elapsed >= keepalive_time_when(tp)) { 565 if (elapsed >= keepalive_time_when(tp)) {
563 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 566 /* If the TCP_USER_TIMEOUT option is enabled, use that
567 * to determine when to timeout instead.
568 */
569 if ((icsk->icsk_user_timeout != 0 &&
570 elapsed >= icsk->icsk_user_timeout &&
571 icsk->icsk_probes_out > 0) ||
572 (icsk->icsk_user_timeout == 0 &&
573 icsk->icsk_probes_out >= keepalive_probes(tp))) {
564 tcp_send_active_reset(sk, GFP_ATOMIC); 574 tcp_send_active_reset(sk, GFP_ATOMIC);
565 tcp_write_err(sk); 575 tcp_write_err(sk);
566 goto out; 576 goto out;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..9a17bd2a0a37 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,8 +14,8 @@
14#include <net/protocol.h> 14#include <net/protocol.h>
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16 16
17static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel *tunnel4_handlers __read_mostly;
18static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel *tunnel64_handlers __read_mostly;
19static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
20 20
21static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
@@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
39 } 39 }
40 40
41 handler->next = *pprev; 41 handler->next = *pprev;
42 *pprev = handler; 42 rcu_assign_pointer(*pprev, handler);
43 43
44 ret = 0; 44 ret = 0;
45 45
@@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
73} 73}
74EXPORT_SYMBOL(xfrm4_tunnel_deregister); 74EXPORT_SYMBOL(xfrm4_tunnel_deregister);
75 75
76#define for_each_tunnel_rcu(head, handler) \
77 for (handler = rcu_dereference(head); \
78 handler != NULL; \
79 handler = rcu_dereference(handler->next)) \
80
76static int tunnel4_rcv(struct sk_buff *skb) 81static int tunnel4_rcv(struct sk_buff *skb)
77{ 82{
78 struct xfrm_tunnel *handler; 83 struct xfrm_tunnel *handler;
@@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
80 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 85 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
81 goto drop; 86 goto drop;
82 87
83 for (handler = tunnel4_handlers; handler; handler = handler->next) 88 for_each_tunnel_rcu(tunnel4_handlers, handler)
84 if (!handler->handler(skb)) 89 if (!handler->handler(skb))
85 return 0; 90 return 0;
86 91
@@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
99 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 104 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
100 goto drop; 105 goto drop;
101 106
102 for (handler = tunnel64_handlers; handler; handler = handler->next) 107 for_each_tunnel_rcu(tunnel64_handlers, handler)
103 if (!handler->handler(skb)) 108 if (!handler->handler(skb))
104 return 0; 109 return 0;
105 110
@@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
115{ 120{
116 struct xfrm_tunnel *handler; 121 struct xfrm_tunnel *handler;
117 122
118 for (handler = tunnel4_handlers; handler; handler = handler->next) 123 for_each_tunnel_rcu(tunnel4_handlers, handler)
119 if (!handler->err_handler(skb, info)) 124 if (!handler->err_handler(skb, info))
120 break; 125 break;
121} 126}
@@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
125{ 130{
126 struct xfrm_tunnel *handler; 131 struct xfrm_tunnel *handler;
127 132
128 for (handler = tunnel64_handlers; handler; handler = handler->next) 133 for_each_tunnel_rcu(tunnel64_handlers, handler)
129 if (!handler->err_handler(skb, info)) 134 if (!handler->err_handler(skb, info))
130 break; 135 break;
131} 136}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fb23c2e63b52..b3f7e8cf18ac 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 return -EOPNOTSUPP; 797 return -EOPNOTSUPP;
798 798
799 ipc.opt = NULL; 799 ipc.opt = NULL;
800 ipc.shtx.flags = 0; 800 ipc.tx_flags = 0;
801 801
802 if (up->pending) { 802 if (up->pending) {
803 /* 803 /*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
845 ipc.addr = inet->inet_saddr; 845 ipc.addr = inet->inet_saddr;
846 846
847 ipc.oif = sk->sk_bound_dev_if; 847 ipc.oif = sk->sk_bound_dev_if;
848 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 848 err = sock_tx_timestamp(sk, &ipc.tx_flags);
849 if (err) 849 if (err)
850 return err; 850 return err;
851 if (msg->msg_controllen) { 851 if (msg->msg_controllen) {
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a580349f0b8a..4464f3bff6a7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
175 175
176 xfrm4_policy_afinfo.garbage_collect(net); 176 xfrm4_policy_afinfo.garbage_collect(net);
177 return (atomic_read(&ops->entries) > ops->gc_thresh * 2); 177 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
178} 178}
179 179
180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
232 .ifdown = xfrm4_dst_ifdown, 232 .ifdown = xfrm4_dst_ifdown,
233 .local_out = __ip_local_out, 233 .local_out = __ip_local_out,
234 .gc_thresh = 1024, 234 .gc_thresh = 1024,
235 .entries = ATOMIC_INIT(0),
236}; 235};
237 236
238static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 237static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
288 * and start cleaning when were 1/2 full 287 * and start cleaning when were 1/2 full
289 */ 288 */
290 xfrm4_dst_ops.gc_thresh = rt_max_size/2; 289 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
290 dst_entries_init(&xfrm4_dst_ops);
291 291
292 xfrm4_state_init(); 292 xfrm4_state_init();
293 xfrm4_policy_init(); 293 xfrm4_policy_init();
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,