about summary refs log tree commit diff stats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig 8
-rw-r--r-- net/ipv4/Makefile 1
-rw-r--r-- net/ipv4/af_inet.c 8
-rw-r--r-- net/ipv4/arp.c 231
-rw-r--r-- net/ipv4/datagram.c 2
-rw-r--r-- net/ipv4/devinet.c 4
-rw-r--r-- net/ipv4/fib_frontend.c 35
-rw-r--r-- net/ipv4/fib_trie.c 55
-rw-r--r-- net/ipv4/gre.c 151
-rw-r--r-- net/ipv4/icmp.c 4
-rw-r--r-- net/ipv4/inet_diag.c 2
-rw-r--r-- net/ipv4/ip_fragment.c 6
-rw-r--r-- net/ipv4/ip_gre.c 236
-rw-r--r-- net/ipv4/ip_options.c 3
-rw-r--r-- net/ipv4/ip_output.c 24
-rw-r--r-- net/ipv4/ipip.c 215
-rw-r--r-- net/ipv4/ipmr.c 428
-rw-r--r-- net/ipv4/netfilter/arp_tables.c 2
-rw-r--r-- net/ipv4/netfilter/ipt_CLUSTERIP.c 31
-rw-r--r-- net/ipv4/protocol.c 31
-rw-r--r-- net/ipv4/raw.c 2
-rw-r--r-- net/ipv4/route.c 83
-rw-r--r-- net/ipv4/tcp.c 11
-rw-r--r-- net/ipv4/tcp_input.c 29
-rw-r--r-- net/ipv4/tcp_ipv4.c 2
-rw-r--r-- net/ipv4/tcp_minisocks.c 2
-rw-r--r-- net/ipv4/tcp_output.c 31
-rw-r--r-- net/ipv4/tcp_timer.c 37
-rw-r--r-- net/ipv4/tcp_westwood.c 2
-rw-r--r-- net/ipv4/tunnel4.c 19
-rw-r--r-- net/ipv4/udp.c 4
-rw-r--r-- net/ipv4/xfrm4_tunnel.c 4
32 files changed, 1006 insertions, 697 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7cd7760144f7..e848e6c062cd 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -215,9 +215,15 @@ config NET_IPIP
215 be inserted in and removed from the running kernel whenever you 215 be inserted in and removed from the running kernel whenever you
216 want). Most people won't need this and can say N. 216 want). Most people won't need this and can say N.
217 217
218config NET_IPGRE_DEMUX
219 tristate "IP: GRE demultiplexer"
220 help
221 This is a helper module to demultiplex GRE packets based on the GRE version field.
222 Required by ip_gre and pptp modules.
223
218config NET_IPGRE 224config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 225 tristate "IP: GRE tunnels over IP"
220 depends on IPV6 || IPV6=n 226 depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
221 help 227 help
222 Tunneling means encapsulating data of one protocol type within 228 Tunneling means encapsulating data of one protocol type within
223 another protocol and sending it over a channel that understands the 229 another protocol and sending it over a channel that understands the
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
21obj-$(CONFIG_IP_MROUTE) += ipmr.o 21obj-$(CONFIG_IP_MROUTE) += ipmr.o
22obj-$(CONFIG_NET_IPIP) += ipip.o 22obj-$(CONFIG_NET_IPIP) += ipip.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
23obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 25obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 26obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..f581f77d1097 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
227 227
228/* 228/*
229 * inet_ehash_secret must be set exactly once 229 * inet_ehash_secret must be set exactly once
230 * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
231 */ 230 */
232void build_ehash_secret(void) 231void build_ehash_secret(void)
233{ 232{
234 u32 rnd; 233 u32 rnd;
234
235 do { 235 do {
236 get_random_bytes(&rnd, sizeof(rnd)); 236 get_random_bytes(&rnd, sizeof(rnd));
237 } while (rnd == 0); 237 } while (rnd == 0);
238 spin_lock_bh(&inetsw_lock); 238
239 if (!inet_ehash_secret) 239 cmpxchg(&inet_ehash_secret, 0, rnd);
240 inet_ehash_secret = rnd;
241 spin_unlock_bh(&inetsw_lock);
242} 240}
243EXPORT_SYMBOL(build_ehash_secret); 241EXPORT_SYMBOL(build_ehash_secret);
244 242
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..d9031ad67826 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
55 * Stuart Cheshire : Metricom and grat arp fixes 55 * Stuart Cheshire : Metricom and grat arp fixes
56 * *** FOR 2.1 clean this up *** 56 * *** FOR 2.1 clean this up ***
57 * Lawrence V. Stefani: (08/12/96) Added FDDI support. 57 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
58 * Alan Cox : Took the AP1000 nasty FDDI hack and 58 * Alan Cox : Took the AP1000 nasty FDDI hack and
59 * folded into the mainstream FDDI code. 59 * folded into the mainstream FDDI code.
60 * Ack spit, Linus how did you allow that 60 * Ack spit, Linus how did you allow that
61 * one in... 61 * one in...
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
120#endif 120#endif
121 121
122#include <asm/system.h> 122#include <asm/system.h>
123#include <asm/uaccess.h> 123#include <linux/uaccess.h>
124 124
125#include <linux/netfilter_arp.h> 125#include <linux/netfilter_arp.h>
126 126
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
161 .queue_xmit = dev_queue_xmit, 161 .queue_xmit = dev_queue_xmit,
162}; 162};
163 163
164const struct neigh_ops arp_broken_ops = { 164static const struct neigh_ops arp_broken_ops = {
165 .family = AF_INET, 165 .family = AF_INET,
166 .solicit = arp_solicit, 166 .solicit = arp_solicit,
167 .error_report = arp_error_report, 167 .error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
170 .hh_output = dev_queue_xmit, 170 .hh_output = dev_queue_xmit,
171 .queue_xmit = dev_queue_xmit, 171 .queue_xmit = dev_queue_xmit,
172}; 172};
173EXPORT_SYMBOL(arp_broken_ops);
174 173
175struct neigh_table arp_tbl = { 174struct neigh_table arp_tbl = {
176 .family = AF_INET, 175 .family = AF_INET,
177 .entry_size = sizeof(struct neighbour) + 4, 176 .entry_size = sizeof(struct neighbour) + 4,
178 .key_len = 4, 177 .key_len = 4,
179 .hash = arp_hash, 178 .hash = arp_hash,
180 .constructor = arp_constructor, 179 .constructor = arp_constructor,
181 .proxy_redo = parp_redo, 180 .proxy_redo = parp_redo,
182 .id = "arp_cache", 181 .id = "arp_cache",
183 .parms = { 182 .parms = {
184 .tbl = &arp_tbl, 183 .tbl = &arp_tbl,
185 .base_reachable_time = 30 * HZ, 184 .base_reachable_time = 30 * HZ,
186 .retrans_time = 1 * HZ, 185 .retrans_time = 1 * HZ,
187 .gc_staletime = 60 * HZ, 186 .gc_staletime = 60 * HZ,
188 .reachable_time = 30 * HZ, 187 .reachable_time = 30 * HZ,
189 .delay_probe_time = 5 * HZ, 188 .delay_probe_time = 5 * HZ,
190 .queue_len = 3, 189 .queue_len = 3,
191 .ucast_probes = 3, 190 .ucast_probes = 3,
192 .mcast_probes = 3, 191 .mcast_probes = 3,
193 .anycast_delay = 1 * HZ, 192 .anycast_delay = 1 * HZ,
194 .proxy_delay = (8 * HZ) / 10, 193 .proxy_delay = (8 * HZ) / 10,
195 .proxy_qlen = 64, 194 .proxy_qlen = 64,
196 .locktime = 1 * HZ, 195 .locktime = 1 * HZ,
197 }, 196 },
198 .gc_interval = 30 * HZ, 197 .gc_interval = 30 * HZ,
199 .gc_thresh1 = 128, 198 .gc_thresh1 = 128,
200 .gc_thresh2 = 512, 199 .gc_thresh2 = 512,
201 .gc_thresh3 = 1024, 200 .gc_thresh3 = 1024,
202}; 201};
203EXPORT_SYMBOL(arp_tbl); 202EXPORT_SYMBOL(arp_tbl);
204 203
@@ -233,7 +232,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev)
233 232
234static int arp_constructor(struct neighbour *neigh) 233static int arp_constructor(struct neighbour *neigh)
235{ 234{
236 __be32 addr = *(__be32*)neigh->primary_key; 235 __be32 addr = *(__be32 *)neigh->primary_key;
237 struct net_device *dev = neigh->dev; 236 struct net_device *dev = neigh->dev;
238 struct in_device *in_dev; 237 struct in_device *in_dev;
239 struct neigh_parms *parms; 238 struct neigh_parms *parms;
@@ -296,16 +295,19 @@ static int arp_constructor(struct neighbour *neigh)
296 neigh->ops = &arp_broken_ops; 295 neigh->ops = &arp_broken_ops;
297 neigh->output = neigh->ops->output; 296 neigh->output = neigh->ops->output;
298 return 0; 297 return 0;
298#else
299 break;
299#endif 300#endif
300 ;} 301 }
301#endif 302#endif
302 if (neigh->type == RTN_MULTICAST) { 303 if (neigh->type == RTN_MULTICAST) {
303 neigh->nud_state = NUD_NOARP; 304 neigh->nud_state = NUD_NOARP;
304 arp_mc_map(addr, neigh->ha, dev, 1); 305 arp_mc_map(addr, neigh->ha, dev, 1);
305 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { 306 } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
306 neigh->nud_state = NUD_NOARP; 307 neigh->nud_state = NUD_NOARP;
307 memcpy(neigh->ha, dev->dev_addr, dev->addr_len); 308 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
308 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { 309 } else if (neigh->type == RTN_BROADCAST ||
310 (dev->flags & IFF_POINTOPOINT)) {
309 neigh->nud_state = NUD_NOARP; 311 neigh->nud_state = NUD_NOARP;
310 memcpy(neigh->ha, dev->broadcast, dev->addr_len); 312 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
311 } 313 }
@@ -315,7 +317,7 @@ static int arp_constructor(struct neighbour *neigh)
315 else 317 else
316 neigh->ops = &arp_generic_ops; 318 neigh->ops = &arp_generic_ops;
317 319
318 if (neigh->nud_state&NUD_VALID) 320 if (neigh->nud_state & NUD_VALID)
319 neigh->output = neigh->ops->connected_output; 321 neigh->output = neigh->ops->connected_output;
320 else 322 else
321 neigh->output = neigh->ops->output; 323 neigh->output = neigh->ops->output;
@@ -334,7 +336,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
334 __be32 saddr = 0; 336 __be32 saddr = 0;
335 u8 *dst_ha = NULL; 337 u8 *dst_ha = NULL;
336 struct net_device *dev = neigh->dev; 338 struct net_device *dev = neigh->dev;
337 __be32 target = *(__be32*)neigh->primary_key; 339 __be32 target = *(__be32 *)neigh->primary_key;
338 int probes = atomic_read(&neigh->probes); 340 int probes = atomic_read(&neigh->probes);
339 struct in_device *in_dev; 341 struct in_device *in_dev;
340 342
@@ -347,7 +349,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
347 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { 349 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
348 default: 350 default:
349 case 0: /* By default announce any local IP */ 351 case 0: /* By default announce any local IP */
350 if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) 352 if (skb && inet_addr_type(dev_net(dev),
353 ip_hdr(skb)->saddr) == RTN_LOCAL)
351 saddr = ip_hdr(skb)->saddr; 354 saddr = ip_hdr(skb)->saddr;
352 break; 355 break;
353 case 1: /* Restrict announcements of saddr in same subnet */ 356 case 1: /* Restrict announcements of saddr in same subnet */
@@ -369,16 +372,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
369 if (!saddr) 372 if (!saddr)
370 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); 373 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
371 374
372 if ((probes -= neigh->parms->ucast_probes) < 0) { 375 probes -= neigh->parms->ucast_probes;
373 if (!(neigh->nud_state&NUD_VALID)) 376 if (probes < 0) {
374 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); 377 if (!(neigh->nud_state & NUD_VALID))
378 printk(KERN_DEBUG
379 "trying to ucast probe in NUD_INVALID\n");
375 dst_ha = neigh->ha; 380 dst_ha = neigh->ha;
376 read_lock_bh(&neigh->lock); 381 read_lock_bh(&neigh->lock);
377 } else if ((probes -= neigh->parms->app_probes) < 0) { 382 } else {
383 probes -= neigh->parms->app_probes;
384 if (probes < 0) {
378#ifdef CONFIG_ARPD 385#ifdef CONFIG_ARPD
379 neigh_app_ns(neigh); 386 neigh_app_ns(neigh);
380#endif 387#endif
381 return; 388 return;
389 }
382 } 390 }
383 391
384 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 392 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -451,7 +459,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
451 * is allowed to use this function, it is scheduled to be removed. --ANK 459 * is allowed to use this function, it is scheduled to be removed. --ANK
452 */ 460 */
453 461
454static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) 462static int arp_set_predefined(int addr_hint, unsigned char *haddr,
463 __be32 paddr, struct net_device *dev)
455{ 464{
456 switch (addr_hint) { 465 switch (addr_hint) {
457 case RTN_LOCAL: 466 case RTN_LOCAL:
@@ -483,7 +492,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
483 492
484 paddr = skb_rtable(skb)->rt_gateway; 493 paddr = skb_rtable(skb)->rt_gateway;
485 494
486 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) 495 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
496 paddr, dev))
487 return 0; 497 return 0;
488 498
489 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); 499 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
@@ -515,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
515 return -EINVAL; 525 return -EINVAL;
516 if (n == NULL) { 526 if (n == NULL) {
517 __be32 nexthop = ((struct rtable *)dst)->rt_gateway; 527 __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
518 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) 528 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
519 nexthop = 0; 529 nexthop = 0;
520 n = __neigh_lookup_errno( 530 n = __neigh_lookup_errno(
521#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 531#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
522 dev->type == ARPHRD_ATM ? clip_tbl_hook : 532 dev->type == ARPHRD_ATM ?
533 clip_tbl_hook :
523#endif 534#endif
524 &arp_tbl, &nexthop, dev); 535 &arp_tbl, &nexthop, dev);
525 if (IS_ERR(n)) 536 if (IS_ERR(n))
526 return PTR_ERR(n); 537 return PTR_ERR(n);
527 dst->neighbour = n; 538 dst->neighbour = n;
@@ -543,8 +554,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
543 554
544 if (!IN_DEV_PROXY_ARP(in_dev)) 555 if (!IN_DEV_PROXY_ARP(in_dev))
545 return 0; 556 return 0;
546 557 imi = IN_DEV_MEDIUM_ID(in_dev);
547 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) 558 if (imi == 0)
548 return 1; 559 return 1;
549 if (imi == -1) 560 if (imi == -1)
550 return 0; 561 return 0;
@@ -555,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
555 if (out_dev) 566 if (out_dev)
556 omi = IN_DEV_MEDIUM_ID(out_dev); 567 omi = IN_DEV_MEDIUM_ID(out_dev);
557 568
558 return (omi != imi && omi != -1); 569 return omi != imi && omi != -1;
559} 570}
560 571
561/* 572/*
@@ -685,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
685 arp->ar_pln = 4; 696 arp->ar_pln = 4;
686 arp->ar_op = htons(type); 697 arp->ar_op = htons(type);
687 698
688 arp_ptr=(unsigned char *)(arp+1); 699 arp_ptr = (unsigned char *)(arp + 1);
689 700
690 memcpy(arp_ptr, src_hw, dev->addr_len); 701 memcpy(arp_ptr, src_hw, dev->addr_len);
691 arp_ptr += dev->addr_len; 702 arp_ptr += dev->addr_len;
@@ -735,9 +746,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
735 746
736 skb = arp_create(type, ptype, dest_ip, dev, src_ip, 747 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
737 dest_hw, src_hw, target_hw); 748 dest_hw, src_hw, target_hw);
738 if (skb == NULL) { 749 if (skb == NULL)
739 return; 750 return;
740 }
741 751
742 arp_xmit(skb); 752 arp_xmit(skb);
743} 753}
@@ -815,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
815/* 825/*
816 * Extract fields 826 * Extract fields
817 */ 827 */
818 arp_ptr= (unsigned char *)(arp+1); 828 arp_ptr = (unsigned char *)(arp + 1);
819 sha = arp_ptr; 829 sha = arp_ptr;
820 arp_ptr += dev->addr_len; 830 arp_ptr += dev->addr_len;
821 memcpy(&sip, arp_ptr, 4); 831 memcpy(&sip, arp_ptr, 4);
@@ -869,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
869 addr_type = rt->rt_type; 879 addr_type = rt->rt_type;
870 880
871 if (addr_type == RTN_LOCAL) { 881 if (addr_type == RTN_LOCAL) {
872 int dont_send = 0; 882 int dont_send;
873 883
874 if (!dont_send) 884 dont_send = arp_ignore(in_dev, sip, tip);
875 dont_send |= arp_ignore(in_dev,sip,tip);
876 if (!dont_send && IN_DEV_ARPFILTER(in_dev)) 885 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
877 dont_send |= arp_filter(sip,tip,dev); 886 dont_send |= arp_filter(sip, tip, dev);
878 if (!dont_send) { 887 if (!dont_send) {
879 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
880 if (n) { 889 if (n) {
881 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 890 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
891 dev, tip, sha, dev->dev_addr,
892 sha);
882 neigh_release(n); 893 neigh_release(n);
883 } 894 }
884 } 895 }
@@ -887,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
887 if (addr_type == RTN_UNICAST && 898 if (addr_type == RTN_UNICAST &&
888 (arp_fwd_proxy(in_dev, dev, rt) || 899 (arp_fwd_proxy(in_dev, dev, rt) ||
889 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || 900 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
890 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) 901 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
891 {
892 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 902 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
893 if (n) 903 if (n)
894 neigh_release(n); 904 neigh_release(n);
@@ -896,9 +906,12 @@ static int arp_process(struct sk_buff *skb)
896 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 906 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
897 skb->pkt_type == PACKET_HOST || 907 skb->pkt_type == PACKET_HOST ||
898 in_dev->arp_parms->proxy_delay == 0) { 908 in_dev->arp_parms->proxy_delay == 0) {
899 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 909 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
910 dev, tip, sha, dev->dev_addr,
911 sha);
900 } else { 912 } else {
901 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 913 pneigh_enqueue(&arp_tbl,
914 in_dev->arp_parms, skb);
902 return 0; 915 return 0;
903 } 916 }
904 goto out; 917 goto out;
@@ -939,7 +952,8 @@ static int arp_process(struct sk_buff *skb)
939 if (arp->ar_op != htons(ARPOP_REPLY) || 952 if (arp->ar_op != htons(ARPOP_REPLY) ||
940 skb->pkt_type != PACKET_HOST) 953 skb->pkt_type != PACKET_HOST)
941 state = NUD_STALE; 954 state = NUD_STALE;
942 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); 955 neigh_update(n, sha, state,
956 override ? NEIGH_UPDATE_F_OVERRIDE : 0);
943 neigh_release(n); 957 neigh_release(n);
944 } 958 }
945 959
@@ -975,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
975 arp->ar_pln != 4) 989 arp->ar_pln != 4)
976 goto freeskb; 990 goto freeskb;
977 991
978 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 992 skb = skb_share_check(skb, GFP_ATOMIC);
993 if (skb == NULL)
979 goto out_of_mem; 994 goto out_of_mem;
980 995
981 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); 996 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1019,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1019 return -EINVAL; 1034 return -EINVAL;
1020 if (!dev && (r->arp_flags & ATF_COM)) { 1035 if (!dev && (r->arp_flags & ATF_COM)) {
1021 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, 1036 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
1022 r->arp_ha.sa_data); 1037 r->arp_ha.sa_data);
1023 if (!dev) 1038 if (!dev)
1024 return -ENODEV; 1039 return -ENODEV;
1025 } 1040 }
@@ -1033,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
1033} 1048}
1034 1049
1035static int arp_req_set(struct net *net, struct arpreq *r, 1050static int arp_req_set(struct net *net, struct arpreq *r,
1036 struct net_device * dev) 1051 struct net_device *dev)
1037{ 1052{
1038 __be32 ip; 1053 __be32 ip;
1039 struct neighbour *neigh; 1054 struct neighbour *neigh;
@@ -1046,10 +1061,11 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1046 if (r->arp_flags & ATF_PERM) 1061 if (r->arp_flags & ATF_PERM)
1047 r->arp_flags |= ATF_COM; 1062 r->arp_flags |= ATF_COM;
1048 if (dev == NULL) { 1063 if (dev == NULL) {
1049 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1064 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1050 .tos = RTO_ONLINK } } }; 1065 .tos = RTO_ONLINK } };
1051 struct rtable * rt; 1066 struct rtable *rt;
1052 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1067 err = ip_route_output_key(net, &rt, &fl);
1068 if (err != 0)
1053 return err; 1069 return err;
1054 dev = rt->dst.dev; 1070 dev = rt->dst.dev;
1055 ip_rt_put(rt); 1071 ip_rt_put(rt);
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1083 unsigned state = NUD_STALE; 1099 unsigned state = NUD_STALE;
1084 if (r->arp_flags & ATF_PERM) 1100 if (r->arp_flags & ATF_PERM)
1085 state = NUD_PERMANENT; 1101 state = NUD_PERMANENT;
1086 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? 1102 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
1087 r->arp_ha.sa_data : NULL, state, 1103 r->arp_ha.sa_data : NULL, state,
1088 NEIGH_UPDATE_F_OVERRIDE| 1104 NEIGH_UPDATE_F_OVERRIDE |
1089 NEIGH_UPDATE_F_ADMIN); 1105 NEIGH_UPDATE_F_ADMIN);
1090 neigh_release(neigh); 1106 neigh_release(neigh);
1091 } 1107 }
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1094 1110
1095static unsigned arp_state_to_flags(struct neighbour *neigh) 1111static unsigned arp_state_to_flags(struct neighbour *neigh)
1096{ 1112{
1097 unsigned flags = 0;
1098 if (neigh->nud_state&NUD_PERMANENT) 1113 if (neigh->nud_state&NUD_PERMANENT)
1099 flags = ATF_PERM|ATF_COM; 1114 return ATF_PERM | ATF_COM;
1100 else if (neigh->nud_state&NUD_VALID) 1115 else if (neigh->nud_state&NUD_VALID)
1101 flags = ATF_COM; 1116 return ATF_COM;
1102 return flags; 1117 else
1118 return 0;
1103} 1119}
1104 1120
1105/* 1121/*
@@ -1142,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
1142} 1158}
1143 1159
1144static int arp_req_delete(struct net *net, struct arpreq *r, 1160static int arp_req_delete(struct net *net, struct arpreq *r,
1145 struct net_device * dev) 1161 struct net_device *dev)
1146{ 1162{
1147 int err; 1163 int err;
1148 __be32 ip; 1164 __be32 ip;
@@ -1153,10 +1169,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1153 1169
1154 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; 1170 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1155 if (dev == NULL) { 1171 if (dev == NULL) {
1156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1172 struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
1157 .tos = RTO_ONLINK } } }; 1173 .tos = RTO_ONLINK } };
1158 struct rtable * rt; 1174 struct rtable *rt;
1159 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1175 err = ip_route_output_key(net, &rt, &fl);
1176 if (err != 0)
1160 return err; 1177 return err;
1161 dev = rt->dst.dev; 1178 dev = rt->dst.dev;
1162 ip_rt_put(rt); 1179 ip_rt_put(rt);
@@ -1166,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1166 err = -ENXIO; 1183 err = -ENXIO;
1167 neigh = neigh_lookup(&arp_tbl, &ip, dev); 1184 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1168 if (neigh) { 1185 if (neigh) {
1169 if (neigh->nud_state&~NUD_NOARP) 1186 if (neigh->nud_state & ~NUD_NOARP)
1170 err = neigh_update(neigh, NULL, NUD_FAILED, 1187 err = neigh_update(neigh, NULL, NUD_FAILED,
1171 NEIGH_UPDATE_F_OVERRIDE| 1188 NEIGH_UPDATE_F_OVERRIDE|
1172 NEIGH_UPDATE_F_ADMIN); 1189 NEIGH_UPDATE_F_ADMIN);
@@ -1186,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1186 struct net_device *dev = NULL; 1203 struct net_device *dev = NULL;
1187 1204
1188 switch (cmd) { 1205 switch (cmd) {
1189 case SIOCDARP: 1206 case SIOCDARP:
1190 case SIOCSARP: 1207 case SIOCSARP:
1191 if (!capable(CAP_NET_ADMIN)) 1208 if (!capable(CAP_NET_ADMIN))
1192 return -EPERM; 1209 return -EPERM;
1193 case SIOCGARP: 1210 case SIOCGARP:
1194 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1211 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1195 if (err) 1212 if (err)
1196 return -EFAULT; 1213 return -EFAULT;
1197 break; 1214 break;
1198 default: 1215 default:
1199 return -EINVAL; 1216 return -EINVAL;
1200 } 1217 }
1201 1218
1202 if (r.arp_pa.sa_family != AF_INET) 1219 if (r.arp_pa.sa_family != AF_INET)
1203 return -EPFNOSUPPORT; 1220 return -EPFNOSUPPORT;
1204 1221
1205 if (!(r.arp_flags & ATF_PUBL) && 1222 if (!(r.arp_flags & ATF_PUBL) &&
1206 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) 1223 (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
1207 return -EINVAL; 1224 return -EINVAL;
1208 if (!(r.arp_flags & ATF_NETMASK)) 1225 if (!(r.arp_flags & ATF_NETMASK))
1209 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = 1226 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1211 rtnl_lock(); 1228 rtnl_lock();
1212 if (r.arp_dev[0]) { 1229 if (r.arp_dev[0]) {
1213 err = -ENODEV; 1230 err = -ENODEV;
1214 if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) 1231 dev = __dev_get_by_name(net, r.arp_dev);
1232 if (dev == NULL)
1215 goto out; 1233 goto out;
1216 1234
1217 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ 1235 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1243,7 +1261,8 @@ out:
1243 return err; 1261 return err;
1244} 1262}
1245 1263
1246static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 1264static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1265 void *ptr)
1247{ 1266{
1248 struct net_device *dev = ptr; 1267 struct net_device *dev = ptr;
1249 1268
@@ -1311,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
1311 for (n = 0, s = buf; n < 6; n++) { 1330 for (n = 0, s = buf; n < 6; n++) {
1312 c = (a->ax25_call[n] >> 1) & 0x7F; 1331 c = (a->ax25_call[n] >> 1) & 0x7F;
1313 1332
1314 if (c != ' ') *s++ = c; 1333 if (c != ' ')
1334 *s++ = c;
1315 } 1335 }
1316 1336
1317 *s++ = '-'; 1337 *s++ = '-';
1318 1338 n = (a->ax25_call[6] >> 1) & 0x0F;
1319 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { 1339 if (n > 9) {
1320 *s++ = '1'; 1340 *s++ = '1';
1321 n -= 10; 1341 n -= 10;
1322 } 1342 }
@@ -1325,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
1325 *s++ = '\0'; 1345 *s++ = '\0';
1326 1346
1327 if (*buf == '\0' || *buf == '-') 1347 if (*buf == '\0' || *buf == '-')
1328 return "*"; 1348 return "*";
1329 1349
1330 return buf; 1350 return buf;
1331
1332} 1351}
1333#endif /* CONFIG_AX25 */ 1352#endif /* CONFIG_AX25 */
1334 1353
@@ -1408,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1408/* ------------------------------------------------------------------------ */ 1427/* ------------------------------------------------------------------------ */
1409 1428
1410static const struct seq_operations arp_seq_ops = { 1429static const struct seq_operations arp_seq_ops = {
1411 .start = arp_seq_start, 1430 .start = arp_seq_start,
1412 .next = neigh_seq_next, 1431 .next = neigh_seq_next,
1413 .stop = neigh_seq_stop, 1432 .stop = neigh_seq_stop,
1414 .show = arp_seq_show, 1433 .show = arp_seq_show,
1415}; 1434};
1416 1435
1417static int arp_seq_open(struct inode *inode, struct file *file) 1436static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45c..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
73 inet->inet_id = jiffies; 73 inet->inet_id = jiffies;
74 74
75 sk_dst_set(sk, &rt->dst); 75 sk_dst_set(sk, &rt->dst);
76 return(0); 76 return 0;
77} 77}
78EXPORT_SYMBOL(ip4_datagram_connect); 78EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..c2ff48fa18c7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
209 inet_free_ifa(ifa); 209 inet_free_ifa(ifa);
210 } 210 }
211 211
212 dev->ip_ptr = NULL; 212 rcu_assign_pointer(dev->ip_ptr, NULL);
213 213
214 devinet_sysctl_unregister(in_dev); 214 devinet_sysctl_unregister(in_dev);
215 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 215 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -1059,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1059 switch (event) { 1059 switch (event) {
1060 case NETDEV_REGISTER: 1060 case NETDEV_REGISTER:
1061 printk(KERN_DEBUG "inetdev_event: bug\n"); 1061 printk(KERN_DEBUG "inetdev_event: bug\n");
1062 dev->ip_ptr = NULL; 1062 rcu_assign_pointer(dev->ip_ptr, NULL);
1063 break; 1063 break;
1064 case NETDEV_UP: 1064 case NETDEV_UP:
1065 if (!inetdev_valid_mtu(dev->mtu)) 1065 if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999fa..4a69a957872b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,35 +147,40 @@ static void fib_flush(struct net *net)
147 rt_cache_flush(net, -1); 147 rt_cache_flush(net, -1);
148} 148}
149 149
150/* 150/**
151 * Find the first device with a given source address. 151 * __ip_dev_find - find the first device with a given source address.
152 * @net: the net namespace
153 * @addr: the source address
154 * @devref: if true, take a reference on the found device
155 *
156 * If a caller uses devref=false, it should be protected by RCU
152 */ 157 */
153 158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
154struct net_device * ip_dev_find(struct net *net, __be32 addr)
155{ 159{
156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 160 struct flowi fl = {
157 struct fib_result res; 161 .nl_u = {
162 .ip4_u = {
163 .daddr = addr
164 }
165 },
166 .flags = FLOWI_FLAG_MATCH_ANY_IIF
167 };
168 struct fib_result res = { 0 };
158 struct net_device *dev = NULL; 169 struct net_device *dev = NULL;
159 struct fib_table *local_table;
160 170
161#ifdef CONFIG_IP_MULTIPLE_TABLES 171 if (fib_lookup(net, &fl, &res))
162 res.r = NULL;
163#endif
164
165 local_table = fib_get_table(net, RT_TABLE_LOCAL);
166 if (!local_table || fib_table_lookup(local_table, &fl, &res))
167 return NULL; 172 return NULL;
168 if (res.type != RTN_LOCAL) 173 if (res.type != RTN_LOCAL)
169 goto out; 174 goto out;
170 dev = FIB_RES_DEV(res); 175 dev = FIB_RES_DEV(res);
171 176
172 if (dev) 177 if (dev && devref)
173 dev_hold(dev); 178 dev_hold(dev);
174out: 179out:
175 fib_res_put(&res); 180 fib_res_put(&res);
176 return dev; 181 return dev;
177} 182}
178EXPORT_SYMBOL(ip_dev_find); 183EXPORT_SYMBOL(__ip_dev_find);
179 184
180/* 185/*
181 * Find address type as if only "dev" was present in the system. If 186 * Find address type as if only "dev" was present in the system. If
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a8e370862bc..a96e5ec211a0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,9 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
189 return rcu_dereference_check(ret, 189 return rcu_dereference_rtnl(ret);
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
192} 190}
193 191
194/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
@@ -211,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
211{ 209{
212 struct node *ret = tnode_get_child(tn, i); 210 struct node *ret = tnode_get_child(tn, i);
213 211
214 return rcu_dereference_check(ret, 212 return rcu_dereference_rtnl(ret);
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
217} 213}
218 214
219static inline int tnode_child_length(const struct tnode *tn) 215static inline int tnode_child_length(const struct tnode *tn)
@@ -459,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
459 tn->empty_children = 1<<bits; 455 tn->empty_children = 1<<bits;
460 } 456 }
461 457
462 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
463 (unsigned long) (sizeof(struct node) << bits)); 459 sizeof(struct node) << bits);
464 return tn; 460 return tn;
465} 461}
466 462
@@ -609,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
609 605
610 /* Keep root node larger */ 606 /* Keep root node larger */
611 607
612 if (!node_parent((struct node*) tn)) { 608 if (!node_parent((struct node *)tn)) {
613 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
614 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
615 } 611 } else {
616 else {
617 inflate_threshold_use = inflate_threshold; 612 inflate_threshold_use = inflate_threshold;
618 halve_threshold_use = halve_threshold; 613 halve_threshold_use = halve_threshold;
619 } 614 }
@@ -639,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
639 check_tnode(tn); 634 check_tnode(tn);
640 635
641 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
642 if( max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
643 return (struct node *) tn; 638 return (struct node *) tn;
644 639
645 /* 640 /*
@@ -966,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
966 struct node *n; 961 struct node *n;
967 962
968 pos = 0; 963 pos = 0;
969 n = rcu_dereference_check(t->trie, 964 n = rcu_dereference_rtnl(t->trie);
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
972 965
973 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 966 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
974 tn = (struct tnode *) n; 967 tn = (struct tnode *) n;
@@ -1748,16 +1741,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1748 1741
1749 /* Node empty, walk back up to parent */ 1742 /* Node empty, walk back up to parent */
1750 c = (struct node *) p; 1743 c = (struct node *) p;
1751 } while ( (p = node_parent_rcu(c)) != NULL); 1744 } while ((p = node_parent_rcu(c)) != NULL);
1752 1745
1753 return NULL; /* Root of trie */ 1746 return NULL; /* Root of trie */
1754} 1747}
1755 1748
1756static struct leaf *trie_firstleaf(struct trie *t) 1749static struct leaf *trie_firstleaf(struct trie *t)
1757{ 1750{
1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, 1751 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1761 1752
1762 if (!n) 1753 if (!n)
1763 return NULL; 1754 return NULL;
@@ -2043,14 +2034,14 @@ struct fib_trie_iter {
2043 struct seq_net_private p; 2034 struct seq_net_private p;
2044 struct fib_table *tb; 2035 struct fib_table *tb;
2045 struct tnode *tnode; 2036 struct tnode *tnode;
2046 unsigned index; 2037 unsigned int index;
2047 unsigned depth; 2038 unsigned int depth;
2048}; 2039};
2049 2040
2050static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2041static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2051{ 2042{
2052 struct tnode *tn = iter->tnode; 2043 struct tnode *tn = iter->tnode;
2053 unsigned cindex = iter->index; 2044 unsigned int cindex = iter->index;
2054 struct tnode *p; 2045 struct tnode *p;
2055 2046
2056 /* A single entry routing table */ 2047 /* A single entry routing table */
@@ -2159,7 +2150,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2159 */ 2150 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2151static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161{ 2152{
2162 unsigned i, max, pointers, bytes, avdepth; 2153 unsigned int i, max, pointers, bytes, avdepth;
2163 2154
2164 if (stat->leaves) 2155 if (stat->leaves)
2165 avdepth = stat->totdepth*100 / stat->leaves; 2156 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2356,7 +2347,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2356 2347
2357static void seq_indent(struct seq_file *seq, int n) 2348static void seq_indent(struct seq_file *seq, int n)
2358{ 2349{
2359 while (n-- > 0) seq_puts(seq, " "); 2350 while (n-- > 0)
2351 seq_puts(seq, " ");
2360} 2352}
2361 2353
2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2354static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2380,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
2388 [RTN_XRESOLVE] = "XRESOLVE", 2380 [RTN_XRESOLVE] = "XRESOLVE",
2389}; 2381};
2390 2382
2391static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2383static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2392{ 2384{
2393 if (t < __RTN_MAX && rtn_type_names[t]) 2385 if (t < __RTN_MAX && rtn_type_names[t])
2394 return rtn_type_names[t]; 2386 return rtn_type_names[t];
@@ -2544,13 +2536,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2544 rcu_read_unlock(); 2536 rcu_read_unlock();
2545} 2537}
2546 2538
2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2539static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548{ 2540{
2549 static unsigned type2flags[RTN_MAX + 1] = { 2541 unsigned int flags = 0;
2550 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551 };
2552 unsigned flags = type2flags[type];
2553 2542
2543 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2544 flags = RTF_REJECT;
2554 if (fi && fi->fib_nh->nh_gw) 2545 if (fi && fi->fib_nh->nh_gw)
2555 flags |= RTF_GATEWAY; 2546 flags |= RTF_GATEWAY;
2556 if (mask == htonl(0xFFFFFFFF)) 2547 if (mask == htonl(0xFFFFFFFF))
@@ -2562,7 +2553,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2562/* 2553/*
2563 * This outputs /proc/net/route. 2554 * This outputs /proc/net/route.
2564 * The format of the file is not supposed to be changed 2555 * The format of the file is not supposed to be changed
2565 * and needs to be same as fib_hash output to avoid breaking 2556 * and needs to be same as fib_hash output to avoid breaking
2566 * legacy utilities 2557 * legacy utilities
2567 */ 2558 */
2568static int fib_route_seq_show(struct seq_file *seq, void *v) 2559static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2587,7 +2578,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2587 2578
2588 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2579 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589 const struct fib_info *fi = fa->fa_info; 2580 const struct fib_info *fi = fa->fa_info;
2590 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2581 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2591 int len; 2582 int len;
2592 2583
2593 if (fa->fa_type == RTN_BROADCAST 2584 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..caea6885fdbd
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,151 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
25static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (gre_proto[version] != proto)
55 goto err_out_unlock;
56 rcu_assign_pointer(gre_proto[version], NULL);
57 spin_unlock(&gre_proto_lock);
58 synchronize_rcu();
59 return 0;
60
61err_out_unlock:
62 spin_unlock(&gre_proto_lock);
63err_out:
64 return -1;
65}
66EXPORT_SYMBOL_GPL(gre_del_protocol);
67
68static int gre_rcv(struct sk_buff *skb)
69{
70 const struct gre_protocol *proto;
71 u8 ver;
72 int ret;
73
74 if (!pskb_may_pull(skb, 12))
75 goto drop;
76
77 ver = skb->data[1]&0x7f;
78 if (ver >= GREPROTO_MAX)
79 goto drop;
80
81 rcu_read_lock();
82 proto = rcu_dereference(gre_proto[ver]);
83 if (!proto || !proto->handler)
84 goto drop_unlock;
85 ret = proto->handler(skb);
86 rcu_read_unlock();
87 return ret;
88
89drop_unlock:
90 rcu_read_unlock();
91drop:
92 kfree_skb(skb);
93 return NET_RX_DROP;
94}
95
96static void gre_err(struct sk_buff *skb, u32 info)
97{
98 const struct gre_protocol *proto;
99 u8 ver;
100
101 if (!pskb_may_pull(skb, 12))
102 goto drop;
103
104 ver = skb->data[1]&0x7f;
105 if (ver >= GREPROTO_MAX)
106 goto drop;
107
108 rcu_read_lock();
109 proto = rcu_dereference(gre_proto[ver]);
110 if (!proto || !proto->err_handler)
111 goto drop_unlock;
112 proto->err_handler(skb, info);
113 rcu_read_unlock();
114 return;
115
116drop_unlock:
117 rcu_read_unlock();
118drop:
119 kfree_skb(skb);
120}
121
122static const struct net_protocol net_gre_protocol = {
123 .handler = gre_rcv,
124 .err_handler = gre_err,
125 .netns_ok = 1,
126};
127
128static int __init gre_init(void)
129{
130 pr_info("GRE over IPv4 demultiplexor driver");
131
132 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
133 pr_err("gre: can't add protocol\n");
134 return -EAGAIN;
135 }
136
137 return 0;
138}
139
140static void __exit gre_exit(void)
141{
142 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
143}
144
145module_init(gre_init);
146module_exit(gre_exit);
147
148MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
149MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
150MODULE_LICENSE("GPL");
151
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..96bc7f9475a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
379 inet->tos = ip_hdr(skb)->tos; 379 inet->tos = ip_hdr(skb)->tos;
380 daddr = ipc.addr = rt->rt_src; 380 daddr = ipc.addr = rt->rt_src;
381 ipc.opt = NULL; 381 ipc.opt = NULL;
382 ipc.shtx.flags = 0; 382 ipc.tx_flags = 0;
383 if (icmp_param->replyopts.optlen) { 383 if (icmp_param->replyopts.optlen) {
384 ipc.opt = &icmp_param->replyopts; 384 ipc.opt = &icmp_param->replyopts;
385 if (ipc.opt->srr) 385 if (ipc.opt->srr)
@@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
538 inet_sk(sk)->tos = tos; 538 inet_sk(sk)->tos = tos;
539 ipc.addr = iph->saddr; 539 ipc.addr = iph->saddr;
540 ipc.opt = &icmp_param.replyopts; 540 ipc.opt = &icmp_param.replyopts;
541 ipc.shtx.flags = 0; 541 ipc.tx_flags = 0;
542 542
543 { 543 {
544 struct flowi fl = { 544 struct flowi fl = {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..ba8042665849 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
425 bc += op->no; 425 bc += op->no;
426 } 426 }
427 } 427 }
428 return (len == 0); 428 return len == 0;
429} 429}
430 430
431static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..168440834ade 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
116 struct ip4_create_arg *arg = a; 116 struct ip4_create_arg *arg = a;
117 117
118 qp = container_of(q, struct ipq, q); 118 qp = container_of(q, struct ipq, q);
119 return (qp->id == arg->iph->id && 119 return qp->id == arg->iph->id &&
120 qp->saddr == arg->iph->saddr && 120 qp->saddr == arg->iph->saddr &&
121 qp->daddr == arg->iph->daddr && 121 qp->daddr == arg->iph->daddr &&
122 qp->protocol == arg->iph->protocol && 122 qp->protocol == arg->iph->protocol &&
123 qp->user == arg->user); 123 qp->user == arg->user;
124} 124}
125 125
126/* Memory Tracking Functions. */ 126/* Memory Tracking Functions. */
@@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
542 /* If the first fragment is fragmented itself, we split 542 /* If the first fragment is fragmented itself, we split
543 * it to two chunks: the first with data and paged part 543 * it to two chunks: the first with data and paged part
544 * and the second, holding only fragments. */ 544 * and the second, holding only fragments. */
545 if (skb_has_frags(head)) { 545 if (skb_has_frag_list(head)) {
546 struct sk_buff *clone; 546 struct sk_buff *clone;
547 int i, plen = 0; 547 int i, plen = 0;
548 548
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 35c93e8b6a46..fbe2c473a06a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
47 48
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 50#include <net/ipv6.h>
@@ -63,13 +64,13 @@
63 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best 67 and silently drop packet when it expires. It is a good
67 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaing new variable in ALL
68 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
69 70
70 Current solution: HARD_TX_LOCK lock breaks dead loops. 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
71 72 counter, since when we enter the first ndo_xmit(), cpu migration is
72 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
73 74
74 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 129
129static int ipgre_net_id __read_mostly; 130static int ipgre_net_id __read_mostly;
130struct ipgre_net { 131struct ipgre_net {
131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
132 133
133 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
134}; 135};
@@ -158,13 +159,40 @@ struct ipgre_net {
158#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
159#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
160/* 161/*
161 * Locking : hash tables are protected by RCU and a spinlock 162 * Locking : hash tables are protected by RCU and RTNL
162 */ 163 */
163static DEFINE_SPINLOCK(ipgre_lock);
164 164
165#define for_each_ip_tunnel_rcu(start) \ 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
/*
 * Frequently updated packet/byte counters are kept per cpu; all other
 * statistics stay in the shared netdev->stats.
 */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* packets received */
	unsigned long	rx_bytes;	/* bytes received */
	unsigned long	tx_packets;	/* packets transmitted */
	unsigned long	tx_bytes;	/* bytes transmitted */
};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
195
168/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
169 197
170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
173{ 201{
174 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
175 int link = dev->ifindex; 203 int link = dev->ifindex;
176 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
177 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
289 return NULL; 317 return NULL;
290} 318}
291 319
292static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
294{ 322{
295 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
298 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
299 int prio = 0; 327 int prio = 0;
300 328
301 if (local) 329 if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
308 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
309} 337}
310 338
311static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312 struct ip_tunnel *t) 340 struct ip_tunnel *t)
313{ 341{
314 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
316 344
317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318{ 346{
319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320 348
321 spin_lock_bh(&ipgre_lock); 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322 t->next = *tp;
323 rcu_assign_pointer(*tp, t); 350 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
325} 351}
326 352
327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328{ 354{
329 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
330 356 struct ip_tunnel *iter;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
332 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
333 spin_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
334 *tp = t->next; 360 tp = &iter->next) {
335 spin_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
336 break; 363 break;
337 } 364 }
338 } 365 }
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
346 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
348 int link = parms->link; 375 int link = parms->link;
349 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 379
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
353 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
360 return t; 390 return t;
361} 391}
362 392
363static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
365{ 395{
366 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 612 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583 iph->saddr, iph->daddr, key, 613 iph->saddr, iph->daddr, key,
584 gre_proto))) { 614 gre_proto))) {
585 struct net_device_stats *stats = &tunnel->dev->stats; 615 struct pcpu_tstats *tstats;
586 616
587 secpath_reset(skb); 617 secpath_reset(skb);
588 618
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
606 /* Looped back packet, drop it! */ 636 /* Looped back packet, drop it! */
607 if (skb_rtable(skb)->fl.iif == 0) 637 if (skb_rtable(skb)->fl.iif == 0)
608 goto drop; 638 goto drop;
609 stats->multicast++; 639 tunnel->dev->stats.multicast++;
610 skb->pkt_type = PACKET_BROADCAST; 640 skb->pkt_type = PACKET_BROADCAST;
611 } 641 }
612#endif 642#endif
613 643
614 if (((flags&GRE_CSUM) && csum) || 644 if (((flags&GRE_CSUM) && csum) ||
615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 645 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616 stats->rx_crc_errors++; 646 tunnel->dev->stats.rx_crc_errors++;
617 stats->rx_errors++; 647 tunnel->dev->stats.rx_errors++;
618 goto drop; 648 goto drop;
619 } 649 }
620 if (tunnel->parms.i_flags&GRE_SEQ) { 650 if (tunnel->parms.i_flags&GRE_SEQ) {
621 if (!(flags&GRE_SEQ) || 651 if (!(flags&GRE_SEQ) ||
622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 652 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623 stats->rx_fifo_errors++; 653 tunnel->dev->stats.rx_fifo_errors++;
624 stats->rx_errors++; 654 tunnel->dev->stats.rx_errors++;
625 goto drop; 655 goto drop;
626 } 656 }
627 tunnel->i_seqno = seqno + 1; 657 tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
630 /* Warning: All skb pointers will be invalidated! */ 660 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 661 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 662 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 663 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 664 tunnel->dev->stats.rx_errors++;
635 goto drop; 665 goto drop;
636 } 666 }
637 667
@@ -640,14 +670,20 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 670 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 671 }
642 672
643 skb_tunnel_rx(skb, tunnel->dev); 673 tstats = this_cpu_ptr(tunnel->dev->tstats);
674 tstats->rx_packets++;
675 tstats->rx_bytes += skb->len;
676
677 __skb_tunnel_rx(skb, tunnel->dev);
644 678
645 skb_reset_network_header(skb); 679 skb_reset_network_header(skb);
646 ipgre_ecn_decapsulate(iph, skb); 680 ipgre_ecn_decapsulate(iph, skb);
647 681
648 netif_rx(skb); 682 if (netif_rx(skb) == NET_RX_DROP)
683 tunnel->dev->stats.rx_dropped++;
684
649 rcu_read_unlock(); 685 rcu_read_unlock();
650 return(0); 686 return 0;
651 } 687 }
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 688 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653 689
@@ -655,20 +691,19 @@ drop:
655 rcu_read_unlock(); 691 rcu_read_unlock();
656drop_nolock: 692drop_nolock:
657 kfree_skb(skb); 693 kfree_skb(skb);
658 return(0); 694 return 0;
659} 695}
660 696
661static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 697static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662{ 698{
663 struct ip_tunnel *tunnel = netdev_priv(dev); 699 struct ip_tunnel *tunnel = netdev_priv(dev);
664 struct net_device_stats *stats = &dev->stats; 700 struct pcpu_tstats *tstats;
665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
666 struct iphdr *old_iph = ip_hdr(skb); 701 struct iphdr *old_iph = ip_hdr(skb);
667 struct iphdr *tiph; 702 struct iphdr *tiph;
668 u8 tos; 703 u8 tos;
669 __be16 df; 704 __be16 df;
670 struct rtable *rt; /* Route to the other host */ 705 struct rtable *rt; /* Route to the other host */
671 struct net_device *tdev; /* Device to other host */ 706 struct net_device *tdev; /* Device to other host */
672 struct iphdr *iph; /* Our new IP header */ 707 struct iphdr *iph; /* Our new IP header */
673 unsigned int max_headroom; /* The extra header space needed */ 708 unsigned int max_headroom; /* The extra header space needed */
674 int gre_hlen; 709 int gre_hlen;
@@ -690,7 +725,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
690 /* NBMA tunnel */ 725 /* NBMA tunnel */
691 726
692 if (skb_dst(skb) == NULL) { 727 if (skb_dst(skb) == NULL) {
693 stats->tx_fifo_errors++; 728 dev->stats.tx_fifo_errors++;
694 goto tx_error; 729 goto tx_error;
695 } 730 }
696 731
@@ -736,14 +771,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
736 } 771 }
737 772
738 { 773 {
739 struct flowi fl = { .oif = tunnel->parms.link, 774 struct flowi fl = {
740 .nl_u = { .ip4_u = 775 .oif = tunnel->parms.link,
741 { .daddr = dst, 776 .nl_u = {
742 .saddr = tiph->saddr, 777 .ip4_u = {
743 .tos = RT_TOS(tos) } }, 778 .daddr = dst,
744 .proto = IPPROTO_GRE }; 779 .saddr = tiph->saddr,
780 .tos = RT_TOS(tos)
781 }
782 },
783 .proto = IPPROTO_GRE
784 }
785;
745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 786 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746 stats->tx_carrier_errors++; 787 dev->stats.tx_carrier_errors++;
747 goto tx_error; 788 goto tx_error;
748 } 789 }
749 } 790 }
@@ -751,7 +792,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
751 792
752 if (tdev == dev) { 793 if (tdev == dev) {
753 ip_rt_put(rt); 794 ip_rt_put(rt);
754 stats->collisions++; 795 dev->stats.collisions++;
755 goto tx_error; 796 goto tx_error;
756 } 797 }
757 798
@@ -814,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
814 dev->needed_headroom = max_headroom; 855 dev->needed_headroom = max_headroom;
815 if (!new_skb) { 856 if (!new_skb) {
816 ip_rt_put(rt); 857 ip_rt_put(rt);
817 txq->tx_dropped++; 858 dev->stats.tx_dropped++;
818 dev_kfree_skb(skb); 859 dev_kfree_skb(skb);
819 return NETDEV_TX_OK; 860 return NETDEV_TX_OK;
820 } 861 }
@@ -881,15 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
881 } 922 }
882 923
883 nf_reset(skb); 924 nf_reset(skb);
884 925 tstats = this_cpu_ptr(dev->tstats);
885 IPTUNNEL_XMIT(); 926 __IPTUNNEL_XMIT(tstats, &dev->stats);
886 return NETDEV_TX_OK; 927 return NETDEV_TX_OK;
887 928
888tx_error_icmp: 929tx_error_icmp:
889 dst_link_failure(skb); 930 dst_link_failure(skb);
890 931
891tx_error: 932tx_error:
892 stats->tx_errors++; 933 dev->stats.tx_errors++;
893 dev_kfree_skb(skb); 934 dev_kfree_skb(skb);
894 return NETDEV_TX_OK; 935 return NETDEV_TX_OK;
895} 936}
@@ -909,13 +950,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
909 /* Guess output device to choose reasonable mtu and needed_headroom */ 950 /* Guess output device to choose reasonable mtu and needed_headroom */
910 951
911 if (iph->daddr) { 952 if (iph->daddr) {
912 struct flowi fl = { .oif = tunnel->parms.link, 953 struct flowi fl = {
913 .nl_u = { .ip4_u = 954 .oif = tunnel->parms.link,
914 { .daddr = iph->daddr, 955 .nl_u = {
915 .saddr = iph->saddr, 956 .ip4_u = {
916 .tos = RT_TOS(iph->tos) } }, 957 .daddr = iph->daddr,
917 .proto = IPPROTO_GRE }; 958 .saddr = iph->saddr,
959 .tos = RT_TOS(iph->tos)
960 }
961 },
962 .proto = IPPROTO_GRE
963 };
918 struct rtable *rt; 964 struct rtable *rt;
965
919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 966 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
920 tdev = rt->dst.dev; 967 tdev = rt->dst.dev;
921 ip_rt_put(rt); 968 ip_rt_put(rt);
@@ -1012,7 +1059,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1012 break; 1059 break;
1013 } 1060 }
1014 } else { 1061 } else {
1015 unsigned nflags = 0; 1062 unsigned int nflags = 0;
1016 1063
1017 t = netdev_priv(dev); 1064 t = netdev_priv(dev);
1018 1065
@@ -1125,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1125 1172
1126static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1173static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127 unsigned short type, 1174 unsigned short type,
1128 const void *daddr, const void *saddr, unsigned len) 1175 const void *daddr, const void *saddr, unsigned int len)
1129{ 1176{
1130 struct ip_tunnel *t = netdev_priv(dev); 1177 struct ip_tunnel *t = netdev_priv(dev);
1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1178 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1167,13 +1214,19 @@ static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev); 1214 struct ip_tunnel *t = netdev_priv(dev);
1168 1215
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1216 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link, 1217 struct flowi fl = {
1171 .nl_u = { .ip4_u = 1218 .oif = t->parms.link,
1172 { .daddr = t->parms.iph.daddr, 1219 .nl_u = {
1173 .saddr = t->parms.iph.saddr, 1220 .ip4_u = {
1174 .tos = RT_TOS(t->parms.iph.tos) } }, 1221 .daddr = t->parms.iph.daddr,
1175 .proto = IPPROTO_GRE }; 1222 .saddr = t->parms.iph.saddr,
1223 .tos = RT_TOS(t->parms.iph.tos)
1224 }
1225 },
1226 .proto = IPPROTO_GRE
1227 };
1176 struct rtable *rt; 1228 struct rtable *rt;
1229
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1230 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1178 return -EADDRNOTAVAIL; 1231 return -EADDRNOTAVAIL;
1179 dev = rt->dst.dev; 1232 dev = rt->dst.dev;
@@ -1213,12 +1266,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1266 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1267 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1268 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1269 .ndo_get_stats = ipgre_get_stats,
1216}; 1270};
1217 1271
1272static void ipgre_dev_free(struct net_device *dev)
1273{
1274 free_percpu(dev->tstats);
1275 free_netdev(dev);
1276}
1277
1218static void ipgre_tunnel_setup(struct net_device *dev) 1278static void ipgre_tunnel_setup(struct net_device *dev)
1219{ 1279{
1220 dev->netdev_ops = &ipgre_netdev_ops; 1280 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev; 1281 dev->destructor = ipgre_dev_free;
1222 1282
1223 dev->type = ARPHRD_IPGRE; 1283 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1284 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1316,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1256 } else 1316 } else
1257 dev->header_ops = &ipgre_header_ops; 1317 dev->header_ops = &ipgre_header_ops;
1258 1318
1319 dev->tstats = alloc_percpu(struct pcpu_tstats);
1320 if (!dev->tstats)
1321 return -ENOMEM;
1322
1259 return 0; 1323 return 0;
1260} 1324}
1261 1325
@@ -1274,14 +1338,13 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1338 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 1339
1276 dev_hold(dev); 1340 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel; 1341 rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
1278} 1342}
1279 1343
1280 1344
1281static const struct net_protocol ipgre_protocol = { 1345static const struct gre_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv, 1346 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err, 1347 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1285}; 1348};
1286 1349
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1350static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1354,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) { 1354 for (prio = 0; prio < 4; prio++) {
1292 int h; 1355 int h;
1293 for (h = 0; h < HASH_SIZE; h++) { 1356 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1357 struct ip_tunnel *t;
1358
1359 t = rtnl_dereference(ign->tunnels[prio][h]);
1295 1360
1296 while (t != NULL) { 1361 while (t != NULL) {
1297 unregister_netdevice_queue(t->dev, head); 1362 unregister_netdevice_queue(t->dev, head);
1298 t = t->next; 1363 t = rtnl_dereference(t->next);
1299 } 1364 }
1300 } 1365 }
1301 } 1366 }
@@ -1441,6 +1506,10 @@ static int ipgre_tap_init(struct net_device *dev)
1441 1506
1442 ipgre_tunnel_bind_dev(dev); 1507 ipgre_tunnel_bind_dev(dev);
1443 1508
1509 dev->tstats = alloc_percpu(struct pcpu_tstats);
1510 if (!dev->tstats)
1511 return -ENOMEM;
1512
1444 return 0; 1513 return 0;
1445} 1514}
1446 1515
@@ -1451,6 +1520,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_set_mac_address = eth_mac_addr, 1520 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr, 1521 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1522 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1523 .ndo_get_stats = ipgre_get_stats,
1454}; 1524};
1455 1525
1456static void ipgre_tap_setup(struct net_device *dev) 1526static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1529,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1459 ether_setup(dev); 1529 ether_setup(dev);
1460 1530
1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1531 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev; 1532 dev->destructor = ipgre_dev_free;
1463 1533
1464 dev->iflink = 0; 1534 dev->iflink = 0;
1465 dev->features |= NETIF_F_NETNS_LOCAL; 1535 dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1557,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1487 if (!tb[IFLA_MTU]) 1557 if (!tb[IFLA_MTU])
1488 dev->mtu = mtu; 1558 dev->mtu = mtu;
1489 1559
1560 /* Can use a lockless transmit, unless we generate output sequences */
1561 if (!(nt->parms.o_flags & GRE_SEQ))
1562 dev->features |= NETIF_F_LLTX;
1563
1490 err = register_netdevice(dev); 1564 err = register_netdevice(dev);
1491 if (err) 1565 if (err)
1492 goto out; 1566 goto out;
@@ -1522,7 +1596,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1522 t = nt; 1596 t = nt;
1523 1597
1524 if (dev->type != ARPHRD_ETHER) { 1598 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0; 1599 unsigned int nflags = 0;
1526 1600
1527 if (ipv4_is_multicast(p.iph.daddr)) 1601 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST; 1602 nflags = IFF_BROADCAST;
@@ -1663,7 +1737,7 @@ static int __init ipgre_init(void)
1663 if (err < 0) 1737 if (err < 0)
1664 return err; 1738 return err;
1665 1739
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1740 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1667 if (err < 0) { 1741 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1742 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed; 1743 goto add_proto_failed;
@@ -1683,7 +1757,7 @@ out:
1683tap_ops_failed: 1757tap_ops_failed:
1684 rtnl_link_unregister(&ipgre_link_ops); 1758 rtnl_link_unregister(&ipgre_link_ops);
1685rtnl_link_failed: 1759rtnl_link_failed:
1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1760 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1687add_proto_failed: 1761add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops); 1762 unregister_pernet_device(&ipgre_net_ops);
1689 goto out; 1763 goto out;
@@ -1693,7 +1767,7 @@ static void __exit ipgre_fini(void)
1693{ 1767{
1694 rtnl_link_unregister(&ipgre_tap_ops); 1768 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops); 1769 rtnl_link_unregister(&ipgre_link_ops);
1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1770 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1771 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops); 1772 unregister_pernet_device(&ipgre_net_ops);
1699} 1773}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
466 } 466 }
467 return -EINVAL; 467 return -EINVAL;
468} 468}
469 469EXPORT_SYMBOL(ip_options_compile);
470 470
471/* 471/*
472 * Undo all the changes done by ip_options_compile(). 472 * Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
646 } 646 }
647 return 0; 647 return 0;
648} 648}
649EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7649d7750075..439d2a34ee44 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 487 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
489 */ 489 */
490 if (skb_has_frags(skb)) { 490 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag, *frag2; 491 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
493 493
@@ -844,10 +844,9 @@ int ip_append_data(struct sock *sk,
844 inet->cork.length = 0; 844 inet->cork.length = 0;
845 sk->sk_sndmsg_page = NULL; 845 sk->sk_sndmsg_page = NULL;
846 sk->sk_sndmsg_off = 0; 846 sk->sk_sndmsg_off = 0;
847 if ((exthdrlen = rt->dst.header_len) != 0) { 847 exthdrlen = rt->dst.header_len;
848 length += exthdrlen; 848 length += exthdrlen;
849 transhdrlen += exthdrlen; 849 transhdrlen += exthdrlen;
850 }
851 } else { 850 } else {
852 rt = (struct rtable *)inet->cork.dst; 851 rt = (struct rtable *)inet->cork.dst;
853 if (inet->cork.flags & IPCORK_OPT) 852 if (inet->cork.flags & IPCORK_OPT)
@@ -934,16 +933,19 @@ alloc_new_skb:
934 !(rt->dst.dev->features&NETIF_F_SG)) 933 !(rt->dst.dev->features&NETIF_F_SG))
935 alloclen = mtu; 934 alloclen = mtu;
936 else 935 else
937 alloclen = datalen + fragheaderlen; 936 alloclen = fraglen;
938 937
939 /* The last fragment gets additional space at tail. 938 /* The last fragment gets additional space at tail.
940 * Note, with MSG_MORE we overallocate on fragments, 939 * Note, with MSG_MORE we overallocate on fragments,
941 * because we have no idea what fragment will be 940 * because we have no idea what fragment will be
942 * the last. 941 * the last.
943 */ 942 */
944 if (datalen == length + fraggap) 943 if (datalen == length + fraggap) {
945 alloclen += rt->dst.trailer_len; 944 alloclen += rt->dst.trailer_len;
946 945 /* make sure mtu is not reached */
946 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
947 datalen -= ALIGN(rt->dst.trailer_len, 8);
948 }
947 if (transhdrlen) { 949 if (transhdrlen) {
948 skb = sock_alloc_send_skb(sk, 950 skb = sock_alloc_send_skb(sk,
949 alloclen + hh_len + 15, 951 alloclen + hh_len + 15,
@@ -960,7 +962,7 @@ alloc_new_skb:
960 else 962 else
961 /* only the initial fragment is 963 /* only the initial fragment is
962 time stamped */ 964 time stamped */
963 ipc->shtx.flags = 0; 965 ipc->tx_flags = 0;
964 } 966 }
965 if (skb == NULL) 967 if (skb == NULL)
966 goto error; 968 goto error;
@@ -971,7 +973,7 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 973 skb->ip_summed = csummode;
972 skb->csum = 0; 974 skb->csum = 0;
973 skb_reserve(skb, hh_len); 975 skb_reserve(skb, hh_len);
974 *skb_tx(skb) = ipc->shtx; 976 skb_shinfo(skb)->tx_flags = ipc->tx_flags;
975 977
976 /* 978 /*
977 * Find where to start putting bytes. 979 * Find where to start putting bytes.
@@ -1391,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1391 1393
1392 daddr = ipc.addr = rt->rt_src; 1394 daddr = ipc.addr = rt->rt_src;
1393 ipc.opt = NULL; 1395 ipc.opt = NULL;
1394 ipc.shtx.flags = 0; 1396 ipc.tx_flags = 0;
1395 1397
1396 if (replyopts.opt.optlen) { 1398 if (replyopts.opt.optlen) {
1397 ipc.opt = &replyopts.opt; 1399 ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..6ad46c28ede2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
122 122
123static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
124struct ipip_net { 124struct ipip_net {
125 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
129 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
130 130
131 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
132}; 132};
133 133
134static void ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137/* 138/*
138 * Locking : hash tables are protected by RCU and a spinlock 139 * Locking : hash tables are protected by RCU and RTNL
139 */ 140 */
140static DEFINE_SPINLOCK(ipip_lock);
141 141
142#define for_each_ip_tunnel_rcu(start) \ 142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
172
145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
146 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
147{ 175{
148 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
149 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
150 struct ip_tunnel *t; 178 struct ip_tunnel *t;
151 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
152 180
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
169 return NULL; 197 return NULL;
170} 198}
171 199
172static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
173 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
174{ 202{
175 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
176 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
177 unsigned h = 0; 205 unsigned int h = 0;
178 int prio = 0; 206 int prio = 0;
179 207
180 if (remote) { 208 if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
188 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
189} 217}
190 218
191static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
192 struct ip_tunnel *t) 220 struct ip_tunnel *t)
193{ 221{
194 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
196 224
197static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
198{ 226{
199 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
200 228 struct ip_tunnel *iter;
201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
202 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
203 spin_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
204 *tp = t->next; 232 tp = &iter->next) {
205 spin_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
206 break; 235 break;
207 } 236 }
208 } 237 }
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
210 239
211static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
212{ 241{
213 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
214 243
215 spin_lock_bh(&ipip_lock); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
216 t->next = *tp;
217 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
218 spin_unlock_bh(&ipip_lock);
219} 246}
220 247
221static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
223{ 250{
224 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
225 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
226 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
227 struct net_device *dev; 255 struct net_device *dev;
228 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
229 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
230 258
231 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
232 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
233 return t; 263 return t;
234 } 264 }
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
238 if (parms->name[0]) 268 if (parms->name[0])
239 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
240 else 270 else
241 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
242 272
243 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
244 if (dev == NULL) 274 if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
254 nt = netdev_priv(dev); 284 nt = netdev_priv(dev);
255 nt->parms = *parms; 285 nt->parms = *parms;
256 286
257 ipip_tunnel_init(dev); 287 if (ipip_tunnel_init(dev) < 0)
288 goto failed_free;
258 289
259 if (register_netdevice(dev) < 0) 290 if (register_netdevice(dev) < 0)
260 goto failed_free; 291 goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
264 return nt; 295 return nt;
265 296
266failed_free: 297failed_free:
267 free_netdev(dev); 298 ipip_dev_free(dev);
268 return NULL; 299 return NULL;
269} 300}
270 301
302/* called with RTNL */
271static void ipip_tunnel_uninit(struct net_device *dev) 303static void ipip_tunnel_uninit(struct net_device *dev)
272{ 304{
273 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
274 struct ipip_net *ipn = net_generic(net, ipip_net_id); 306 struct ipip_net *ipn = net_generic(net, ipip_net_id);
275 307
276 if (dev == ipn->fb_tunnel_dev) { 308 if (dev == ipn->fb_tunnel_dev)
277 spin_lock_bh(&ipip_lock); 309 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
278 ipn->tunnels_wc[0] = NULL; 310 else
279 spin_unlock_bh(&ipip_lock);
280 } else
281 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 311 ipip_tunnel_unlink(ipn, netdev_priv(dev));
282 dev_put(dev); 312 dev_put(dev);
283} 313}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
359 const struct iphdr *iph = ip_hdr(skb); 389 const struct iphdr *iph = ip_hdr(skb);
360 390
361 rcu_read_lock(); 391 rcu_read_lock();
362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 392 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
363 iph->saddr, iph->daddr)) != NULL) { 393 if (tunnel != NULL) {
394 struct pcpu_tstats *tstats;
395
364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 396 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
365 rcu_read_unlock(); 397 rcu_read_unlock();
366 kfree_skb(skb); 398 kfree_skb(skb);
@@ -374,10 +406,17 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 406 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 407 skb->pkt_type = PACKET_HOST;
376 408
377 skb_tunnel_rx(skb, tunnel->dev); 409 tstats = this_cpu_ptr(tunnel->dev->tstats);
410 tstats->rx_packets++;
411 tstats->rx_bytes += skb->len;
412
413 __skb_tunnel_rx(skb, tunnel->dev);
378 414
379 ipip_ecn_decapsulate(iph, skb); 415 ipip_ecn_decapsulate(iph, skb);
380 netif_rx(skb); 416
417 if (netif_rx(skb) == NET_RX_DROP)
418 tunnel->dev->stats.rx_dropped++;
419
381 rcu_read_unlock(); 420 rcu_read_unlock();
382 return 0; 421 return 0;
383 } 422 }
@@ -394,13 +433,12 @@ static int ipip_rcv(struct sk_buff *skb)
394static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 433static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
395{ 434{
396 struct ip_tunnel *tunnel = netdev_priv(dev); 435 struct ip_tunnel *tunnel = netdev_priv(dev);
397 struct net_device_stats *stats = &dev->stats; 436 struct pcpu_tstats *tstats;
398 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
399 struct iphdr *tiph = &tunnel->parms.iph; 437 struct iphdr *tiph = &tunnel->parms.iph;
400 u8 tos = tunnel->parms.iph.tos; 438 u8 tos = tunnel->parms.iph.tos;
401 __be16 df = tiph->frag_off; 439 __be16 df = tiph->frag_off;
402 struct rtable *rt; /* Route to the other host */ 440 struct rtable *rt; /* Route to the other host */
403 struct net_device *tdev; /* Device to other host */ 441 struct net_device *tdev; /* Device to other host */
404 struct iphdr *old_iph = ip_hdr(skb); 442 struct iphdr *old_iph = ip_hdr(skb);
405 struct iphdr *iph; /* Our new IP header */ 443 struct iphdr *iph; /* Our new IP header */
406 unsigned int max_headroom; /* The extra header space needed */ 444 unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +448,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
410 if (skb->protocol != htons(ETH_P_IP)) 448 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 449 goto tx_error;
412 450
413 if (tos&1) 451 if (tos & 1)
414 tos = old_iph->tos; 452 tos = old_iph->tos;
415 453
416 if (!dst) { 454 if (!dst) {
417 /* NBMA tunnel */ 455 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 456 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 457 dev->stats.tx_fifo_errors++;
420 goto tx_error; 458 goto tx_error;
421 } 459 }
422 if ((dst = rt->rt_gateway) == 0) 460 if ((dst = rt->rt_gateway) == 0)
@@ -424,14 +462,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
424 } 462 }
425 463
426 { 464 {
427 struct flowi fl = { .oif = tunnel->parms.link, 465 struct flowi fl = {
428 .nl_u = { .ip4_u = 466 .oif = tunnel->parms.link,
429 { .daddr = dst, 467 .nl_u = {
430 .saddr = tiph->saddr, 468 .ip4_u = {
431 .tos = RT_TOS(tos) } }, 469 .daddr = dst,
432 .proto = IPPROTO_IPIP }; 470 .saddr = tiph->saddr,
471 .tos = RT_TOS(tos)
472 }
473 },
474 .proto = IPPROTO_IPIP
475 };
476
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 477 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
434 stats->tx_carrier_errors++; 478 dev->stats.tx_carrier_errors++;
435 goto tx_error_icmp; 479 goto tx_error_icmp;
436 } 480 }
437 } 481 }
@@ -439,7 +483,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
439 483
440 if (tdev == dev) { 484 if (tdev == dev) {
441 ip_rt_put(rt); 485 ip_rt_put(rt);
442 stats->collisions++; 486 dev->stats.collisions++;
443 goto tx_error; 487 goto tx_error;
444 } 488 }
445 489
@@ -449,7 +493,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 493 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 494
451 if (mtu < 68) { 495 if (mtu < 68) {
452 stats->collisions++; 496 dev->stats.collisions++;
453 ip_rt_put(rt); 497 ip_rt_put(rt);
454 goto tx_error; 498 goto tx_error;
455 } 499 }
@@ -485,7 +529,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
485 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 529 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
486 if (!new_skb) { 530 if (!new_skb) {
487 ip_rt_put(rt); 531 ip_rt_put(rt);
488 txq->tx_dropped++; 532 dev->stats.tx_dropped++;
489 dev_kfree_skb(skb); 533 dev_kfree_skb(skb);
490 return NETDEV_TX_OK; 534 return NETDEV_TX_OK;
491 } 535 }
@@ -522,14 +566,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
522 iph->ttl = old_iph->ttl; 566 iph->ttl = old_iph->ttl;
523 567
524 nf_reset(skb); 568 nf_reset(skb);
525 569 tstats = this_cpu_ptr(dev->tstats);
526 IPTUNNEL_XMIT(); 570 __IPTUNNEL_XMIT(tstats, &dev->stats);
527 return NETDEV_TX_OK; 571 return NETDEV_TX_OK;
528 572
529tx_error_icmp: 573tx_error_icmp:
530 dst_link_failure(skb); 574 dst_link_failure(skb);
531tx_error: 575tx_error:
532 stats->tx_errors++; 576 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 577 dev_kfree_skb(skb);
534 return NETDEV_TX_OK; 578 return NETDEV_TX_OK;
535} 579}
@@ -544,13 +588,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
544 iph = &tunnel->parms.iph; 588 iph = &tunnel->parms.iph;
545 589
546 if (iph->daddr) { 590 if (iph->daddr) {
547 struct flowi fl = { .oif = tunnel->parms.link, 591 struct flowi fl = {
548 .nl_u = { .ip4_u = 592 .oif = tunnel->parms.link,
549 { .daddr = iph->daddr, 593 .nl_u = {
550 .saddr = iph->saddr, 594 .ip4_u = {
551 .tos = RT_TOS(iph->tos) } }, 595 .daddr = iph->daddr,
552 .proto = IPPROTO_IPIP }; 596 .saddr = iph->saddr,
597 .tos = RT_TOS(iph->tos)
598 }
599 },
600 .proto = IPPROTO_IPIP
601 };
553 struct rtable *rt; 602 struct rtable *rt;
603
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 604 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
555 tdev = rt->dst.dev; 605 tdev = rt->dst.dev;
556 ip_rt_put(rt); 606 ip_rt_put(rt);
@@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_start_xmit = ipip_tunnel_xmit, 746 .ndo_start_xmit = ipip_tunnel_xmit,
697 .ndo_do_ioctl = ipip_tunnel_ioctl, 747 .ndo_do_ioctl = ipip_tunnel_ioctl,
698 .ndo_change_mtu = ipip_tunnel_change_mtu, 748 .ndo_change_mtu = ipip_tunnel_change_mtu,
699 749 .ndo_get_stats = ipip_get_stats,
700}; 750};
701 751
752static void ipip_dev_free(struct net_device *dev)
753{
754 free_percpu(dev->tstats);
755 free_netdev(dev);
756}
757
702static void ipip_tunnel_setup(struct net_device *dev) 758static void ipip_tunnel_setup(struct net_device *dev)
703{ 759{
704 dev->netdev_ops = &ipip_netdev_ops; 760 dev->netdev_ops = &ipip_netdev_ops;
705 dev->destructor = free_netdev; 761 dev->destructor = ipip_dev_free;
706 762
707 dev->type = ARPHRD_TUNNEL; 763 dev->type = ARPHRD_TUNNEL;
708 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 764 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
711 dev->iflink = 0; 767 dev->iflink = 0;
712 dev->addr_len = 4; 768 dev->addr_len = 4;
713 dev->features |= NETIF_F_NETNS_LOCAL; 769 dev->features |= NETIF_F_NETNS_LOCAL;
770 dev->features |= NETIF_F_LLTX;
714 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 771 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
715} 772}
716 773
717static void ipip_tunnel_init(struct net_device *dev) 774static int ipip_tunnel_init(struct net_device *dev)
718{ 775{
719 struct ip_tunnel *tunnel = netdev_priv(dev); 776 struct ip_tunnel *tunnel = netdev_priv(dev);
720 777
@@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev)
725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 782 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
726 783
727 ipip_tunnel_bind_dev(dev); 784 ipip_tunnel_bind_dev(dev);
785
786 dev->tstats = alloc_percpu(struct pcpu_tstats);
787 if (!dev->tstats)
788 return -ENOMEM;
789
790 return 0;
728} 791}
729 792
730static void __net_init ipip_fb_tunnel_init(struct net_device *dev) 793static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
731{ 794{
732 struct ip_tunnel *tunnel = netdev_priv(dev); 795 struct ip_tunnel *tunnel = netdev_priv(dev);
733 struct iphdr *iph = &tunnel->parms.iph; 796 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
740 iph->protocol = IPPROTO_IPIP; 803 iph->protocol = IPPROTO_IPIP;
741 iph->ihl = 5; 804 iph->ihl = 5;
742 805
806 dev->tstats = alloc_percpu(struct pcpu_tstats);
807 if (!dev->tstats)
808 return -ENOMEM;
809
743 dev_hold(dev); 810 dev_hold(dev);
744 ipn->tunnels_wc[0] = tunnel; 811 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
812 return 0;
745} 813}
746 814
747static struct xfrm_tunnel ipip_handler = { 815static struct xfrm_tunnel ipip_handler __read_mostly = {
748 .handler = ipip_rcv, 816 .handler = ipip_rcv,
749 .err_handler = ipip_err, 817 .err_handler = ipip_err,
750 .priority = 1, 818 .priority = 1,
@@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
760 for (prio = 1; prio < 4; prio++) { 828 for (prio = 1; prio < 4; prio++) {
761 int h; 829 int h;
762 for (h = 0; h < HASH_SIZE; h++) { 830 for (h = 0; h < HASH_SIZE; h++) {
763 struct ip_tunnel *t = ipn->tunnels[prio][h]; 831 struct ip_tunnel *t;
764 832
833 t = rtnl_dereference(ipn->tunnels[prio][h]);
765 while (t != NULL) { 834 while (t != NULL) {
766 unregister_netdevice_queue(t->dev, head); 835 unregister_netdevice_queue(t->dev, head);
767 t = t->next; 836 t = rtnl_dereference(t->next);
768 } 837 }
769 } 838 }
770 } 839 }
@@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net)
789 } 858 }
790 dev_net_set(ipn->fb_tunnel_dev, net); 859 dev_net_set(ipn->fb_tunnel_dev, net);
791 860
792 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 861 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
862 if (err)
863 goto err_reg_dev;
793 864
794 if ((err = register_netdev(ipn->fb_tunnel_dev))) 865 if ((err = register_netdev(ipn->fb_tunnel_dev)))
795 goto err_reg_dev; 866 goto err_reg_dev;
@@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net)
797 return 0; 868 return 0;
798 869
799err_reg_dev: 870err_reg_dev:
800 free_netdev(ipn->fb_tunnel_dev); 871 ipip_dev_free(ipn->fb_tunnel_dev);
801err_alloc_dev: 872err_alloc_dev:
802 /* nothing */ 873 /* nothing */
803 return err; 874 return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..86dd5691af46 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
75 struct net *net; 75 struct net *net;
76#endif 76#endif
77 u32 id; 77 u32 id;
78 struct sock *mroute_sk; 78 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer; 79 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue; 80 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES]; 81 struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
98}; 98};
99 99
100/* Big lock, protecting vif table, mrt cache and mroute socket state. 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
101 Note that the changes are semaphored via rtnl_lock. 101 * Note that the changes are semaphored via rtnl_lock.
102 */ 102 */
103 103
104static DEFINE_RWLOCK(mrt_lock); 104static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
113static DEFINE_SPINLOCK(mfc_unres_lock); 113static DEFINE_SPINLOCK(mfc_unres_lock);
114 114
115/* We return to original Alan's scheme. Hash table of resolved 115/* We return to original Alan's scheme. Hash table of resolved
116 entries is changed only in process context and protected 116 * entries is changed only in process context and protected
117 with weak lock mrt_lock. Queue of unresolved entries is protected 117 * with weak lock mrt_lock. Queue of unresolved entries is protected
118 with strong spinlock mfc_unres_lock. 118 * with strong spinlock mfc_unres_lock.
119 119 *
120 In this case data path is free of exclusive locks at all. 120 * In this case data path is free of exclusive locks at all.
121 */ 121 */
122 122
123static struct kmem_cache *mrt_cachep __read_mostly; 123static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
396 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 set_fs(oldfs); 398 set_fs(oldfs);
399 } else 399 } else {
400 err = -EOPNOTSUPP; 400 err = -EOPNOTSUPP;
401 401 }
402 dev = NULL; 402 dev = NULL;
403 403
404 if (err == 0 && 404 if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
495 dev->iflink = 0; 495 dev->iflink = 0;
496 496
497 rcu_read_lock(); 497 rcu_read_lock();
498 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 498 in_dev = __in_dev_get_rcu(dev);
499 if (!in_dev) {
499 rcu_read_unlock(); 500 rcu_read_unlock();
500 goto failure; 501 goto failure;
501 } 502 }
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
552 mrt->mroute_reg_vif_num = -1; 553 mrt->mroute_reg_vif_num = -1;
553#endif 554#endif
554 555
555 if (vifi+1 == mrt->maxvif) { 556 if (vifi + 1 == mrt->maxvif) {
556 int tmp; 557 int tmp;
557 for (tmp=vifi-1; tmp>=0; tmp--) { 558
559 for (tmp = vifi - 1; tmp >= 0; tmp--) {
558 if (VIF_EXISTS(mrt, tmp)) 560 if (VIF_EXISTS(mrt, tmp))
559 break; 561 break;
560 } 562 }
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
565 567
566 dev_set_allmulti(dev, -1); 568 dev_set_allmulti(dev, -1);
567 569
568 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 570 in_dev = __in_dev_get_rtnl(dev);
571 if (in_dev) {
569 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 572 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 ip_rt_multicast_event(in_dev); 573 ip_rt_multicast_event(in_dev);
571 } 574 }
572 575
573 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 576 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
574 unregister_netdevice_queue(dev, head); 577 unregister_netdevice_queue(dev, head);
575 578
576 dev_put(dev); 579 dev_put(dev);
577 return 0; 580 return 0;
578} 581}
579 582
580static inline void ipmr_cache_free(struct mfc_cache *c) 583static void ipmr_cache_free_rcu(struct rcu_head *head)
581{ 584{
585 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
586
582 kmem_cache_free(mrt_cachep, c); 587 kmem_cache_free(mrt_cachep, c);
583} 588}
584 589
590static inline void ipmr_cache_free(struct mfc_cache *c)
591{
592 call_rcu(&c->rcu, ipmr_cache_free_rcu);
593}
594
585/* Destroy an unresolved cache entry, killing queued skbs 595/* Destroy an unresolved cache entry, killing queued skbs
586 and reporting error to netlink readers. 596 * and reporting error to netlink readers.
587 */ 597 */
588 598
589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) 599static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
605 memset(&e->msg, 0, sizeof(e->msg)); 615 memset(&e->msg, 0, sizeof(e->msg));
606 616
607 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 617 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 } else 618 } else {
609 kfree_skb(skb); 619 kfree_skb(skb);
620 }
610 } 621 }
611 622
612 ipmr_cache_free(c); 623 ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
724 case 0: 735 case 0:
725 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 736 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 737 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 if (dev && dev->ip_ptr == NULL) { 738 if (dev && __in_dev_get_rtnl(dev) == NULL) {
728 dev_put(dev); 739 dev_put(dev);
729 return -EADDRNOTAVAIL; 740 return -EADDRNOTAVAIL;
730 } 741 }
731 } else 742 } else {
732 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 743 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 744 }
734 if (!dev) 745 if (!dev)
735 return -EADDRNOTAVAIL; 746 return -EADDRNOTAVAIL;
736 err = dev_set_allmulti(dev, 1); 747 err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
743 return -EINVAL; 754 return -EINVAL;
744 } 755 }
745 756
746 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { 757 in_dev = __in_dev_get_rtnl(dev);
758 if (!in_dev) {
747 dev_put(dev); 759 dev_put(dev);
748 return -EADDRNOTAVAIL; 760 return -EADDRNOTAVAIL;
749 } 761 }
750 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 762 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 ip_rt_multicast_event(in_dev); 763 ip_rt_multicast_event(in_dev);
752 764
753 /* 765 /* Fill in the VIF structures */
754 * Fill in the VIF structures 766
755 */
756 v->rate_limit = vifc->vifc_rate_limit; 767 v->rate_limit = vifc->vifc_rate_limit;
757 v->local = vifc->vifc_lcl_addr.s_addr; 768 v->local = vifc->vifc_lcl_addr.s_addr;
758 v->remote = vifc->vifc_rmt_addr.s_addr; 769 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
765 v->pkt_in = 0; 776 v->pkt_in = 0;
766 v->pkt_out = 0; 777 v->pkt_out = 0;
767 v->link = dev->ifindex; 778 v->link = dev->ifindex;
768 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 779 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
769 v->link = dev->iflink; 780 v->link = dev->iflink;
770 781
771 /* And finish update writing critical data */ 782 /* And finish update writing critical data */
772 write_lock_bh(&mrt_lock); 783 write_lock_bh(&mrt_lock);
773 v->dev = dev; 784 v->dev = dev;
774#ifdef CONFIG_IP_PIMSM 785#ifdef CONFIG_IP_PIMSM
775 if (v->flags&VIFF_REGISTER) 786 if (v->flags & VIFF_REGISTER)
776 mrt->mroute_reg_vif_num = vifi; 787 mrt->mroute_reg_vif_num = vifi;
777#endif 788#endif
778 if (vifi+1 > mrt->maxvif) 789 if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
781 return 0; 792 return 0;
782} 793}
783 794
795/* called with rcu_read_lock() */
784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, 796static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 __be32 origin, 797 __be32 origin,
786 __be32 mcastgrp) 798 __be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
788 int line = MFC_HASH(mcastgrp, origin); 800 int line = MFC_HASH(mcastgrp, origin);
789 struct mfc_cache *c; 801 struct mfc_cache *c;
790 802
791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 803 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 804 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 return c; 805 return c;
794 } 806 }
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
801static struct mfc_cache *ipmr_cache_alloc(void) 813static struct mfc_cache *ipmr_cache_alloc(void)
802{ 814{
803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 815 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 if (c == NULL) 816
805 return NULL; 817 if (c)
806 c->mfc_un.res.minvif = MAXVIFS; 818 c->mfc_un.res.minvif = MAXVIFS;
807 return c; 819 return c;
808} 820}
809 821
810static struct mfc_cache *ipmr_cache_alloc_unres(void) 822static struct mfc_cache *ipmr_cache_alloc_unres(void)
811{ 823{
812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 824 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 if (c == NULL) 825
814 return NULL; 826 if (c) {
815 skb_queue_head_init(&c->mfc_un.unres.unresolved); 827 skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 c->mfc_un.unres.expires = jiffies + 10*HZ; 828 c->mfc_un.unres.expires = jiffies + 10*HZ;
829 }
817 return c; 830 return c;
818} 831}
819 832
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
827 struct sk_buff *skb; 840 struct sk_buff *skb;
828 struct nlmsgerr *e; 841 struct nlmsgerr *e;
829 842
830 /* 843 /* Play the pending entries through our router */
831 * Play the pending entries through our router
832 */
833 844
834 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 845 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 if (ip_hdr(skb)->version == 0) { 846 if (ip_hdr(skb)->version == 0) {
836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 847 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 848
838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { 849 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 nlh->nlmsg_len = (skb_tail_pointer(skb) - 850 nlh->nlmsg_len = skb_tail_pointer(skb) -
840 (u8 *)nlh); 851 (u8 *)nlh;
841 } else { 852 } else {
842 nlh->nlmsg_type = NLMSG_ERROR; 853 nlh->nlmsg_type = NLMSG_ERROR;
843 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 854 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
848 } 859 }
849 860
850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 861 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 } else 862 } else {
852 ip_mr_forward(net, mrt, skb, c, 0); 863 ip_mr_forward(net, mrt, skb, c, 0);
864 }
853 } 865 }
854} 866}
855 867
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
867 const int ihl = ip_hdrlen(pkt); 879 const int ihl = ip_hdrlen(pkt);
868 struct igmphdr *igmp; 880 struct igmphdr *igmp;
869 struct igmpmsg *msg; 881 struct igmpmsg *msg;
882 struct sock *mroute_sk;
870 int ret; 883 int ret;
871 884
872#ifdef CONFIG_IP_PIMSM 885#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
882#ifdef CONFIG_IP_PIMSM 895#ifdef CONFIG_IP_PIMSM
883 if (assert == IGMPMSG_WHOLEPKT) { 896 if (assert == IGMPMSG_WHOLEPKT) {
884 /* Ugly, but we have no choice with this interface. 897 /* Ugly, but we have no choice with this interface.
885 Duplicate old header, fix ihl, length etc. 898 * Duplicate old header, fix ihl, length etc.
886 And all this only to mangle msg->im_msgtype and 899 * And all this only to mangle msg->im_msgtype and
887 to set msg->im_mbz to "mbz" :-) 900 * to set msg->im_mbz to "mbz" :-)
888 */ 901 */
889 skb_push(skb, sizeof(struct iphdr)); 902 skb_push(skb, sizeof(struct iphdr));
890 skb_reset_network_header(skb); 903 skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
901#endif 914#endif
902 { 915 {
903 916
904 /* 917 /* Copy the IP header */
905 * Copy the IP header
906 */
907 918
908 skb->network_header = skb->tail; 919 skb->network_header = skb->tail;
909 skb_put(skb, ihl); 920 skb_put(skb, ihl);
910 skb_copy_to_linear_data(skb, pkt->data, ihl); 921 skb_copy_to_linear_data(skb, pkt->data, ihl);
911 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 922 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
912 msg = (struct igmpmsg *)skb_network_header(skb); 923 msg = (struct igmpmsg *)skb_network_header(skb);
913 msg->im_vif = vifi; 924 msg->im_vif = vifi;
914 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 925 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 926
916 /* 927 /* Add our header */
917 * Add our header
918 */
919 928
920 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 929 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 igmp->type = 930 igmp->type =
922 msg->im_msgtype = assert; 931 msg->im_msgtype = assert;
923 igmp->code = 0; 932 igmp->code = 0;
924 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 933 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
925 skb->transport_header = skb->network_header; 934 skb->transport_header = skb->network_header;
926 } 935 }
927 936
928 if (mrt->mroute_sk == NULL) { 937 rcu_read_lock();
938 mroute_sk = rcu_dereference(mrt->mroute_sk);
939 if (mroute_sk == NULL) {
940 rcu_read_unlock();
929 kfree_skb(skb); 941 kfree_skb(skb);
930 return -EINVAL; 942 return -EINVAL;
931 } 943 }
932 944
933 /* 945 /* Deliver to mrouted */
934 * Deliver to mrouted 946
935 */ 947 ret = sock_queue_rcv_skb(mroute_sk, skb);
936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); 948 rcu_read_unlock();
937 if (ret < 0) { 949 if (ret < 0) {
938 if (net_ratelimit()) 950 if (net_ratelimit())
939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 951 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
965 } 977 }
966 978
967 if (!found) { 979 if (!found) {
968 /* 980 /* Create a new entry if allowable */
969 * Create a new entry if allowable
970 */
971 981
972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || 982 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 (c = ipmr_cache_alloc_unres()) == NULL) { 983 (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
977 return -ENOBUFS; 987 return -ENOBUFS;
978 } 988 }
979 989
980 /* 990 /* Fill in the new cache entry */
981 * Fill in the new cache entry 991
982 */
983 c->mfc_parent = -1; 992 c->mfc_parent = -1;
984 c->mfc_origin = iph->saddr; 993 c->mfc_origin = iph->saddr;
985 c->mfc_mcastgrp = iph->daddr; 994 c->mfc_mcastgrp = iph->daddr;
986 995
987 /* 996 /* Reflect first query at mrouted. */
988 * Reflect first query at mrouted. 997
989 */
990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); 998 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 if (err < 0) { 999 if (err < 0) {
992 /* If the report failed throw the cache entry 1000 /* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1014 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 } 1015 }
1008 1016
1009 /* 1017 /* See if we can append the packet */
1010 * See if we can append the packet 1018
1011 */ 1019 if (c->mfc_un.unres.unresolved.qlen > 3) {
1012 if (c->mfc_un.unres.unresolved.qlen>3) {
1013 kfree_skb(skb); 1020 kfree_skb(skb);
1014 err = -ENOBUFS; 1021 err = -ENOBUFS;
1015 } else { 1022 } else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1042 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1043 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1044 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 write_lock_bh(&mrt_lock); 1045 list_del_rcu(&c->list);
1039 list_del(&c->list);
1040 write_unlock_bh(&mrt_lock);
1041 1046
1042 ipmr_cache_free(c); 1047 ipmr_cache_free(c);
1043 return 0; 1048 return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1090 if (!mrtsock) 1095 if (!mrtsock)
1091 c->mfc_flags |= MFC_STATIC; 1096 c->mfc_flags |= MFC_STATIC;
1092 1097
1093 write_lock_bh(&mrt_lock); 1098 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 write_unlock_bh(&mrt_lock);
1096 1099
1097 /* 1100 /*
1098 * Check to see if we resolved a queued list. If so we 1101 * Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
1130 LIST_HEAD(list); 1133 LIST_HEAD(list);
1131 struct mfc_cache *c, *next; 1134 struct mfc_cache *c, *next;
1132 1135
1133 /* 1136 /* Shut down all active vif entries */
1134 * Shut down all active vif entries 1137
1135 */
1136 for (i = 0; i < mrt->maxvif; i++) { 1138 for (i = 0; i < mrt->maxvif; i++) {
1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC)) 1139 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
1138 vif_delete(mrt, i, 0, &list); 1140 vif_delete(mrt, i, 0, &list);
1139 } 1141 }
1140 unregister_netdevice_many(&list); 1142 unregister_netdevice_many(&list);
1141 1143
1142 /* 1144 /* Wipe the cache */
1143 * Wipe the cache 1145
1144 */
1145 for (i = 0; i < MFC_LINES; i++) { 1146 for (i = 0; i < MFC_LINES; i++) {
1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1147 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 if (c->mfc_flags&MFC_STATIC) 1148 if (c->mfc_flags & MFC_STATIC)
1148 continue; 1149 continue;
1149 write_lock_bh(&mrt_lock); 1150 list_del_rcu(&c->list);
1150 list_del(&c->list);
1151 write_unlock_bh(&mrt_lock);
1152
1153 ipmr_cache_free(c); 1151 ipmr_cache_free(c);
1154 } 1152 }
1155 } 1153 }
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
1164 } 1162 }
1165} 1163}
1166 1164
1165/* called from ip_ra_control(), before an RCU grace period,
1166 * we dont need to call synchronize_rcu() here
1167 */
1167static void mrtsock_destruct(struct sock *sk) 1168static void mrtsock_destruct(struct sock *sk)
1168{ 1169{
1169 struct net *net = sock_net(sk); 1170 struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
1171 1172
1172 rtnl_lock(); 1173 rtnl_lock();
1173 ipmr_for_each_table(mrt, net) { 1174 ipmr_for_each_table(mrt, net) {
1174 if (sk == mrt->mroute_sk) { 1175 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1176 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 1177 rcu_assign_pointer(mrt->mroute_sk, NULL);
1177 write_lock_bh(&mrt_lock);
1178 mrt->mroute_sk = NULL;
1179 write_unlock_bh(&mrt_lock);
1180
1181 mroute_clean_tables(mrt); 1178 mroute_clean_tables(mrt);
1182 } 1179 }
1183 } 1180 }
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1204 return -ENOENT; 1201 return -ENOENT;
1205 1202
1206 if (optname != MRT_INIT) { 1203 if (optname != MRT_INIT) {
1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) 1204 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1205 !capable(CAP_NET_ADMIN))
1208 return -EACCES; 1206 return -EACCES;
1209 } 1207 }
1210 1208
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1217 return -ENOPROTOOPT; 1215 return -ENOPROTOOPT;
1218 1216
1219 rtnl_lock(); 1217 rtnl_lock();
1220 if (mrt->mroute_sk) { 1218 if (rtnl_dereference(mrt->mroute_sk)) {
1221 rtnl_unlock(); 1219 rtnl_unlock();
1222 return -EADDRINUSE; 1220 return -EADDRINUSE;
1223 } 1221 }
1224 1222
1225 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1223 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 if (ret == 0) { 1224 if (ret == 0) {
1227 write_lock_bh(&mrt_lock); 1225 rcu_assign_pointer(mrt->mroute_sk, sk);
1228 mrt->mroute_sk = sk;
1229 write_unlock_bh(&mrt_lock);
1230
1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1226 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 } 1227 }
1233 rtnl_unlock(); 1228 rtnl_unlock();
1234 return ret; 1229 return ret;
1235 case MRT_DONE: 1230 case MRT_DONE:
1236 if (sk != mrt->mroute_sk) 1231 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1237 return -EACCES; 1232 return -EACCES;
1238 return ip_ra_control(sk, 0, NULL); 1233 return ip_ra_control(sk, 0, NULL);
1239 case MRT_ADD_VIF: 1234 case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1246 return -ENFILE; 1241 return -ENFILE;
1247 rtnl_lock(); 1242 rtnl_lock();
1248 if (optname == MRT_ADD_VIF) { 1243 if (optname == MRT_ADD_VIF) {
1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); 1244 ret = vif_add(net, mrt, &vif,
1245 sk == rtnl_dereference(mrt->mroute_sk));
1250 } else { 1246 } else {
1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); 1247 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 } 1248 }
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1267 if (optname == MRT_DEL_MFC) 1263 if (optname == MRT_DEL_MFC)
1268 ret = ipmr_mfc_delete(mrt, &mfc); 1264 ret = ipmr_mfc_delete(mrt, &mfc);
1269 else 1265 else
1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); 1266 ret = ipmr_mfc_add(net, mrt, &mfc,
1267 sk == rtnl_dereference(mrt->mroute_sk));
1271 rtnl_unlock(); 1268 rtnl_unlock();
1272 return ret; 1269 return ret;
1273 /* 1270 /*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1276 case MRT_ASSERT: 1273 case MRT_ASSERT:
1277 { 1274 {
1278 int v; 1275 int v;
1279 if (get_user(v,(int __user *)optval)) 1276 if (get_user(v, (int __user *)optval))
1280 return -EFAULT; 1277 return -EFAULT;
1281 mrt->mroute_do_assert = (v) ? 1 : 0; 1278 mrt->mroute_do_assert = (v) ? 1 : 0;
1282 return 0; 1279 return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1286 { 1283 {
1287 int v; 1284 int v;
1288 1285
1289 if (get_user(v,(int __user *)optval)) 1286 if (get_user(v, (int __user *)optval))
1290 return -EFAULT; 1287 return -EFAULT;
1291 v = (v) ? 1 : 0; 1288 v = (v) ? 1 : 0;
1292 1289
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1309 return -EINVAL; 1306 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval)) 1307 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT; 1308 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314 1309
1315 rtnl_lock(); 1310 rtnl_lock();
1316 ret = 0; 1311 ret = 0;
1317 if (!ipmr_new_table(net, v)) 1312 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1318 ret = -ENOMEM; 1313 ret = -EBUSY;
1319 raw_sk(sk)->ipmr_table = v; 1314 } else {
1315 if (!ipmr_new_table(net, v))
1316 ret = -ENOMEM;
1317 raw_sk(sk)->ipmr_table = v;
1318 }
1320 rtnl_unlock(); 1319 rtnl_unlock();
1321 return ret; 1320 return ret;
1322 } 1321 }
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1347 1346
1348 if (optname != MRT_VERSION && 1347 if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM 1348#ifdef CONFIG_IP_PIMSM
1350 optname!=MRT_PIM && 1349 optname != MRT_PIM &&
1351#endif 1350#endif
1352 optname!=MRT_ASSERT) 1351 optname != MRT_ASSERT)
1353 return -ENOPROTOOPT; 1352 return -ENOPROTOOPT;
1354 1353
1355 if (get_user(olr, optlen)) 1354 if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1416 if (copy_from_user(&sr, arg, sizeof(sr))) 1415 if (copy_from_user(&sr, arg, sizeof(sr)))
1417 return -EFAULT; 1416 return -EFAULT;
1418 1417
1419 read_lock(&mrt_lock); 1418 rcu_read_lock();
1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); 1419 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 if (c) { 1420 if (c) {
1422 sr.pktcnt = c->mfc_un.res.pkt; 1421 sr.pktcnt = c->mfc_un.res.pkt;
1423 sr.bytecnt = c->mfc_un.res.bytes; 1422 sr.bytecnt = c->mfc_un.res.bytes;
1424 sr.wrong_if = c->mfc_un.res.wrong_if; 1423 sr.wrong_if = c->mfc_un.res.wrong_if;
1425 read_unlock(&mrt_lock); 1424 rcu_read_unlock();
1426 1425
1427 if (copy_to_user(arg, &sr, sizeof(sr))) 1426 if (copy_to_user(arg, &sr, sizeof(sr)))
1428 return -EFAULT; 1427 return -EFAULT;
1429 return 0; 1428 return 0;
1430 } 1429 }
1431 read_unlock(&mrt_lock); 1430 rcu_read_unlock();
1432 return -EADDRNOTAVAIL; 1431 return -EADDRNOTAVAIL;
1433 default: 1432 default:
1434 return -ENOIOCTLCMD; 1433 return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
1465}; 1464};
1466 1465
1467/* 1466/*
1468 * Encapsulate a packet by attaching a valid IPIP header to it. 1467 * Encapsulate a packet by attaching a valid IPIP header to it.
1469 * This avoids tunnel drivers and other mess and gives us the speed so 1468 * This avoids tunnel drivers and other mess and gives us the speed so
1470 * important for multicast video. 1469 * important for multicast video.
1471 */ 1470 */
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1480 skb_reset_network_header(skb); 1479 skb_reset_network_header(skb);
1481 iph = ip_hdr(skb); 1480 iph = ip_hdr(skb);
1482 1481
1483 iph->version = 4; 1482 iph->version = 4;
1484 iph->tos = old_iph->tos; 1483 iph->tos = old_iph->tos;
1485 iph->ttl = old_iph->ttl; 1484 iph->ttl = old_iph->ttl;
1486 iph->frag_off = 0; 1485 iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1498 1497
1499static inline int ipmr_forward_finish(struct sk_buff *skb) 1498static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{ 1499{
1501 struct ip_options * opt = &(IPCB(skb)->opt); 1500 struct ip_options *opt = &(IPCB(skb)->opt);
1502 1501
1503 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1502 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 1503
@@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535 } 1534 }
1536#endif 1535#endif
1537 1536
1538 if (vif->flags&VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1539 struct flowi fl = { .oif = vif->link, 1538 struct flowi fl = {
1540 .nl_u = { .ip4_u = 1539 .oif = vif->link,
1541 { .daddr = vif->remote, 1540 .nl_u = {
1542 .saddr = vif->local, 1541 .ip4_u = {
1543 .tos = RT_TOS(iph->tos) } }, 1542 .daddr = vif->remote,
1544 .proto = IPPROTO_IPIP }; 1543 .saddr = vif->local,
1544 .tos = RT_TOS(iph->tos)
1545 }
1546 },
1547 .proto = IPPROTO_IPIP
1548 };
1549
1545 if (ip_route_output_key(net, &rt, &fl)) 1550 if (ip_route_output_key(net, &rt, &fl))
1546 goto out_free; 1551 goto out_free;
1547 encap = sizeof(struct iphdr); 1552 encap = sizeof(struct iphdr);
1548 } else { 1553 } else {
1549 struct flowi fl = { .oif = vif->link, 1554 struct flowi fl = {
1550 .nl_u = { .ip4_u = 1555 .oif = vif->link,
1551 { .daddr = iph->daddr, 1556 .nl_u = {
1552 .tos = RT_TOS(iph->tos) } }, 1557 .ip4_u = {
1553 .proto = IPPROTO_IPIP }; 1558 .daddr = iph->daddr,
1559 .tos = RT_TOS(iph->tos)
1560 }
1561 },
1562 .proto = IPPROTO_IPIP
1563 };
1564
1554 if (ip_route_output_key(net, &rt, &fl)) 1565 if (ip_route_output_key(net, &rt, &fl))
1555 goto out_free; 1566 goto out_free;
1556 } 1567 }
@@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1559 1570
1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { 1571 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 /* Do not fragment multicasts. Alas, IPv4 does not 1572 /* Do not fragment multicasts. Alas, IPv4 does not
1562 allow to send ICMP, so that packets will disappear 1573 * allow to send ICMP, so that packets will disappear
1563 to blackhole. 1574 * to blackhole.
1564 */ 1575 */
1565 1576
1566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1577 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1583 ip_decrease_ttl(ip_hdr(skb)); 1594 ip_decrease_ttl(ip_hdr(skb));
1584 1595
1585 /* FIXME: forward and output firewalls used to be called here. 1596 /* FIXME: forward and output firewalls used to be called here.
1586 * What do we do with netfilter? -- RR */ 1597 * What do we do with netfilter? -- RR
1598 */
1587 if (vif->flags & VIFF_TUNNEL) { 1599 if (vif->flags & VIFF_TUNNEL) {
1588 ip_encap(skb, vif->local, vif->remote); 1600 ip_encap(skb, vif->local, vif->remote);
1589 /* FIXME: extra output firewall step used to be here. --RR */ 1601 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1644 1656
1645 if (skb_rtable(skb)->fl.iif == 0) { 1657 if (skb_rtable(skb)->fl.iif == 0) {
1646 /* It is our own packet, looped back. 1658 /* It is our own packet, looped back.
1647 Very complicated situation... 1659 * Very complicated situation...
1648 1660 *
1649 The best workaround until routing daemons will be 1661 * The best workaround until routing daemons will be
1650 fixed is not to redistribute packet, if it was 1662 * fixed is not to redistribute packet, if it was
1651 send through wrong interface. It means, that 1663 * send through wrong interface. It means, that
1652 multicast applications WILL NOT work for 1664 * multicast applications WILL NOT work for
1653 (S,G), which have default multicast route pointing 1665 * (S,G), which have default multicast route pointing
1654 to wrong oif. In any case, it is not a good 1666 * to wrong oif. In any case, it is not a good
1655 idea to use multicasting applications on router. 1667 * idea to use multicasting applications on router.
1656 */ 1668 */
1657 goto dont_forward; 1669 goto dont_forward;
1658 } 1670 }
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1662 1674
1663 if (true_vifi >= 0 && mrt->mroute_do_assert && 1675 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 /* pimsm uses asserts, when switching from RPT to SPT, 1676 /* pimsm uses asserts, when switching from RPT to SPT,
1665 so that we cannot check that packet arrived on an oif. 1677 * so that we cannot check that packet arrived on an oif.
1666 It is bad, but otherwise we would need to move pretty 1678 * It is bad, but otherwise we would need to move pretty
1667 large chunk of pimd to kernel. Ough... --ANK 1679 * large chunk of pimd to kernel. Ough... --ANK
1668 */ 1680 */
1669 (mrt->mroute_do_pim || 1681 (mrt->mroute_do_pim ||
1670 cache->mfc_un.res.ttls[true_vifi] < 255) && 1682 cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1682 /* 1694 /*
1683 * Forward the frame 1695 * Forward the frame
1684 */ 1696 */
1685 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1697 for (ct = cache->mfc_un.res.maxvif - 1;
1698 ct >= cache->mfc_un.res.minvif; ct--) {
1686 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1699 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 if (psend != -1) { 1700 if (psend != -1) {
1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1701 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1702
1689 if (skb2) 1703 if (skb2)
1690 ipmr_queue_xmit(net, mrt, skb2, cache, 1704 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend); 1705 psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1696 if (psend != -1) { 1710 if (psend != -1) {
1697 if (local) { 1711 if (local) {
1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1712 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1713
1699 if (skb2) 1714 if (skb2)
1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend); 1715 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 } else { 1716 } else {
@@ -1713,6 +1728,7 @@ dont_forward:
1713 1728
1714/* 1729/*
1715 * Multicast packets for forwarding arrive here 1730 * Multicast packets for forwarding arrive here
1731 * Called with rcu_read_lock();
1716 */ 1732 */
1717 1733
1718int ip_mr_input(struct sk_buff *skb) 1734int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
1724 int err; 1740 int err;
1725 1741
1726 /* Packet is looped back after forward, it should not be 1742 /* Packet is looped back after forward, it should not be
1727 forwarded second time, but still can be delivered locally. 1743 * forwarded second time, but still can be delivered locally.
1728 */ 1744 */
1729 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1745 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730 goto dont_forward; 1746 goto dont_forward;
1731 1747
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1748 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
1736 } 1752 }
1737 1753
1738 if (!local) { 1754 if (!local) {
1739 if (IPCB(skb)->opt.router_alert) { 1755 if (IPCB(skb)->opt.router_alert) {
1740 if (ip_call_ra_chain(skb)) 1756 if (ip_call_ra_chain(skb))
1741 return 0; 1757 return 0;
1742 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1758 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743 /* IGMPv1 (and broken IGMPv2 implementations sort of 1759 /* IGMPv1 (and broken IGMPv2 implementations sort of
1744 Cisco IOS <= 11.2(8)) do not put router alert 1760 * Cisco IOS <= 11.2(8)) do not put router alert
1745 option to IGMP packets destined to routable 1761 * option to IGMP packets destined to routable
1746 groups. It is very bad, because it means 1762 * groups. It is very bad, because it means
1747 that we can forward NO IGMP messages. 1763 * that we can forward NO IGMP messages.
1748 */ 1764 */
1749 read_lock(&mrt_lock); 1765 struct sock *mroute_sk;
1750 if (mrt->mroute_sk) { 1766
1751 nf_reset(skb); 1767 mroute_sk = rcu_dereference(mrt->mroute_sk);
1752 raw_rcv(mrt->mroute_sk, skb); 1768 if (mroute_sk) {
1753 read_unlock(&mrt_lock); 1769 nf_reset(skb);
1754 return 0; 1770 raw_rcv(mroute_sk, skb);
1755 } 1771 return 0;
1756 read_unlock(&mrt_lock); 1772 }
1757 } 1773 }
1758 } 1774 }
1759 1775
1760 read_lock(&mrt_lock); 1776 /* already under rcu_read_lock() */
1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1777 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 1778
1763 /* 1779 /*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
1769 if (local) { 1785 if (local) {
1770 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1786 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 ip_local_deliver(skb); 1787 ip_local_deliver(skb);
1772 if (skb2 == NULL) { 1788 if (skb2 == NULL)
1773 read_unlock(&mrt_lock);
1774 return -ENOBUFS; 1789 return -ENOBUFS;
1775 }
1776 skb = skb2; 1790 skb = skb2;
1777 } 1791 }
1778 1792
1793 read_lock(&mrt_lock);
1779 vif = ipmr_find_vif(mrt, skb->dev); 1794 vif = ipmr_find_vif(mrt, skb->dev);
1780 if (vif >= 0) { 1795 if (vif >= 0) {
1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb); 1796 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
1788 return -ENODEV; 1803 return -ENODEV;
1789 } 1804 }
1790 1805
1806 read_lock(&mrt_lock);
1791 ip_mr_forward(net, mrt, skb, cache, local); 1807 ip_mr_forward(net, mrt, skb, cache, local);
1792
1793 read_unlock(&mrt_lock); 1808 read_unlock(&mrt_lock);
1794 1809
1795 if (local) 1810 if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
1805} 1820}
1806 1821
1807#ifdef CONFIG_IP_PIMSM 1822#ifdef CONFIG_IP_PIMSM
1823/* called with rcu_read_lock() */
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, 1824static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen) 1825 unsigned int pimlen)
1810{ 1826{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1813 1829
1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1830 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 /* 1831 /*
1816 Check that: 1832 * Check that:
1817 a. packet is really destinted to a multicast group 1833 * a. packet is really sent to a multicast group
1818 b. packet is not a NULL-REGISTER 1834 * b. packet is not a NULL-REGISTER
1819 c. packet is not truncated 1835 * c. packet is not truncated
1820 */ 1836 */
1821 if (!ipv4_is_multicast(encap->daddr) || 1837 if (!ipv4_is_multicast(encap->daddr) ||
1822 encap->tot_len == 0 || 1838 encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1826 read_lock(&mrt_lock); 1842 read_lock(&mrt_lock);
1827 if (mrt->mroute_reg_vif_num >= 0) 1843 if (mrt->mroute_reg_vif_num >= 0)
1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 1844 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 if (reg_dev)
1830 dev_hold(reg_dev);
1831 read_unlock(&mrt_lock); 1845 read_unlock(&mrt_lock);
1832 1846
1833 if (reg_dev == NULL) 1847 if (reg_dev == NULL)
1834 return 1; 1848 return 1;
1835 1849
1836 skb->mac_header = skb->network_header; 1850 skb->mac_header = skb->network_header;
1837 skb_pull(skb, (u8*)encap - skb->data); 1851 skb_pull(skb, (u8 *)encap - skb->data);
1838 skb_reset_network_header(skb); 1852 skb_reset_network_header(skb);
1839 skb->protocol = htons(ETH_P_IP); 1853 skb->protocol = htons(ETH_P_IP);
1840 skb->ip_summed = 0; 1854 skb->ip_summed = CHECKSUM_NONE;
1841 skb->pkt_type = PACKET_HOST; 1855 skb->pkt_type = PACKET_HOST;
1842 1856
1843 skb_tunnel_rx(skb, reg_dev); 1857 skb_tunnel_rx(skb, reg_dev);
1844 1858
1845 netif_rx(skb); 1859 netif_rx(skb);
1846 dev_put(reg_dev);
1847 1860
1848 return 0; 1861 return NET_RX_SUCCESS;
1849} 1862}
1850#endif 1863#endif
1851 1864
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1854 * Handle IGMP messages of PIMv1 1867 * Handle IGMP messages of PIMv1
1855 */ 1868 */
1856 1869
1857int pim_rcv_v1(struct sk_buff * skb) 1870int pim_rcv_v1(struct sk_buff *skb)
1858{ 1871{
1859 struct igmphdr *pim; 1872 struct igmphdr *pim;
1860 struct net *net = dev_net(skb->dev); 1873 struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
1881#endif 1894#endif
1882 1895
1883#ifdef CONFIG_IP_PIMSM_V2 1896#ifdef CONFIG_IP_PIMSM_V2
1884static int pim_rcv(struct sk_buff * skb) 1897static int pim_rcv(struct sk_buff *skb)
1885{ 1898{
1886 struct pimreghdr *pim; 1899 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev); 1900 struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
1891 goto drop; 1904 goto drop;
1892 1905
1893 pim = (struct pimreghdr *)skb_transport_header(skb); 1906 pim = (struct pimreghdr *)skb_transport_header(skb);
1894 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1907 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1895 (pim->flags&PIM_NULL_REGISTER) || 1908 (pim->flags & PIM_NULL_REGISTER) ||
1896 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1909 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1910 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 goto drop; 1911 goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
1958 if (mrt == NULL) 1971 if (mrt == NULL)
1959 return -ENOENT; 1972 return -ENOENT;
1960 1973
1961 read_lock(&mrt_lock); 1974 rcu_read_lock();
1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); 1975 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1963 1976
1964 if (cache == NULL) { 1977 if (cache == NULL) {
1965 struct sk_buff *skb2; 1978 struct sk_buff *skb2;
1966 struct iphdr *iph; 1979 struct iphdr *iph;
1967 struct net_device *dev; 1980 struct net_device *dev;
1968 int vif; 1981 int vif = -1;
1969 1982
1970 if (nowait) { 1983 if (nowait) {
1971 read_unlock(&mrt_lock); 1984 rcu_read_unlock();
1972 return -EAGAIN; 1985 return -EAGAIN;
1973 } 1986 }
1974 1987
1975 dev = skb->dev; 1988 dev = skb->dev;
1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { 1989 read_lock(&mrt_lock);
1990 if (dev)
1991 vif = ipmr_find_vif(mrt, dev);
1992 if (vif < 0) {
1977 read_unlock(&mrt_lock); 1993 read_unlock(&mrt_lock);
1994 rcu_read_unlock();
1978 return -ENODEV; 1995 return -ENODEV;
1979 } 1996 }
1980 skb2 = skb_clone(skb, GFP_ATOMIC); 1997 skb2 = skb_clone(skb, GFP_ATOMIC);
1981 if (!skb2) { 1998 if (!skb2) {
1982 read_unlock(&mrt_lock); 1999 read_unlock(&mrt_lock);
2000 rcu_read_unlock();
1983 return -ENOMEM; 2001 return -ENOMEM;
1984 } 2002 }
1985 2003
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
1992 iph->version = 0; 2010 iph->version = 0;
1993 err = ipmr_cache_unresolved(mrt, vif, skb2); 2011 err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 read_unlock(&mrt_lock); 2012 read_unlock(&mrt_lock);
2013 rcu_read_unlock();
1995 return err; 2014 return err;
1996 } 2015 }
1997 2016
1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2017 read_lock(&mrt_lock);
2018 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1999 cache->mfc_flags |= MFC_NOTIFY; 2019 cache->mfc_flags |= MFC_NOTIFY;
2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm); 2020 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 read_unlock(&mrt_lock); 2021 read_unlock(&mrt_lock);
2022 rcu_read_unlock();
2002 return err; 2023 return err;
2003} 2024}
2004 2025
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2050 s_h = cb->args[1]; 2071 s_h = cb->args[1];
2051 s_e = cb->args[2]; 2072 s_e = cb->args[2];
2052 2073
2053 read_lock(&mrt_lock); 2074 rcu_read_lock();
2054 ipmr_for_each_table(mrt, net) { 2075 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t) 2076 if (t < s_t)
2056 goto next_table; 2077 goto next_table;
2057 if (t > s_t) 2078 if (t > s_t)
2058 s_h = 0; 2079 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) { 2080 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { 2081 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e) 2082 if (e < s_e)
2062 goto next_entry; 2083 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb, 2084 if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
2075 t++; 2096 t++;
2076 } 2097 }
2077done: 2098done:
2078 read_unlock(&mrt_lock); 2099 rcu_read_unlock();
2079 2100
2080 cb->args[2] = e; 2101 cb->args[2] = e;
2081 cb->args[1] = h; 2102 cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
2086 2107
2087#ifdef CONFIG_PROC_FS 2108#ifdef CONFIG_PROC_FS
2088/* 2109/*
2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2110 * The /proc interfaces to multicast routing :
2111 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2090 */ 2112 */
2091struct ipmr_vif_iter { 2113struct ipmr_vif_iter {
2092 struct seq_net_private p; 2114 struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2208 struct mr_table *mrt = it->mrt; 2230 struct mr_table *mrt = it->mrt;
2209 struct mfc_cache *mfc; 2231 struct mfc_cache *mfc;
2210 2232
2211 read_lock(&mrt_lock); 2233 rcu_read_lock();
2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2234 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 it->cache = &mrt->mfc_cache_array[it->ct]; 2235 it->cache = &mrt->mfc_cache_array[it->ct];
2214 list_for_each_entry(mfc, it->cache, list) 2236 list_for_each_entry_rcu(mfc, it->cache, list)
2215 if (pos-- == 0) 2237 if (pos-- == 0)
2216 return mfc; 2238 return mfc;
2217 } 2239 }
2218 read_unlock(&mrt_lock); 2240 rcu_read_unlock();
2219 2241
2220 spin_lock_bh(&mfc_unres_lock); 2242 spin_lock_bh(&mfc_unres_lock);
2221 it->cache = &mrt->mfc_unres_queue; 2243 it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 } 2296 }
2275 2297
2276 /* exhausted cache_array, show unresolved */ 2298 /* exhausted cache_array, show unresolved */
2277 read_unlock(&mrt_lock); 2299 rcu_read_unlock();
2278 it->cache = &mrt->mfc_unres_queue; 2300 it->cache = &mrt->mfc_unres_queue;
2279 it->ct = 0; 2301 it->ct = 0;
2280 2302
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 if (!list_empty(it->cache)) 2304 if (!list_empty(it->cache))
2283 return list_first_entry(it->cache, struct mfc_cache, list); 2305 return list_first_entry(it->cache, struct mfc_cache, list);
2284 2306
2285 end_of_list: 2307end_of_list:
2286 spin_unlock_bh(&mfc_unres_lock); 2308 spin_unlock_bh(&mfc_unres_lock);
2287 it->cache = NULL; 2309 it->cache = NULL;
2288 2310
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2297 if (it->cache == &mrt->mfc_unres_queue) 2319 if (it->cache == &mrt->mfc_unres_queue)
2298 spin_unlock_bh(&mfc_unres_lock); 2320 spin_unlock_bh(&mfc_unres_lock);
2299 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2321 else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 read_unlock(&mrt_lock); 2322 rcu_read_unlock();
2301} 2323}
2302 2324
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2325static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2323 mfc->mfc_un.res.bytes, 2345 mfc->mfc_un.res.bytes,
2324 mfc->mfc_un.res.wrong_if); 2346 mfc->mfc_un.res.wrong_if);
2325 for (n = mfc->mfc_un.res.minvif; 2347 for (n = mfc->mfc_un.res.minvif;
2326 n < mfc->mfc_un.res.maxvif; n++ ) { 2348 n < mfc->mfc_un.res.maxvif; n++) {
2327 if (VIF_EXISTS(mrt, n) && 2349 if (VIF_EXISTS(mrt, n) &&
2328 mfc->mfc_un.res.ttls[n] < 255) 2350 mfc->mfc_un.res.ttls[n] < 255)
2329 seq_printf(seq, 2351 seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
2421 2443
2422 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2444 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 sizeof(struct mfc_cache), 2445 sizeof(struct mfc_cache),
2424 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2446 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2425 NULL); 2447 NULL);
2426 if (!mrt_cachep) 2448 if (!mrt_cachep)
2427 return -ENOMEM; 2449 return -ENOMEM;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..8b642f152468 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
72 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
74 74
75 return (ret != 0); 75 return ret != 0;
76} 76}
77 77
78/* 78/*
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
30#include <net/net_namespace.h> 30#include <net/net_namespace.h>
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
32 33
33#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
34 35
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
231{ 232{
232 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
233 unsigned long hashval; 234 unsigned long hashval;
234 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
235 const u_int16_t *ports; 236 int poff;
236 237
237 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
238 case IPPROTO_TCP: 239 if (poff >= 0) {
239 case IPPROTO_UDP: 240 const u_int16_t *ports;
240 case IPPROTO_UDPLITE: 241 u16 _ports[2];
241 case IPPROTO_SCTP: 242
242 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
243 case IPPROTO_ICMP: 244 if (ports) {
244 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
245 sport = ports[0]; 246 dport = ports[1];
246 dport = ports[1]; 247 }
247 break; 248 } else {
248 default:
249 if (net_ratelimit()) 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol); 250 pr_info("unknown protocol %u\n", iph->protocol);
251 sport = dport = 0;
252 } 251 }
253 252
254 switch (config->hash_mode) { 253 switch (config->hash_mode) {
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..65699c24411c 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; 31const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly;
32static DEFINE_SPINLOCK(inet_proto_lock);
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock);
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 int hash, ret; 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 40
42 hash = protocol & (MAX_INET_PROTOS - 1); 41 return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1;
43
44 spin_lock_bh(&inet_proto_lock);
45 if (inet_protos[hash]) {
46 ret = -1;
47 } else {
48 inet_protos[hash] = prot;
49 ret = 0;
50 }
51 spin_unlock_bh(&inet_proto_lock);
52
53 return ret;
54} 42}
55EXPORT_SYMBOL(inet_add_protocol); 43EXPORT_SYMBOL(inet_add_protocol);
56 44
@@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol);
60 48
61int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 49int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
62{ 50{
63 int hash, ret; 51 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
64
65 hash = protocol & (MAX_INET_PROTOS - 1);
66 52
67 spin_lock_bh(&inet_proto_lock); 53 ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1;
68 if (inet_protos[hash] == prot) {
69 inet_protos[hash] = NULL;
70 ret = 0;
71 } else {
72 ret = -1;
73 }
74 spin_unlock_bh(&inet_proto_lock);
75 54
76 synchronize_net(); 55 synchronize_net();
77 56
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..1f85ef289895 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
505 505
506 ipc.addr = inet->inet_saddr; 506 ipc.addr = inet->inet_saddr;
507 ipc.opt = NULL; 507 ipc.opt = NULL;
508 ipc.shtx.flags = 0; 508 ipc.tx_flags = 0;
509 ipc.oif = sk->sk_bound_dev_if; 509 ipc.oif = sk->sk_bound_dev_if;
510 510
511 if (msg->msg_controllen) { 511 if (msg->msg_controllen) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ac6559cb54f9..04e0df82b88c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1107,6 +1107,7 @@ restart:
1107 * on the route gc list. 1107 * on the route gc list.
1108 */ 1108 */
1109 1109
1110 rt->dst.flags |= DST_NOCACHE;
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1111 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 int err = arp_bind_neighbour(&rt->dst); 1112 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1113 if (err) {
@@ -1268,18 +1269,11 @@ skip_hashing:
1268 1269
1269void rt_bind_peer(struct rtable *rt, int create) 1270void rt_bind_peer(struct rtable *rt, int create)
1270{ 1271{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer; 1272 struct inet_peer *peer;
1273 1273
1274 peer = inet_getpeer(rt->rt_dst, create); 1274 peer = inet_getpeer(rt->rt_dst, create);
1275 1275
1276 spin_lock_bh(&rt_peer_lock); 1276 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer); 1277 inet_putpeer(peer);
1284} 1278}
1285 1279
@@ -2365,9 +2359,8 @@ static int __mkroute_output(struct rtable **result,
2365 struct rtable *rth; 2359 struct rtable *rth;
2366 struct in_device *in_dev; 2360 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2361 u32 tos = RT_FL_TOS(oldflp);
2368 int err = 0;
2369 2362
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2363 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2364 return -EINVAL;
2372 2365
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2366 if (fl->fl4_dst == htonl(0xFFFFFFFF))
@@ -2380,11 +2373,12 @@ static int __mkroute_output(struct rtable **result,
2380 if (dev_out->flags & IFF_LOOPBACK) 2373 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2374 flags |= RTCF_LOCAL;
2382 2375
2383 /* get work reference to inet device */ 2376 rcu_read_lock();
2384 in_dev = in_dev_get(dev_out); 2377 in_dev = __in_dev_get_rcu(dev_out);
2385 if (!in_dev) 2378 if (!in_dev) {
2379 rcu_read_unlock();
2386 return -EINVAL; 2380 return -EINVAL;
2387 2381 }
2388 if (res->type == RTN_BROADCAST) { 2382 if (res->type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2383 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2384 if (res->fi) {
@@ -2392,13 +2386,13 @@ static int __mkroute_output(struct rtable **result,
2392 res->fi = NULL; 2386 res->fi = NULL;
2393 } 2387 }
2394 } else if (res->type == RTN_MULTICAST) { 2388 } else if (res->type == RTN_MULTICAST) {
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2389 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2390 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto)) 2391 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2392 flags &= ~RTCF_LOCAL;
2399 /* If multicast route do not exist use 2393 /* If multicast route do not exist use
2400 default one, but do not gateway in this case. 2394 * default one, but do not gateway in this case.
2401 Yes, it is hack. 2395 * Yes, it is hack.
2402 */ 2396 */
2403 if (res->fi && res->prefixlen < 4) { 2397 if (res->fi && res->prefixlen < 4) {
2404 fib_info_put(res->fi); 2398 fib_info_put(res->fi);
@@ -2409,9 +2403,12 @@ static int __mkroute_output(struct rtable **result,
2409 2403
2410 rth = dst_alloc(&ipv4_dst_ops); 2404 rth = dst_alloc(&ipv4_dst_ops);
2411 if (!rth) { 2405 if (!rth) {
2412 err = -ENOBUFS; 2406 rcu_read_unlock();
2413 goto cleanup; 2407 return -ENOBUFS;
2414 } 2408 }
2409 in_dev_hold(in_dev);
2410 rcu_read_unlock();
2411 rth->idev = in_dev;
2415 2412
2416 atomic_set(&rth->dst.__refcnt, 1); 2413 atomic_set(&rth->dst.__refcnt, 1);
2417 rth->dst.flags= DST_HOST; 2414 rth->dst.flags= DST_HOST;
@@ -2432,7 +2429,6 @@ static int __mkroute_output(struct rtable **result,
2432 cache entry */ 2429 cache entry */
2433 rth->dst.dev = dev_out; 2430 rth->dst.dev = dev_out;
2434 dev_hold(dev_out); 2431 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst; 2432 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src; 2433 rth->rt_spec_dst= fl->fl4_src;
2438 2434
@@ -2467,13 +2463,8 @@ static int __mkroute_output(struct rtable **result,
2467 rt_set_nexthop(rth, res, 0); 2463 rt_set_nexthop(rth, res, 0);
2468 2464
2469 rth->rt_flags = flags; 2465 rth->rt_flags = flags;
2470
2471 *result = rth; 2466 *result = rth;
2472 cleanup: 2467 return 0;
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477} 2468}
2478 2469
2479static int ip_mkroute_output(struct rtable **rp, 2470static int ip_mkroute_output(struct rtable **rp,
@@ -2497,6 +2488,7 @@ static int ip_mkroute_output(struct rtable **rp,
2497 2488
2498/* 2489/*
2499 * Major route resolver routine. 2490 * Major route resolver routine.
2491 * called with rcu_read_lock();
2500 */ 2492 */
2501 2493
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2494static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2515,7 +2507,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2515 .iif = net->loopback_dev->ifindex, 2507 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif }; 2508 .oif = oldflp->oif };
2517 struct fib_result res; 2509 struct fib_result res;
2518 unsigned flags = 0; 2510 unsigned int flags = 0;
2519 struct net_device *dev_out = NULL; 2511 struct net_device *dev_out = NULL;
2520 int free_res = 0; 2512 int free_res = 0;
2521 int err; 2513 int err;
@@ -2545,7 +2537,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2537 (ipv4_is_multicast(oldflp->fl4_dst) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2538 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2539 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2540 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2549 if (dev_out == NULL) 2541 if (dev_out == NULL)
2550 goto out; 2542 goto out;
2551 2543
@@ -2570,26 +2562,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2570 2562
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2563 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2564 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2565 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2574 if (dev_out == NULL)
2575 goto out; 2566 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2567 }
2579 } 2568 }
2580 2569
2581 2570
2582 if (oldflp->oif) { 2571 if (oldflp->oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2572 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2584 err = -ENODEV; 2573 err = -ENODEV;
2585 if (dev_out == NULL) 2574 if (dev_out == NULL)
2586 goto out; 2575 goto out;
2587 2576
2588 /* RACE: Check return value of inet_select_addr instead. */ 2577 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2578 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2590 dev_put(dev_out);
2591 goto out; /* Wrong error code */ 2579 goto out; /* Wrong error code */
2592 }
2593 2580
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2581 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2582 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
@@ -2612,10 +2599,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2612 fl.fl4_dst = fl.fl4_src; 2599 fl.fl4_dst = fl.fl4_src;
2613 if (!fl.fl4_dst) 2600 if (!fl.fl4_dst)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2601 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2602 dev_out = net->loopback_dev;
2618 dev_hold(dev_out);
2619 fl.oif = net->loopback_dev->ifindex; 2603 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2604 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2605 flags |= RTCF_LOCAL;
@@ -2649,8 +2633,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2649 res.type = RTN_UNICAST; 2633 res.type = RTN_UNICAST;
2650 goto make_route; 2634 goto make_route;
2651 } 2635 }
2652 if (dev_out)
2653 dev_put(dev_out);
2654 err = -ENETUNREACH; 2636 err = -ENETUNREACH;
2655 goto out; 2637 goto out;
2656 } 2638 }
@@ -2659,10 +2641,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2659 if (res.type == RTN_LOCAL) { 2641 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2642 if (!fl.fl4_src)
2661 fl.fl4_src = fl.fl4_dst; 2643 fl.fl4_src = fl.fl4_dst;
2662 if (dev_out)
2663 dev_put(dev_out);
2664 dev_out = net->loopback_dev; 2644 dev_out = net->loopback_dev;
2665 dev_hold(dev_out);
2666 fl.oif = dev_out->ifindex; 2645 fl.oif = dev_out->ifindex;
2667 if (res.fi) 2646 if (res.fi)
2668 fib_info_put(res.fi); 2647 fib_info_put(res.fi);
@@ -2682,28 +2661,23 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2682 if (!fl.fl4_src) 2661 if (!fl.fl4_src)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2662 fl.fl4_src = FIB_RES_PREFSRC(res);
2684 2663
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2664 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out);
2689 fl.oif = dev_out->ifindex; 2665 fl.oif = dev_out->ifindex;
2690 2666
2691 2667
2692make_route: 2668make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2669 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2694 2670
2695
2696 if (free_res) 2671 if (free_res)
2697 fib_res_put(&res); 2672 fib_res_put(&res);
2698 if (dev_out)
2699 dev_put(dev_out);
2700out: return err; 2673out: return err;
2701} 2674}
2702 2675
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2676int __ip_route_output_key(struct net *net, struct rtable **rp,
2704 const struct flowi *flp) 2677 const struct flowi *flp)
2705{ 2678{
2706 unsigned hash; 2679 unsigned int hash;
2680 int res;
2707 struct rtable *rth; 2681 struct rtable *rth;
2708 2682
2709 if (!rt_caching(net)) 2683 if (!rt_caching(net))
@@ -2734,7 +2708,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2734 rcu_read_unlock_bh(); 2708 rcu_read_unlock_bh();
2735 2709
2736slow_output: 2710slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2711 rcu_read_lock();
2712 res = ip_route_output_slow(net, rp, flp);
2713 rcu_read_unlock();
2714 return res;
2738} 2715}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2716EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2717
@@ -2798,7 +2775,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2798 2775
2799 dst_release(&(*rp)->dst); 2776 dst_release(&(*rp)->dst);
2800 *rp = rt; 2777 *rp = rt;
2801 return (rt ? 0 : -ENOMEM); 2778 return rt ? 0 : -ENOMEM;
2802} 2779}
2803 2780
2804int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2781int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f115ea68a4ef..1664a0590bb8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2392,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2392 err = tp->af_specific->md5_parse(sk, optval, optlen); 2392 err = tp->af_specific->md5_parse(sk, optval, optlen);
2393 break; 2393 break;
2394#endif 2394#endif
2395 2395 case TCP_USER_TIMEOUT:
2396 /* Cap the max timeout in ms TCP will retry/retrans
2397 * before giving up and aborting (ETIMEDOUT) a connection.
2398 */
2399 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2400 break;
2396 default: 2401 default:
2397 err = -ENOPROTOOPT; 2402 err = -ENOPROTOOPT;
2398 break; 2403 break;
@@ -2611,6 +2616,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2611 case TCP_THIN_DUPACK: 2616 case TCP_THIN_DUPACK:
2612 val = tp->thin_dupack; 2617 val = tp->thin_dupack;
2613 break; 2618 break;
2619
2620 case TCP_USER_TIMEOUT:
2621 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2622 break;
2614 default: 2623 default:
2615 return -ENOPROTOOPT; 2624 return -ENOPROTOOPT;
2616 } 2625 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b55f60f6fcbe..f6fdd727a23d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
183} 183}
184 184
185void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
186{ 186{
187 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
188 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
805 } 805 }
806} 806}
807 807
808/* Numbers are taken from RFC3390.
809 *
810 * John Heffner states:
811 *
812 * The RFC specifies a window of no more than 4380 bytes
813 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
814 * is a bit misleading because they use a clamp at 4380 bytes
815 * rather than use a multiplier in the relevant range.
816 */
817__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 808__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
818{ 809{
819 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 810 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
820 811
821 if (!cwnd) { 812 if (!cwnd)
822 if (tp->mss_cache > 1460) 813 cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
823 cwnd = 2;
824 else
825 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
826 }
827 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 814 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
828} 815}
829 816
@@ -2314,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2314 2301
2315static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2302static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2316{ 2303{
2317 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2304 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2318} 2305}
2319 2306
2320static inline int tcp_head_timedout(struct sock *sk) 2307static inline int tcp_head_timedout(struct sock *sk)
@@ -3412,8 +3399,8 @@ static void tcp_ack_probe(struct sock *sk)
3412 3399
3413static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3400static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3414{ 3401{
3415 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3402 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3416 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3403 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3417} 3404}
3418 3405
3419static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3406static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3430,9 +3417,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3430 const u32 ack, const u32 ack_seq, 3417 const u32 ack, const u32 ack_seq,
3431 const u32 nwin) 3418 const u32 nwin)
3432{ 3419{
3433 return (after(ack, tp->snd_una) || 3420 return after(ack, tp->snd_una) ||
3434 after(ack_seq, tp->snd_wl1) || 3421 after(ack_seq, tp->snd_wl1) ||
3435 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3422 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3436} 3423}
3437 3424
3438/* Update our send window. 3425/* Update our send window.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..a0232f3a358b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2571 2571
2572 return tcp_gro_receive(head, skb); 2572 return tcp_gro_receive(head, skb);
2573} 2573}
2574EXPORT_SYMBOL(tcp4_gro_receive);
2575 2574
2576int tcp4_gro_complete(struct sk_buff *skb) 2575int tcp4_gro_complete(struct sk_buff *skb)
2577{ 2576{
@@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2584 2583
2585 return tcp_gro_complete(skb); 2584 return tcp_gro_complete(skb);
2586} 2585}
2587EXPORT_SYMBOL(tcp4_gro_complete);
2588 2586
2589struct proto tcp_prot = { 2587struct proto tcp_prot = {
2590 .name = "TCP", 2588 .name = "TCP",
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..43cf901d7659 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
55 return 1; 55 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 56 if (after(end_seq, s_win) && before(seq, e_win))
57 return 1; 57 return 1;
58 return (seq == e_win && seq == end_seq); 58 return seq == e_win && seq == end_seq;
59} 59}
60 60
61/* 61/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..05b1ecf36763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 224 }
225 } 225 }
226 226
227 /* Set initial window to value enough for senders, 227 /* Set initial window to value enough for senders, following RFC5681. */
228 * following RFC2414. Senders, not following this RFC,
229 * will be satisfied with 2.
230 */
231 if (mss > (1 << *rcv_wscale)) { 228 if (mss > (1 << *rcv_wscale)) {
232 int init_cwnd = 4; 229 int init_cwnd = rfc3390_bytes_to_packets(mss);
233 if (mss > 1460 * 3) 230
234 init_cwnd = 2;
235 else if (mss > 1460)
236 init_cwnd = 3;
237 /* when initializing use the value from init_rcv_wnd 231 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above 232 * rather than the default from above
239 */ 233 */
@@ -1376,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1376 const struct sk_buff *skb, 1370 const struct sk_buff *skb,
1377 unsigned mss_now, int nonagle) 1371 unsigned mss_now, int nonagle)
1378{ 1372{
1379 return (skb->len < mss_now && 1373 return skb->len < mss_now &&
1380 ((nonagle & TCP_NAGLE_CORK) || 1374 ((nonagle & TCP_NAGLE_CORK) ||
1381 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1375 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1382} 1376}
1383 1377
1384/* Return non-zero if the Nagle test allows this packet to be 1378/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
1449 struct tcp_sock *tp = tcp_sk(sk); 1443 struct tcp_sock *tp = tcp_sk(sk);
1450 struct sk_buff *skb = tcp_send_head(sk); 1444 struct sk_buff *skb = tcp_send_head(sk);
1451 1445
1452 return (skb && 1446 return skb &&
1453 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1447 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1454 (tcp_skb_is_last(sk, skb) ? 1448 (tcp_skb_is_last(sk, skb) ?
1455 tp->nonagle : TCP_NAGLE_PUSH))); 1449 tp->nonagle : TCP_NAGLE_PUSH));
1456} 1450}
1457 1451
1458/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1452/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2429 __u8 rcv_wscale; 2423 __u8 rcv_wscale;
2430 /* Set this up on the first call only */ 2424 /* Set this up on the first call only */
2431 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2425 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2426
2427 /* limit the window selection if the user enforce a smaller rx buffer */
2428 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2429 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2430 req->window_clamp = tcp_full_space(sk);
2431
2432 /* tcp_full_space because it is guaranteed to be the first packet */ 2432 /* tcp_full_space because it is guaranteed to be the first packet */
2433 tcp_select_initial_window(tcp_full_space(sk), 2433 tcp_select_initial_window(tcp_full_space(sk),
2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
2555 2555
2556 tcp_initialize_rcv_mss(sk); 2556 tcp_initialize_rcv_mss(sk);
2557 2557
2558 /* limit the window selection if the user enforce a smaller rx buffer */
2559 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2560 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2561 tp->window_clamp = tcp_full_space(sk);
2562
2558 tcp_select_initial_window(tcp_full_space(sk), 2563 tcp_select_initial_window(tcp_full_space(sk),
2559 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2564 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2560 &tp->rcv_wnd, 2565 &tp->rcv_wnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74c54b30600f..f3c8c6c019ae 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
140 */ 140 */
141static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary, 142 unsigned int boundary,
143 unsigned int timeout,
143 bool syn_set) 144 bool syn_set)
144{ 145{
145 unsigned int timeout, linear_backoff_thresh; 146 unsigned int linear_backoff_thresh, start_ts;
146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; 147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
148 148
149 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
@@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
154 else 154 else
155 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
156 156
157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 157 if (likely(timeout == 0)) {
158 158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
159 if (boundary <= linear_backoff_thresh)
160 timeout = ((2 << boundary) - 1) * rto_base;
161 else
162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
164 159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
165 return (tcp_time_stamp - start_ts) >= timeout; 166 return (tcp_time_stamp - start_ts) >= timeout;
166} 167}
167 168
@@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1; 180 syn_set = 1;
180 } else { 181 } else {
181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
182 /* Black hole detection */ 183 /* Black hole detection */
183 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
184 185
@@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
191 192
192 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
193 do_reset = alive || 194 do_reset = alive ||
194 !retransmits_timed_out(sk, retry_until, 0); 195 !retransmits_timed_out(sk, retry_until, 0, 0);
195 196
196 if (tcp_out_of_resources(sk, do_reset)) 197 if (tcp_out_of_resources(sk, do_reset))
197 return 1; 198 return 1;
198 } 199 }
199 } 200 }
200 201
201 if (retransmits_timed_out(sk, retry_until, syn_set)) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
202 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
203 tcp_write_err(sk); 205 tcp_write_err(sk);
204 return 1; 206 return 1;
@@ -440,7 +442,7 @@ out_reset_timer:
440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 442 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
441 } 443 }
442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 444 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) 445 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
444 __sk_dst_reset(sk); 446 __sk_dst_reset(sk);
445 447
446out:; 448out:;
@@ -560,7 +562,14 @@ static void tcp_keepalive_timer (unsigned long data)
560 elapsed = keepalive_time_elapsed(tp); 562 elapsed = keepalive_time_elapsed(tp);
561 563
562 if (elapsed >= keepalive_time_when(tp)) { 564 if (elapsed >= keepalive_time_when(tp)) {
563 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 565 /* If the TCP_USER_TIMEOUT option is enabled, use that
566 * to determine when to timeout instead.
567 */
568 if ((icsk->icsk_user_timeout != 0 &&
569 elapsed >= icsk->icsk_user_timeout &&
570 icsk->icsk_probes_out > 0) ||
571 (icsk->icsk_user_timeout == 0 &&
572 icsk->icsk_probes_out >= keepalive_probes(tp))) {
564 tcp_send_active_reset(sk, GFP_ATOMIC); 573 tcp_send_active_reset(sk, GFP_ATOMIC);
565 tcp_write_err(sk); 574 tcp_write_err(sk);
566 goto out; 575 goto out;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..9a17bd2a0a37 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,8 +14,8 @@
14#include <net/protocol.h> 14#include <net/protocol.h>
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16 16
17static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel *tunnel4_handlers __read_mostly;
18static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel *tunnel64_handlers __read_mostly;
19static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
20 20
21static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
@@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
39 } 39 }
40 40
41 handler->next = *pprev; 41 handler->next = *pprev;
42 *pprev = handler; 42 rcu_assign_pointer(*pprev, handler);
43 43
44 ret = 0; 44 ret = 0;
45 45
@@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
73} 73}
74EXPORT_SYMBOL(xfrm4_tunnel_deregister); 74EXPORT_SYMBOL(xfrm4_tunnel_deregister);
75 75
76#define for_each_tunnel_rcu(head, handler) \
77 for (handler = rcu_dereference(head); \
78 handler != NULL; \
79 handler = rcu_dereference(handler->next)) \
80
76static int tunnel4_rcv(struct sk_buff *skb) 81static int tunnel4_rcv(struct sk_buff *skb)
77{ 82{
78 struct xfrm_tunnel *handler; 83 struct xfrm_tunnel *handler;
@@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
80 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 85 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
81 goto drop; 86 goto drop;
82 87
83 for (handler = tunnel4_handlers; handler; handler = handler->next) 88 for_each_tunnel_rcu(tunnel4_handlers, handler)
84 if (!handler->handler(skb)) 89 if (!handler->handler(skb))
85 return 0; 90 return 0;
86 91
@@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
99 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 104 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
100 goto drop; 105 goto drop;
101 106
102 for (handler = tunnel64_handlers; handler; handler = handler->next) 107 for_each_tunnel_rcu(tunnel64_handlers, handler)
103 if (!handler->handler(skb)) 108 if (!handler->handler(skb))
104 return 0; 109 return 0;
105 110
@@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
115{ 120{
116 struct xfrm_tunnel *handler; 121 struct xfrm_tunnel *handler;
117 122
118 for (handler = tunnel4_handlers; handler; handler = handler->next) 123 for_each_tunnel_rcu(tunnel4_handlers, handler)
119 if (!handler->err_handler(skb, info)) 124 if (!handler->err_handler(skb, info))
120 break; 125 break;
121} 126}
@@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
125{ 130{
126 struct xfrm_tunnel *handler; 131 struct xfrm_tunnel *handler;
127 132
128 for (handler = tunnel64_handlers; handler; handler = handler->next) 133 for_each_tunnel_rcu(tunnel64_handlers, handler)
129 if (!handler->err_handler(skb, info)) 134 if (!handler->err_handler(skb, info))
130 break; 135 break;
131} 136}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fb23c2e63b52..b3f7e8cf18ac 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 return -EOPNOTSUPP; 797 return -EOPNOTSUPP;
798 798
799 ipc.opt = NULL; 799 ipc.opt = NULL;
800 ipc.shtx.flags = 0; 800 ipc.tx_flags = 0;
801 801
802 if (up->pending) { 802 if (up->pending) {
803 /* 803 /*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
845 ipc.addr = inet->inet_saddr; 845 ipc.addr = inet->inet_saddr;
846 846
847 ipc.oif = sk->sk_bound_dev_if; 847 ipc.oif = sk->sk_bound_dev_if;
848 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 848 err = sock_tx_timestamp(sk, &ipc.tx_flags);
849 if (err) 849 if (err)
850 return err; 850 return err;
851 if (msg->msg_controllen) { 851 if (msg->msg_controllen) {
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,