Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                        |  10
-rw-r--r--  net/ipv4/af_inet.c                      |  84
-rw-r--r--  net/ipv4/arp.c                          |  51
-rw-r--r--  net/ipv4/datagram.c                     |   4
-rw-r--r--  net/ipv4/devinet.c                      |   1
-rw-r--r--  net/ipv4/fib_frontend.c                 |  13
-rw-r--r--  net/ipv4/icmp.c                         |  37
-rw-r--r--  net/ipv4/igmp.c                         |  32
-rw-r--r--  net/ipv4/inet_connection_sock.c         |  21
-rw-r--r--  net/ipv4/inet_fragment.c                |   1
-rw-r--r--  net/ipv4/inet_hashtables.c              |   4
-rw-r--r--  net/ipv4/inetpeer.c                     | 244
-rw-r--r--  net/ipv4/ip_forward.c                   |  10
-rw-r--r--  net/ipv4/ip_fragment.c                  |  27
-rw-r--r--  net/ipv4/ip_gre.c                       |  16
-rw-r--r--  net/ipv4/ip_input.c                     |  26
-rw-r--r--  net/ipv4/ip_output.c                    |  83
-rw-r--r--  net/ipv4/ip_sockglue.c                  |  45
-rw-r--r--  net/ipv4/ipconfig.c                     |   7
-rw-r--r--  net/ipv4/ipip.c                         |   8
-rw-r--r--  net/ipv4/ipmr.c                         |  22
-rw-r--r--  net/ipv4/netfilter.c                    |  12
-rw-r--r--  net/ipv4/netfilter/arp_tables.c         |   7
-rw-r--r--  net/ipv4/netfilter/ip_queue.c           |  57
-rw-r--r--  net/ipv4/netfilter/ip_tables.c          |   6
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c      |  48
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c            |  54
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c         |   6
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c         |   2
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c     |   5
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c        |   2
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c        |  10
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c  |  10
-rw-r--r--  net/ipv4/proc.c                         |  16
-rw-r--r--  net/ipv4/protocol.c                     |   3
-rw-r--r--  net/ipv4/raw.c                          |  22
-rw-r--r--  net/ipv4/route.c                        | 511
-rw-r--r--  net/ipv4/syncookies.c                   | 107
-rw-r--r--  net/ipv4/tcp.c                          |  66
-rw-r--r--  net/ipv4/tcp_hybla.c                    |   4
-rw-r--r--  net/ipv4/tcp_input.c                    |  24
-rw-r--r--  net/ipv4/tcp_ipv4.c                     | 182
-rw-r--r--  net/ipv4/tcp_minisocks.c                |   9
-rw-r--r--  net/ipv4/tcp_output.c                   |  80
-rw-r--r--  net/ipv4/tcp_timer.c                    |   1
-rw-r--r--  net/ipv4/tunnel4.c                      |   2
-rw-r--r--  net/ipv4/udp.c                          |  30
-rw-r--r--  net/ipv4/udplite.c                      |   3
-rw-r--r--  net/ipv4/xfrm4_input.c                  |   1
-rw-r--r--  net/ipv4/xfrm4_policy.c                 |   4
50 files changed, 1119 insertions(+), 911 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8e3a1fd938ab..7c3a7d191249 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -303,7 +303,7 @@ config ARPD
 	  If unsure, say N.
 
 config SYN_COOKIES
-	bool "IP: TCP syncookie support (disabled per default)"
+	bool "IP: TCP syncookie support"
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -328,13 +328,13 @@ config SYN_COOKIES
 	  server is really overloaded. If this happens frequently better turn
 	  them off.
 
-	  If you say Y here, note that SYN cookies aren't enabled by default;
-	  you can enable them by saying Y to "/proc file system support" and
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
 	  "Sysctl support" below and executing the command
 
-	  echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
 
-	  at boot time after the /proc file system has been mounted.
+	  after the /proc file system has been mounted.
 
 	  If unsure, say N.
 
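The help-text change tracks a flipped run-time default: once the option is built in, SYN cookies are now on, so the documented command turns them off rather than on. For completeness, the same toggle from C instead of the shell, a minimal userspace sketch (standard procfs path assumed; this is not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		/* equivalent to: echo 0 > /proc/sys/net/ipv4/tcp_syncookies */
		FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

		if (!f) {
			perror("tcp_syncookies");
			return 1;
		}
		fputs("0\n", f);	/* "0" disables, "1" re-enables */
		return fclose(f) ? 1 : 0;
	}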
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 551ce564b035..6a1100c25a9f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -355,6 +355,8 @@ lookup_protocol:
 	inet = inet_sk(sk);
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
+	inet->nodefrag = 0;
+
 	if (SOCK_RAW == sock->type) {
 		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
@@ -725,28 +727,31 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	sock_rps_record_flow(sk);
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
-			     size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+		      size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 
 	sock_rps_record_flow(sk);
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
 		return sk->sk_prot->sendpage(sk, page, offset, size, flags);
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
+EXPORT_SYMBOL(inet_sendpage);
 
 int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		 size_t size, int flags)
@@ -892,10 +897,10 @@ const struct proto_ops inet_stream_ops = {
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
-	.sendmsg	   = tcp_sendmsg,
+	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
-	.sendpage	   = tcp_sendpage,
+	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
@@ -1100,7 +1105,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	if (err)
 		return err;
 
-	sk_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->dst);
 
 	new_saddr = rt->rt_src;
 
@@ -1166,7 +1171,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
 	}
 	if (!err)
-		sk_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->dst);
 	else {
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
@@ -1425,13 +1430,49 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 res = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *bhptr, *userptr;
+		struct u64_stats_sync *syncp;
+		u64 v_bh, v_user;
+		unsigned int start;
+
+		/* first mib used by softirq context, we must use _bh() accessors */
+		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin_bh(syncp);
+			v_bh = *(((u64 *) bhptr) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, start));
+
+		/* second mib used in USER context */
+		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin(syncp);
+			v_user = *(((u64 *) userptr) + offt);
+		} while (u64_stats_fetch_retry(syncp, start));
+
+		res += v_bh + v_user;
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+	ptr[0] = __alloc_percpu(mibsize, align);
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+	ptr[1] = __alloc_percpu(mibsize, align);
 	if (!ptr[1])
 		goto err1;
 	return 0;
@@ -1488,25 +1529,32 @@ static const struct net_protocol icmp_protocol = {
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
 	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
-			  sizeof(struct tcp_mib)) < 0)
+			  sizeof(struct tcp_mib),
+			  __alignof__(struct tcp_mib)) < 0)
 		goto err_tcp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
-			  sizeof(struct ipstats_mib)) < 0)
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
-			  sizeof(struct linux_mib)) < 0)
+			  sizeof(struct linux_mib),
+			  __alignof__(struct linux_mib)) < 0)
 		goto err_net_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
-			  sizeof(struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
-			  sizeof(struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udplite_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
-			  sizeof(struct icmp_mib)) < 0)
+			  sizeof(struct icmp_mib),
+			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
-			  sizeof(struct icmpmsg_mib)) < 0)
+			  sizeof(struct icmpmsg_mib),
+			  __alignof__(struct icmpmsg_mib)) < 0)
 		goto err_icmpmsg_mib;
 
 	tcp_mib_init(net);
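The new snmp_fold_field64() is the reader half of the u64_stats_sync scheme: on 32-bit hosts a seqcount retry loop keeps the two halves of each 64-bit counter consistent. The matching writer side looks like the following sketch, using a hypothetical counter struct rather than the real SNMP update macros:

	#include <linux/types.h>
	#include <linux/u64_stats_sync.h>

	struct demo_stats {
		u64			bytes;
		struct u64_stats_sync	syncp;
	};

	/* Writer: bump the seqcount around the 64-bit update so a 32-bit
	 * reader (like snmp_fold_field64) can detect and retry a torn read.
	 * On 64-bit kernels begin/end compile away to nothing. */
	static void demo_stats_add(struct demo_stats *s, unsigned int len)
	{
		u64_stats_update_begin(&s->syncp);
		s->bytes += len;
		u64_stats_update_end(&s->syncp);
	}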
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f094b75810db..96c1955b3e2f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -116,6 +116,7 @@
116#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 116#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
117#include <net/atmclip.h> 117#include <net/atmclip.h>
118struct neigh_table *clip_tbl_hook; 118struct neigh_table *clip_tbl_hook;
119EXPORT_SYMBOL(clip_tbl_hook);
119#endif 120#endif
120 121
121#include <asm/system.h> 122#include <asm/system.h>
@@ -169,6 +170,7 @@ const struct neigh_ops arp_broken_ops = {
169 .hh_output = dev_queue_xmit, 170 .hh_output = dev_queue_xmit,
170 .queue_xmit = dev_queue_xmit, 171 .queue_xmit = dev_queue_xmit,
171}; 172};
173EXPORT_SYMBOL(arp_broken_ops);
172 174
173struct neigh_table arp_tbl = { 175struct neigh_table arp_tbl = {
174 .family = AF_INET, 176 .family = AF_INET,
@@ -198,6 +200,7 @@ struct neigh_table arp_tbl = {
198 .gc_thresh2 = 512, 200 .gc_thresh2 = 512,
199 .gc_thresh3 = 1024, 201 .gc_thresh3 = 1024,
200}; 202};
203EXPORT_SYMBOL(arp_tbl);
201 204
202int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) 205int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
203{ 206{
@@ -333,11 +336,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
333 struct net_device *dev = neigh->dev; 336 struct net_device *dev = neigh->dev;
334 __be32 target = *(__be32*)neigh->primary_key; 337 __be32 target = *(__be32*)neigh->primary_key;
335 int probes = atomic_read(&neigh->probes); 338 int probes = atomic_read(&neigh->probes);
336 struct in_device *in_dev = in_dev_get(dev); 339 struct in_device *in_dev;
337 340
338 if (!in_dev) 341 rcu_read_lock();
342 in_dev = __in_dev_get_rcu(dev);
343 if (!in_dev) {
344 rcu_read_unlock();
339 return; 345 return;
340 346 }
341 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { 347 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
342 default: 348 default:
343 case 0: /* By default announce any local IP */ 349 case 0: /* By default announce any local IP */
@@ -358,9 +364,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
358 case 2: /* Avoid secondary IPs, get a primary/preferred one */ 364 case 2: /* Avoid secondary IPs, get a primary/preferred one */
359 break; 365 break;
360 } 366 }
367 rcu_read_unlock();
361 368
362 if (in_dev)
363 in_dev_put(in_dev);
364 if (!saddr) 369 if (!saddr)
365 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); 370 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
366 371
@@ -427,7 +432,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
427 432
428 if (ip_route_output_key(net, &rt, &fl) < 0) 433 if (ip_route_output_key(net, &rt, &fl) < 0)
429 return 1; 434 return 1;
430 if (rt->u.dst.dev != dev) { 435 if (rt->dst.dev != dev) {
431 NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); 436 NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
432 flag = 1; 437 flag = 1;
433 } 438 }
@@ -497,6 +502,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
497 kfree_skb(skb); 502 kfree_skb(skb);
498 return 1; 503 return 1;
499} 504}
505EXPORT_SYMBOL(arp_find);
500 506
501/* END OF OBSOLETE FUNCTIONS */ 507/* END OF OBSOLETE FUNCTIONS */
502 508
@@ -532,7 +538,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
532 struct in_device *out_dev; 538 struct in_device *out_dev;
533 int imi, omi = -1; 539 int imi, omi = -1;
534 540
535 if (rt->u.dst.dev == dev) 541 if (rt->dst.dev == dev)
536 return 0; 542 return 0;
537 543
538 if (!IN_DEV_PROXY_ARP(in_dev)) 544 if (!IN_DEV_PROXY_ARP(in_dev))
@@ -545,10 +551,10 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
545 551
546 /* place to check for proxy_arp for routes */ 552 /* place to check for proxy_arp for routes */
547 553
548 if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { 554 out_dev = __in_dev_get_rcu(rt->dst.dev);
555 if (out_dev)
549 omi = IN_DEV_MEDIUM_ID(out_dev); 556 omi = IN_DEV_MEDIUM_ID(out_dev);
550 in_dev_put(out_dev); 557
551 }
552 return (omi != imi && omi != -1); 558 return (omi != imi && omi != -1);
553} 559}
554 560
@@ -576,7 +582,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
576 __be32 sip, __be32 tip) 582 __be32 sip, __be32 tip)
577{ 583{
578 /* Private VLAN is only concerned about the same ethernet segment */ 584 /* Private VLAN is only concerned about the same ethernet segment */
579 if (rt->u.dst.dev != dev) 585 if (rt->dst.dev != dev)
580 return 0; 586 return 0;
581 587
582 /* Don't reply on self probes (often done by windowz boxes)*/ 588 /* Don't reply on self probes (often done by windowz boxes)*/
@@ -698,6 +704,7 @@ out:
698 kfree_skb(skb); 704 kfree_skb(skb);
699 return NULL; 705 return NULL;
700} 706}
707EXPORT_SYMBOL(arp_create);
701 708
702/* 709/*
703 * Send an arp packet. 710 * Send an arp packet.
@@ -707,6 +714,7 @@ void arp_xmit(struct sk_buff *skb)
707 /* Send it off, maybe filter it using firewalling first. */ 714 /* Send it off, maybe filter it using firewalling first. */
708 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); 715 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
709} 716}
717EXPORT_SYMBOL(arp_xmit);
710 718
711/* 719/*
712 * Create and send an arp packet. 720 * Create and send an arp packet.
@@ -733,6 +741,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
733 741
734 arp_xmit(skb); 742 arp_xmit(skb);
735} 743}
744EXPORT_SYMBOL(arp_send);
736 745
737/* 746/*
738 * Process an arp request. 747 * Process an arp request.
@@ -741,7 +750,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
741static int arp_process(struct sk_buff *skb) 750static int arp_process(struct sk_buff *skb)
742{ 751{
743 struct net_device *dev = skb->dev; 752 struct net_device *dev = skb->dev;
744 struct in_device *in_dev = in_dev_get(dev); 753 struct in_device *in_dev = __in_dev_get_rcu(dev);
745 struct arphdr *arp; 754 struct arphdr *arp;
746 unsigned char *arp_ptr; 755 unsigned char *arp_ptr;
747 struct rtable *rt; 756 struct rtable *rt;
@@ -890,7 +899,6 @@ static int arp_process(struct sk_buff *skb)
890 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 899 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
891 } else { 900 } else {
892 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 901 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
893 in_dev_put(in_dev);
894 return 0; 902 return 0;
895 } 903 }
896 goto out; 904 goto out;
@@ -936,8 +944,6 @@ static int arp_process(struct sk_buff *skb)
936 } 944 }
937 945
938out: 946out:
939 if (in_dev)
940 in_dev_put(in_dev);
941 consume_skb(skb); 947 consume_skb(skb);
942 return 0; 948 return 0;
943} 949}
@@ -1045,7 +1051,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1045 struct rtable * rt; 1051 struct rtable * rt;
1046 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1052 if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
1047 return err; 1053 return err;
1048 dev = rt->u.dst.dev; 1054 dev = rt->dst.dev;
1049 ip_rt_put(rt); 1055 ip_rt_put(rt);
1050 if (!dev) 1056 if (!dev)
1051 return -EINVAL; 1057 return -EINVAL;
@@ -1152,7 +1158,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
1152 struct rtable * rt; 1158 struct rtable * rt;
1153 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1159 if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
1154 return err; 1160 return err;
1155 dev = rt->u.dst.dev; 1161 dev = rt->dst.dev;
1156 ip_rt_put(rt); 1162 ip_rt_put(rt);
1157 if (!dev) 1163 if (!dev)
1158 return -EINVAL; 1164 return -EINVAL;
@@ -1453,14 +1459,3 @@ static int __init arp_proc_init(void)
1453} 1459}
1454 1460
1455#endif /* CONFIG_PROC_FS */ 1461#endif /* CONFIG_PROC_FS */
1456
1457EXPORT_SYMBOL(arp_broken_ops);
1458EXPORT_SYMBOL(arp_find);
1459EXPORT_SYMBOL(arp_create);
1460EXPORT_SYMBOL(arp_xmit);
1461EXPORT_SYMBOL(arp_send);
1462EXPORT_SYMBOL(arp_tbl);
1463
1464#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1465EXPORT_SYMBOL(clip_tbl_hook);
1466#endif
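Several of the arp.c hunks above trade in_dev_get()/in_dev_put() reference counting for RCU. Pulled out of the diff, the new pattern is simply this (hypothetical function name, same calls as the patch):

	#include <linux/inetdevice.h>
	#include <linux/netdevice.h>
	#include <linux/rcupdate.h>

	static int demo_dev_forwards(struct net_device *dev)
	{
		struct in_device *in_dev;
		int ret = 0;

		rcu_read_lock();
		in_dev = __in_dev_get_rcu(dev);	/* no refcount taken */
		if (in_dev)
			ret = IN_DEV_FORWARD(in_dev);
		rcu_read_unlock();	/* in_dev must not be used past here */
		return ret;
	}

The dereference is only legal inside the read-side section, which is why arp_solicit() now unlocks only after the IN_DEV_ARP_ANNOUNCE switch is done with in_dev.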
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index fb2465811b48..f0550941df7b 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -69,9 +69,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk->sk_state = TCP_ESTABLISHED;
 	inet->inet_id = jiffies;
 
-	sk_dst_set(sk, &rt->u.dst);
+	sk_dst_set(sk, &rt->dst);
 	return(0);
 }
-
 EXPORT_SYMBOL(ip4_datagram_connect);
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 382bc768ed56..da14c49284f4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1081,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
 		/* Send gratuitous ARP to notify of link change */
 		if (IN_DEV_ARP_NOTIFY(in_dev)) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4f0ed458c883..a43968918350 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -175,6 +175,7 @@ out:
 	fib_res_put(&res);
 	return dev;
 }
+EXPORT_SYMBOL(ip_dev_find);
 
 /*
  * Find address type as if only "dev" was present in the system. If
@@ -214,12 +215,14 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
 	return __inet_dev_addr_type(net, NULL, addr);
 }
+EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
 	return __inet_dev_addr_type(net, dev, addr);
 }
+EXPORT_SYMBOL(inet_dev_addr_type);
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
    - (main) check, that source is valid i.e. not broadcast or our local
@@ -284,7 +287,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
-		goto e_inval;
+		goto e_rpf;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
@@ -299,7 +302,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 
 last_resort:
 	if (rpf)
-		goto e_inval;
+		goto e_rpf;
 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;
@@ -308,6 +311,8 @@ e_inval_res:
 	fib_res_put(&res);
 e_inval:
 	return -EINVAL;
+e_rpf:
+	return -EXDEV;
 }
 
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -1075,7 +1080,3 @@ void __init ip_fib_init(void)
 
 	fib_hash_init();
 }
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(inet_dev_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
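fib_validate_source() now fails with -EXDEV instead of -EINVAL when only the reverse-path filter rejects a source, presumably so callers can account the two cases separately. A self-contained userspace illustration of why the distinct code matters (hypothetical names, not the kernel's actual caller):

	#include <errno.h>
	#include <stdio.h>

	/* stand-in for fib_validate_source(): rp_filter failures now get
	 * their own error code instead of being folded into -EINVAL */
	static int demo_validate_source(int rp_filter_fails)
	{
		return rp_filter_fails ? -EXDEV : 0;
	}

	int main(void)
	{
		int err = demo_validate_source(1);

		if (err == -EXDEV)
			printf("dropped by rp_filter\n"); /* now countable */
		else if (err == -EINVAL)
			printf("martian source\n");
		return 0;
	}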
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d65e9215bcd7..a0d847c7cba5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -181,6 +181,7 @@ const struct icmp_err icmp_err_convert[] = {
 		.fatal = 1,
 	},
 };
+EXPORT_SYMBOL(icmp_err_convert);
 
 /*
  *	ICMP control array. This specifies what to do with each ICMP.
@@ -267,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
 		dst->rate_tokens = token;
 	return rc;
 }
+EXPORT_SYMBOL(xrlim_allow);
 
 static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 		int type, int code)
 {
-	struct dst_entry *dst = &rt->u.dst;
+	struct dst_entry *dst = &rt->dst;
 	int rc = 1;
 
 	if (type > NR_ICMP_TYPES)
@@ -327,7 +329,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
 	struct sock *sk;
 	struct sk_buff *skb;
 
-	sk = icmp_sk(dev_net((*rt)->u.dst.dev));
+	sk = icmp_sk(dev_net((*rt)->dst.dev));
 	if (ip_append_data(sk, icmp_glue_bits, icmp_param,
 			   icmp_param->data_len+icmp_param->head_len,
 			   icmp_param->head_len,
@@ -359,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 {
 	struct ipcm_cookie ipc;
 	struct rtable *rt = skb_rtable(skb);
-	struct net *net = dev_net(rt->u.dst.dev);
+	struct net *net = dev_net(rt->dst.dev);
 	struct sock *sk;
 	struct inet_sock *inet;
 	__be32 daddr;
@@ -427,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	if (!rt)
 		goto out;
-	net = dev_net(rt->u.dst.dev);
+	net = dev_net(rt->dst.dev);
 
 	/*
 	 *	Find the original header. It is expected to be valid, of course.
@@ -596,9 +598,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			/* Ugh! */
 			orefdst = skb_in->_skb_refdst; /* save old refdst */
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
-					     RT_TOS(tos), rt2->u.dst.dev);
+					     RT_TOS(tos), rt2->dst.dev);
 
-			dst_release(&rt2->u.dst);
+			dst_release(&rt2->dst);
 			rt2 = skb_rtable(skb_in);
 			skb_in->_skb_refdst = orefdst; /* restore old refdst */
 		}
@@ -610,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					  XFRM_LOOKUP_ICMP);
 		switch (err) {
 		case 0:
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 			rt = rt2;
 			break;
 		case -EPERM:
@@ -629,7 +631,7 @@ route_done:
 
 	/* RFC says return as much as we can without exceeding 576 bytes. */
 
-	room = dst_mtu(&rt->u.dst);
+	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
 	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
@@ -647,6 +649,7 @@ out_unlock:
 	icmp_xmit_unlock(sk);
 out:;
 }
+EXPORT_SYMBOL(icmp_send);
 
 
 /*
@@ -925,6 +928,7 @@ static void icmp_address(struct sk_buff *skb)
 /*
  * RFC1812 (4.3.3.9).	A router SHOULD listen all replies, and complain
  *			loudly if an inconsistency is found.
+ * called with rcu_read_lock()
  */
 
 static void icmp_address_reply(struct sk_buff *skb)
@@ -935,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb)
 	struct in_ifaddr *ifa;
 
 	if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
-		goto out;
+		return;
 
-	in_dev = in_dev_get(dev);
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
-		goto out;
-	rcu_read_lock();
+		return;
+
 	if (in_dev->ifa_list &&
 	    IN_DEV_LOG_MARTIANS(in_dev) &&
 	    IN_DEV_FORWARD(in_dev)) {
@@ -958,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb)
 			       mp, dev->name, &rt->rt_src);
 		}
 	}
-	rcu_read_unlock();
-	in_dev_put(in_dev);
-out:;
 }
 
 static void icmp_discard(struct sk_buff *skb)
@@ -974,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb)
 {
 	struct icmphdr *icmph;
 	struct rtable *rt = skb_rtable(skb);
-	struct net *net = dev_net(rt->u.dst.dev);
+	struct net *net = dev_net(rt->dst.dev);
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		struct sec_path *sp = skb_sec_path(skb);
@@ -1216,7 +1217,3 @@ int __init icmp_init(void)
 {
 	return register_pernet_subsys(&icmp_sk_ops);
 }
-
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5fff865a4fa7..a1ad0e7180d2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		return NULL;
 	}
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	skb->dev = dev;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	pip->saddr    = rt->rt_src;
 	pip->protocol = IPPROTO_IGMP;
 	pip->tot_len  = 0;	/* filled in later */
-	ip_select_ident(pip, &rt->u.dst, NULL);
+	ip_select_ident(pip, &rt->dst, NULL);
 	((u8*)&pip[1])[0] = IPOPT_RA;
 	((u8*)&pip[1])[1] = 4;
 	((u8*)&pip[1])[2] = 0;
@@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		return -1;
 	}
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 
@@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	iph->daddr    = dst;
 	iph->saddr    = rt->rt_src;
 	iph->protocol = IPPROTO_IGMP;
-	ip_select_ident(iph, &rt->u.dst, NULL);
+	ip_select_ident(iph, &rt->dst, NULL);
 	((u8*)&iph[1])[0] = IPOPT_RA;
 	((u8*)&iph[1])[1] = 4;
 	((u8*)&iph[1])[2] = 0;
@@ -916,18 +916,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	read_unlock(&in_dev->mc_list_lock);
 }
 
+/* called in rcu_read_lock() section */
 int igmp_rcv(struct sk_buff *skb)
 {
 	/* This basically follows the spec line by line -- see RFC1112 */
 	struct igmphdr *ih;
-	struct in_device *in_dev = in_dev_get(skb->dev);
+	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
 
 	if (in_dev == NULL)
 		goto drop;
 
 	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
-		goto drop_ref;
+		goto drop;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
@@ -937,7 +938,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_NONE:
 		skb->csum = 0;
 		if (__skb_checksum_complete(skb))
-			goto drop_ref;
+			goto drop;
 	}
 
 	ih = igmp_hdr(skb);
@@ -957,7 +958,6 @@ int igmp_rcv(struct sk_buff *skb)
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
-		in_dev_put(in_dev);
 		return pim_rcv_v1(skb);
 #endif
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
@@ -971,8 +971,6 @@ int igmp_rcv(struct sk_buff *skb)
 		break;
 	}
 
-drop_ref:
-	in_dev_put(in_dev);
 drop:
 	kfree_skb(skb);
 	return 0;
@@ -1246,6 +1244,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 out:
 	return;
 }
+EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
  *	Resend IGMP JOIN report; used for bonding.
@@ -1268,6 +1267,7 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
 	igmp_ifc_event(in_dev);
 #endif
 }
+EXPORT_SYMBOL(ip_mc_rejoin_group);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -1298,6 +1298,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 		}
 	}
 }
+EXPORT_SYMBOL(ip_mc_dec_group);
 
 /* Device changing type */
 
@@ -1427,7 +1428,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 	}
 
 	if (!dev && !ip_route_output_key(net, &rt, &fl)) {
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 	}
 	if (dev) {
@@ -1646,8 +1647,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
 				if (dpsf->sf_inaddr == psf->sf_inaddr)
 					break;
 			if (!dpsf) {
-				dpsf = (struct ip_sf_list *)
-					kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
 				if (!dpsf)
 					continue;
 				*dpsf = *psf;
@@ -1807,6 +1807,7 @@ done:
 	rtnl_unlock();
 	return err;
 }
+EXPORT_SYMBOL(ip_mc_join_group);
 
 static void ip_sf_socklist_reclaim(struct rcu_head *rp)
 {
@@ -2679,8 +2680,3 @@ int __init igmp_mc_proc_init(void)
 	return register_pernet_subsys(&igmp_net_ops);
 }
 #endif
-
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
-EXPORT_SYMBOL(ip_mc_rejoin_group);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 70eb3507c406..7174370b1195 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -84,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 	}
 	return node != NULL;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
@@ -212,7 +211,6 @@ fail:
 	local_bh_enable();
 	return ret;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
 
 /*
@@ -305,7 +303,6 @@ out_err:
 	*err = error;
 	goto out;
 }
-
 EXPORT_SYMBOL(inet_csk_accept);
 
 /*
@@ -327,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
 	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
-
 EXPORT_SYMBOL(inet_csk_init_xmit_timers);
 
 void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -340,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 }
-
 EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
 
 void inet_csk_delete_keepalive_timer(struct sock *sk)
 {
 	sk_stop_timer(sk, &sk->sk_timer);
 }
-
 EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
 
 void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
 {
 	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
 }
-
 EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -383,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 		goto no_route;
 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto route_err;
-	return &rt->u.dst;
+	return &rt->dst;
 
 route_err:
 	ip_rt_put(rt);
@@ -391,7 +384,6 @@ no_route:
 	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 	return NULL;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_route_req);
 
 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -433,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
 
 	return req;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_search_req);
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -447,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
 	inet_csk_reqsk_queue_added(sk, timeout);
 }
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
 /* Only thing we need from tcp.h */
 extern int sysctl_tcp_synack_retries;
 
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
@@ -569,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 	if (lopt->qlen)
 		inet_csk_reset_keepalive_timer(parent, interval);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -599,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 	}
 	return newsk;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_clone);
 
 /*
@@ -630,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk)
 	percpu_counter_dec(sk->sk_prot->orphan_count);
 	sock_put(sk);
 }
-
 EXPORT_SYMBOL(inet_csk_destroy_sock);
 
 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -665,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
 	return -EADDRINUSE;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
 /*
@@ -720,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk)
 	}
 	WARN_ON(sk->sk_ack_backlog);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
 
 void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -732,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 	sin->sin_addr.s_addr	= inet->inet_daddr;
 	sin->sin_port		= inet->inet_dport;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
 
 #ifdef CONFIG_COMPAT
@@ -747,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
 					     optval, optlen);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
 
 int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
@@ -761,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
 					     optval, optlen);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 #endif
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a2ca6aed763b..5ff2a51b6d0c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -114,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 		fq->last_in |= INET_FRAG_COMPLETE;
 	}
 }
-
 EXPORT_SYMBOL(inet_frag_kill);
 
 static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d3e160a88219..fb7ad5a21ff3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -99,7 +99,6 @@ void inet_put_port(struct sock *sk)
 	__inet_put_port(sk);
 	local_bh_enable();
 }
-
 EXPORT_SYMBOL(inet_put_port);
 
 void __inet_inherit_port(struct sock *sk, struct sock *child)
@@ -116,7 +115,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 	inet_csk(child)->icsk_bind_hash = tb;
 	spin_unlock(&head->lock);
 }
-
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
 static inline int compute_score(struct sock *sk, struct net *net,
@@ -546,7 +544,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
 			__inet_check_established, __inet_hash_nolisten);
 }
-
 EXPORT_SYMBOL_GPL(inet_hash_connect);
 
 void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -560,5 +557,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 				      i + LISTENING_NULLS_BASE);
 	}
 }
-
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6bcfe52a9c87..9ffa24b9a804 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
  *		lookups performed with disabled BHs.
  *
  *  Serialisation issues.
- *  1.  Nodes may appear in the tree only with the pool write lock held.
- *  2.  Nodes may disappear from the tree only with the pool write lock held
+ *  1.  Nodes may appear in the tree only with the pool lock held.
+ *  2.  Nodes may disappear from the tree only with the pool lock held
  *      AND reference count being 0.
  *  3.  Nodes appears and disappears from unused node list only under
  *      "inet_peer_unused_lock".
@@ -64,23 +64,31 @@
  *		usually under some other lock to prevent node disappearing
  *		dtime: unused node list lock
  *		v4daddr: unchangeable
- *		ip_id_count: idlock
+ *		ip_id_count: atomic value (no lock needed)
  */
 
 static struct kmem_cache *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
-	.avl_left	= &peer_fake_node,
-	.avl_right	= &peer_fake_node,
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+	.avl_left	= peer_avl_empty,
+	.avl_right	= peer_avl_empty,
 	.avl_height	= 0
 };
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
+
+static struct {
+	struct inet_peer *root;
+	spinlock_t	lock;
+	int		total;
+} peers = {
+	.root		= peer_avl_empty,
+	.lock		= __SPIN_LOCK_UNLOCKED(peers.lock),
+	.total		= 0,
+};
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
-static int peer_total;
 /* Exported for sysctl_net_ipv4.  */
 int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
 					 * aggressively at this stage */
@@ -89,8 +97,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min
 int inet_peer_gc_mintime __read_mostly = 10 * HZ;
 int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
 
-static LIST_HEAD(unused_peers);
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
+static struct {
+	struct list_head	list;
+	spinlock_t		lock;
+} unused_peers = {
+	.list			= LIST_HEAD_INIT(unused_peers.list),
+	.lock			= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
+};
 
 static void peer_check_expire(unsigned long dummy);
 static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -116,7 +129,7 @@ void __init inet_initpeers(void)
 
 	peer_cachep = kmem_cache_create("inet_peer_cache",
 			sizeof(struct inet_peer),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
 	/* All the timers, started at system startup tend
@@ -131,38 +144,69 @@ void __init inet_initpeers(void)
 /* Called with or without local BH being disabled. */
 static void unlink_from_unused(struct inet_peer *p)
 {
-	spin_lock_bh(&inet_peer_unused_lock);
-	list_del_init(&p->unused);
-	spin_unlock_bh(&inet_peer_unused_lock);
+	if (!list_empty(&p->unused)) {
+		spin_lock_bh(&unused_peers.lock);
+		list_del_init(&p->unused);
+		spin_unlock_bh(&unused_peers.lock);
+	}
 }
 
 /*
  * Called with local BH disabled and the pool lock held.
- * _stack is known to be NULL or not at compile time,
- * so compiler will optimize the if (_stack) tests.
  */
 #define lookup(_daddr, _stack)					\
 ({								\
 	struct inet_peer *u, **v;				\
-	if (_stack != NULL) {					\
-		stackptr = _stack;				\
-		*stackptr++ = &peer_root;			\
-	}							\
-	for (u = peer_root; u != peer_avl_empty; ) {		\
+								\
+	stackptr = _stack;					\
+	*stackptr++ = &peers.root;				\
+	for (u = peers.root; u != peer_avl_empty; ) {		\
 		if (_daddr == u->v4daddr)			\
 			break;					\
 		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
 			v = &u->avl_left;			\
 		else						\
 			v = &u->avl_right;			\
-		if (_stack != NULL)				\
-			*stackptr++ = v;			\
+		*stackptr++ = v;				\
 		u = *v;						\
 	}							\
 	u;							\
 })
 
-/* Called with local BH disabled and the pool write lock held. */
+/*
+ * Called with rcu_read_lock_bh()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu_bh(__be32 daddr)
+{
+	struct inet_peer *u = rcu_dereference_bh(peers.root);
+	int count = 0;
+
+	while (u != peer_avl_empty) {
+		if (daddr == u->v4daddr) {
+			/* Before taking a reference, check if this entry was
+			 * deleted, unlink_from_pool() sets refcnt=-1 to make
+			 * distinction between an unused entry (refcnt=0) and
+			 * a freed one.
+			 */
+			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+				u = NULL;
+			return u;
+		}
+		if ((__force __u32)daddr < (__force __u32)u->v4daddr)
+			u = rcu_dereference_bh(u->avl_left);
+		else
+			u = rcu_dereference_bh(u->avl_right);
+		if (unlikely(++count == PEER_MAXDEPTH))
+			break;
+	}
+	return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
 #define lookup_rightempty(start)				\
 ({								\
 	struct inet_peer *u, **v;				\
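The refcount convention that makes lookup_rcu_bh() safe, restated as a standalone sketch (hypothetical helper names): a deleted node is marked with refcnt == -1, so a lockless reader may only take a reference while the count is still non-negative, and the deleter claims the last reference atomically.

	#include <linux/atomic.h>
	#include <linux/types.h>

	/* reader: increment, unless the node is already marked deleted (-1) */
	static bool demo_peer_tryget(atomic_t *refcnt)
	{
		return atomic_add_unless(refcnt, 1, -1) != 0;
	}

	/* deleter: 1 -> -1 succeeds only while we hold the sole reference */
	static bool demo_peer_trydelete(atomic_t *refcnt)
	{
		return atomic_cmpxchg(refcnt, 1, -1) == 1;
	}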
@@ -176,9 +220,10 @@ static void unlink_from_unused(struct inet_peer *p)
176 u; \ 220 u; \
177}) 221})
178 222
179/* Called with local BH disabled and the pool write lock held. 223/* Called with local BH disabled and the pool lock held.
180 * Variable names are the proof of operation correctness. 224 * Variable names are the proof of operation correctness.
181 * Look into mm/map_avl.c for more detail description of the ideas. */ 225 * Look into mm/map_avl.c for more detail description of the ideas.
226 */
182static void peer_avl_rebalance(struct inet_peer **stack[], 227static void peer_avl_rebalance(struct inet_peer **stack[],
183 struct inet_peer ***stackend) 228 struct inet_peer ***stackend)
184{ 229{
@@ -254,15 +299,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
254 } 299 }
255} 300}
256 301
257/* Called with local BH disabled and the pool write lock held. */ 302/* Called with local BH disabled and the pool lock held. */
258#define link_to_pool(n) \ 303#define link_to_pool(n) \
259do { \ 304do { \
260 n->avl_height = 1; \ 305 n->avl_height = 1; \
261 n->avl_left = peer_avl_empty; \ 306 n->avl_left = peer_avl_empty; \
262 n->avl_right = peer_avl_empty; \ 307 n->avl_right = peer_avl_empty; \
308 smp_wmb(); /* lockless readers can catch us now */ \
263 **--stackptr = n; \ 309 **--stackptr = n; \
264 peer_avl_rebalance(stack, stackptr); \ 310 peer_avl_rebalance(stack, stackptr); \
265} while(0) 311} while (0)
312
313static void inetpeer_free_rcu(struct rcu_head *head)
314{
315 kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
316}
266 317
267/* May be called with local BH enabled. */ 318/* May be called with local BH enabled. */
268static void unlink_from_pool(struct inet_peer *p) 319static void unlink_from_pool(struct inet_peer *p)
@@ -271,13 +322,14 @@ static void unlink_from_pool(struct inet_peer *p)
271 322
272 do_free = 0; 323 do_free = 0;
273 324
274 write_lock_bh(&peer_pool_lock); 325 spin_lock_bh(&peers.lock);
275 /* Check the reference counter. It was artificially incremented by 1 326 /* Check the reference counter. It was artificially incremented by 1
276 * in cleanup() function to prevent sudden disappearing. If the 327 * in cleanup() function to prevent sudden disappearing. If we can
277 * reference count is still 1 then the node is referenced only as `p' 328 * atomically (because of lockless readers) take this last reference,
278 * here and from the pool. So under the exclusive pool lock it's safe 329 * it's safe to remove the node and free it later.
279 * to remove the node and free it later. */ 330 * We use refcnt=-1 to alert lockless readers this entry is deleted.
280 if (atomic_read(&p->refcnt) == 1) { 331 */
332 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
281 struct inet_peer **stack[PEER_MAXDEPTH]; 333 struct inet_peer **stack[PEER_MAXDEPTH];
282 struct inet_peer ***stackptr, ***delp; 334 struct inet_peer ***stackptr, ***delp;
283 if (lookup(p->v4daddr, stack) != p) 335 if (lookup(p->v4daddr, stack) != p)
@@ -303,20 +355,21 @@ static void unlink_from_pool(struct inet_peer *p)
303 delp[1] = &t->avl_left; /* was &p->avl_left */ 355 delp[1] = &t->avl_left; /* was &p->avl_left */
304 } 356 }
305 peer_avl_rebalance(stack, stackptr); 357 peer_avl_rebalance(stack, stackptr);
306 peer_total--; 358 peers.total--;
307 do_free = 1; 359 do_free = 1;
308 } 360 }
309 write_unlock_bh(&peer_pool_lock); 361 spin_unlock_bh(&peers.lock);
310 362
311 if (do_free) 363 if (do_free)
312 kmem_cache_free(peer_cachep, p); 364 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
313 else 365 else
314 /* The node is used again. Decrease the reference counter 366 /* The node is used again. Decrease the reference counter
315 * back. The loop "cleanup -> unlink_from_unused 367 * back. The loop "cleanup -> unlink_from_unused
316 * -> unlink_from_pool -> putpeer -> link_to_unused 368 * -> unlink_from_pool -> putpeer -> link_to_unused
317 * -> cleanup (for the same node)" 369 * -> cleanup (for the same node)"
318 * doesn't really exist because the entry will have a 370 * doesn't really exist because the entry will have a
319 * recent deletion time and will not be cleaned again soon. */ 371 * recent deletion time and will not be cleaned again soon.
372 */
320 inet_putpeer(p); 373 inet_putpeer(p);
321} 374}
322 375
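The cmpxchg above closes a race that the old check-then-remove sequence would leave open once readers no longer take the pool lock: a lockless reader could grab a reference between `atomic_read(&p->refcnt) == 1` and the unlink. Moving the counter from 1 straight to -1 in one atomic step makes deletion and "last reference" the same event, and the -1 parks the object so no reader can resurrect it. A minimal userspace sketch of the pattern with C11 atomics (the `obj_*` names are illustrative, not from the kernel code):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
        atomic_int refcnt;          /* 1 == referenced only by the tree */
    };

    /* Deleter: succeed only by atomically taking the last reference;
     * refcnt == -1 then tells lockless readers the object is dead. */
    static bool obj_try_delete(struct obj *p)
    {
        int expected = 1;
        return atomic_compare_exchange_strong(&p->refcnt, &expected, -1);
    }

    /* Reader: take a reference only while the object is alive. */
    static bool obj_tryget(struct obj *p)
    {
        int c = atomic_load(&p->refcnt);

        while (c > 0)
            if (atomic_compare_exchange_weak(&p->refcnt, &c, c + 1))
                return true;        /* on CAS failure, c is reloaded */
        return false;               /* the deleter already won */
    }

The reader side is the shape the kernel gets from atomic_add_unless(): once the deleter has parked the counter at -1, no increment can ever succeed again.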
@@ -326,16 +379,16 @@ static int cleanup_once(unsigned long ttl)
326 struct inet_peer *p = NULL; 379 struct inet_peer *p = NULL;
327 380
328 /* Remove the first entry from the list of unused nodes. */ 381 /* Remove the first entry from the list of unused nodes. */
329 spin_lock_bh(&inet_peer_unused_lock); 382 spin_lock_bh(&unused_peers.lock);
330 if (!list_empty(&unused_peers)) { 383 if (!list_empty(&unused_peers.list)) {
331 __u32 delta; 384 __u32 delta;
332 385
333 p = list_first_entry(&unused_peers, struct inet_peer, unused); 386 p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
334 delta = (__u32)jiffies - p->dtime; 387 delta = (__u32)jiffies - p->dtime;
335 388
336 if (delta < ttl) { 389 if (delta < ttl) {
337 /* Do not prune fresh entries. */ 390 /* Do not prune fresh entries. */
338 spin_unlock_bh(&inet_peer_unused_lock); 391 spin_unlock_bh(&unused_peers.lock);
339 return -1; 392 return -1;
340 } 393 }
341 394
@@ -345,7 +398,7 @@ static int cleanup_once(unsigned long ttl)
345 * before unlink_from_pool() call. */ 398 * before unlink_from_pool() call. */
346 atomic_inc(&p->refcnt); 399 atomic_inc(&p->refcnt);
347 } 400 }
348 spin_unlock_bh(&inet_peer_unused_lock); 401 spin_unlock_bh(&unused_peers.lock);
349 402
350 if (p == NULL) 403 if (p == NULL)
351 /* It means that the total number of USED entries has 404 /* It means that the total number of USED entries has
@@ -360,62 +413,56 @@ static int cleanup_once(unsigned long ttl)
360/* Called with or without local BH being disabled. */ 413/* Called with or without local BH being disabled. */
361struct inet_peer *inet_getpeer(__be32 daddr, int create) 414struct inet_peer *inet_getpeer(__be32 daddr, int create)
362{ 415{
363 struct inet_peer *p, *n; 416 struct inet_peer *p;
364 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 417 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
365 418
366 /* Look up the address quickly. */ 419 /* Look up the address quickly, locklessly.
367 read_lock_bh(&peer_pool_lock); 420 * Because of a concurrent writer, we might not find an existing entry.
368 p = lookup(daddr, NULL); 421 */
369 if (p != peer_avl_empty) 422 rcu_read_lock_bh();
370 atomic_inc(&p->refcnt); 423 p = lookup_rcu_bh(daddr);
371 read_unlock_bh(&peer_pool_lock); 424 rcu_read_unlock_bh();
425
426 if (p) {
427 /* The existing node has been found.
428 * Remove the entry from unused list if it was there.
429 */
430 unlink_from_unused(p);
431 return p;
432 }
372 433
434 /* retry an exact lookup, this time taking the lock.
435 * At least the nodes should now be hot in our cache.
436 */
437 spin_lock_bh(&peers.lock);
438 p = lookup(daddr, stack);
373 if (p != peer_avl_empty) { 439 if (p != peer_avl_empty) {
374 /* The existing node has been found. */ 440 atomic_inc(&p->refcnt);
441 spin_unlock_bh(&peers.lock);
375 /* Remove the entry from unused list if it was there. */ 442 /* Remove the entry from unused list if it was there. */
376 unlink_from_unused(p); 443 unlink_from_unused(p);
377 return p; 444 return p;
378 } 445 }
446 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
447 if (p) {
448 p->v4daddr = daddr;
449 atomic_set(&p->refcnt, 1);
450 atomic_set(&p->rid, 0);
451 atomic_set(&p->ip_id_count, secure_ip_id(daddr));
452 p->tcp_ts_stamp = 0;
453 INIT_LIST_HEAD(&p->unused);
454
455
456 /* Link the node. */
457 link_to_pool(p);
458 peers.total++;
459 }
460 spin_unlock_bh(&peers.lock);
379 461
380 if (!create) 462 if (peers.total >= inet_peer_threshold)
381 return NULL;
382
383 /* Allocate the space outside the locked region. */
384 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
385 if (n == NULL)
386 return NULL;
387 n->v4daddr = daddr;
388 atomic_set(&n->refcnt, 1);
389 atomic_set(&n->rid, 0);
390 atomic_set(&n->ip_id_count, secure_ip_id(daddr));
391 n->tcp_ts_stamp = 0;
392
393 write_lock_bh(&peer_pool_lock);
394 /* Check if an entry has suddenly appeared. */
395 p = lookup(daddr, stack);
396 if (p != peer_avl_empty)
397 goto out_free;
398
399 /* Link the node. */
400 link_to_pool(n);
401 INIT_LIST_HEAD(&n->unused);
402 peer_total++;
403 write_unlock_bh(&peer_pool_lock);
404
405 if (peer_total >= inet_peer_threshold)
406 /* Remove one less-recently-used entry. */ 463 /* Remove one less-recently-used entry. */
407 cleanup_once(0); 464 cleanup_once(0);
408 465
409 return n;
410
411out_free:
412 /* The appropriate node is already in the pool. */
413 atomic_inc(&p->refcnt);
414 write_unlock_bh(&peer_pool_lock);
415 /* Remove the entry from unused list if it was there. */
416 unlink_from_unused(p);
417 /* Free the preallocated node. */
418 kmem_cache_free(peer_cachep, n);
419 return p; 466 return p;
420} 467}
421 468
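The rewritten inet_getpeer() is the classic optimistic double lookup: try the lockless read side first, and only on a miss take the lock and re-check before inserting, because a concurrent creator may have inserted the same key in between. A self-contained userspace sketch of that shape, with a mutex and a linked list standing in for the pool lock and the AVL tree (GCC/Clang atomic builtins; deletion and RCU grace periods are deliberately omitted; all names are illustrative):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct peer {
        uint32_t key;
        struct peer *next;      /* set before the node is published */
    };

    static struct peer *head;   /* read locklessly */
    static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct peer *lookup(uint32_t key)
    {
        for (struct peer *p = __atomic_load_n(&head, __ATOMIC_ACQUIRE);
             p; p = __atomic_load_n(&p->next, __ATOMIC_ACQUIRE))
            if (p->key == key)
                return p;
        return NULL;
    }

    struct peer *getpeer(uint32_t key, int create)
    {
        struct peer *p = lookup(key);      /* optimistic, no lock */
        if (p)
            return p;

        pthread_mutex_lock(&pool_lock);
        p = lookup(key);                   /* authoritative re-check */
        if (!p && create) {
            p = calloc(1, sizeof(*p));
            if (p) {
                p->key = key;
                p->next = head;
                /* release: fields visible before the node is reachable */
                __atomic_store_n(&head, p, __ATOMIC_RELEASE);
            }
        }
        pthread_mutex_unlock(&pool_lock);
        return p;
    }

The smp_wmb() added to link_to_pool() in the hunk further up is exactly this publication barrier: a node's fields must be visible before the node itself becomes reachable to lockless readers.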
@@ -425,12 +472,12 @@ static void peer_check_expire(unsigned long dummy)
425 unsigned long now = jiffies; 472 unsigned long now = jiffies;
426 int ttl; 473 int ttl;
427 474
428 if (peer_total >= inet_peer_threshold) 475 if (peers.total >= inet_peer_threshold)
429 ttl = inet_peer_minttl; 476 ttl = inet_peer_minttl;
430 else 477 else
431 ttl = inet_peer_maxttl 478 ttl = inet_peer_maxttl
432 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 479 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
433 peer_total / inet_peer_threshold * HZ; 480 peers.total / inet_peer_threshold * HZ;
434 while (!cleanup_once(ttl)) { 481 while (!cleanup_once(ttl)) {
435 if (jiffies != now) 482 if (jiffies != now)
436 break; 483 break;
@@ -439,22 +486,25 @@ static void peer_check_expire(unsigned long dummy)
439 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 486 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
440 * interval depending on the total number of entries (more entries, 487 * interval depending on the total number of entries (more entries,
441 * less interval). */ 488 * less interval). */
442 if (peer_total >= inet_peer_threshold) 489 if (peers.total >= inet_peer_threshold)
443 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 490 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
444 else 491 else
445 peer_periodic_timer.expires = jiffies 492 peer_periodic_timer.expires = jiffies
446 + inet_peer_gc_maxtime 493 + inet_peer_gc_maxtime
447 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 494 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
448 peer_total / inet_peer_threshold * HZ; 495 peers.total / inet_peer_threshold * HZ;
449 add_timer(&peer_periodic_timer); 496 add_timer(&peer_periodic_timer);
450} 497}
451 498
452void inet_putpeer(struct inet_peer *p) 499void inet_putpeer(struct inet_peer *p)
453{ 500{
454 spin_lock_bh(&inet_peer_unused_lock); 501 local_bh_disable();
455 if (atomic_dec_and_test(&p->refcnt)) { 502
456 list_add_tail(&p->unused, &unused_peers); 503 if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
504 list_add_tail(&p->unused, &unused_peers.list);
457 p->dtime = (__u32)jiffies; 505 p->dtime = (__u32)jiffies;
506 spin_unlock(&unused_peers.lock);
458 } 507 }
459 spin_unlock_bh(&inet_peer_unused_lock); 508
509 local_bh_enable();
460} 510}
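inet_putpeer() now leans on atomic_dec_and_lock(), which takes the unused-list lock only when the counter actually drops to zero; uncontended puts never touch the lock at all. A portable approximation of the primitive with C11 atomics and a pthread mutex (a sketch with illustrative names, not the kernel implementation):

    #include <pthread.h>
    #include <stdatomic.h>

    /* Return 1 with *lock held iff this call dropped *cnt to zero. */
    static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
        int c = atomic_load(cnt);

        /* Fast path: the counter cannot reach zero here, skip the lock. */
        while (c > 1)
            if (atomic_compare_exchange_weak(cnt, &c, c - 1))
                return 0;

        /* Slow path: we may hold the last reference, serialize first. */
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return 1;           /* caller does list work, then unlocks */
        pthread_mutex_unlock(lock);
        return 0;
    }

The explicit local_bh_disable()/local_bh_enable() bracket in the hunk is needed because the unused-list lock is also taken from softirq context (the garbage-collection timer).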
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 56cdf68a074c..99461f09320f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
88 goto sr_failed; 88 goto sr_failed;
89 89
90 if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
91 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { 91 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
92 IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); 92 IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
93 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 93 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
94 htonl(dst_mtu(&rt->u.dst))); 94 htonl(dst_mtu(&rt->dst)));
95 goto drop; 95 goto drop;
96 } 96 }
97 97
98 /* We are about to mangle packet. Copy it! */ 98 /* We are about to mangle packet. Copy it! */
99 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) 99 if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
100 goto drop; 100 goto drop;
101 iph = ip_hdr(skb); 101 iph = ip_hdr(skb);
102 102
@@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb)
113 skb->priority = rt_tos2priority(iph->tos); 113 skb->priority = rt_tos2priority(iph->tos);
114 114
115 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, 115 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
116 rt->u.dst.dev, ip_forward_finish); 116 rt->dst.dev, ip_forward_finish);
117 117
118sr_failed: 118sr_failed:
119 /* 119 /*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 75347ea70ea0..b7c41654dde5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -124,11 +124,8 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
124} 124}
125 125
126/* Memory Tracking Functions. */ 126/* Memory Tracking Functions. */
127static __inline__ void frag_kfree_skb(struct netns_frags *nf, 127static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
128 struct sk_buff *skb, int *work)
129{ 128{
130 if (work)
131 *work -= skb->truesize;
132 atomic_sub(skb->truesize, &nf->mem); 129 atomic_sub(skb->truesize, &nf->mem);
133 kfree_skb(skb); 130 kfree_skb(skb);
134} 131}
@@ -309,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp)
309 fp = qp->q.fragments; 306 fp = qp->q.fragments;
310 do { 307 do {
311 struct sk_buff *xp = fp->next; 308 struct sk_buff *xp = fp->next;
312 frag_kfree_skb(qp->q.net, fp, NULL); 309 frag_kfree_skb(qp->q.net, fp);
313 fp = xp; 310 fp = xp;
314 } while (fp); 311 } while (fp);
315 312
@@ -317,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp)
317 qp->q.len = 0; 314 qp->q.len = 0;
318 qp->q.meat = 0; 315 qp->q.meat = 0;
319 qp->q.fragments = NULL; 316 qp->q.fragments = NULL;
317 qp->q.fragments_tail = NULL;
320 qp->iif = 0; 318 qp->iif = 0;
321 319
322 return 0; 320 return 0;
@@ -389,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
389 * in the chain of fragments so far. We must know where to put 387 * in the chain of fragments so far. We must know where to put
390 * this fragment, right? 388 * this fragment, right?
391 */ 389 */
390 prev = qp->q.fragments_tail;
391 if (!prev || FRAG_CB(prev)->offset < offset) {
392 next = NULL;
393 goto found;
394 }
392 prev = NULL; 395 prev = NULL;
393 for (next = qp->q.fragments; next != NULL; next = next->next) { 396 for (next = qp->q.fragments; next != NULL; next = next->next) {
394 if (FRAG_CB(next)->offset >= offset) 397 if (FRAG_CB(next)->offset >= offset)
@@ -396,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
396 prev = next; 399 prev = next;
397 } 400 }
398 401
402found:
399 /* We found where to put this one. Check for overlap with 403 /* We found where to put this one. Check for overlap with
400 * preceding fragment, and, if needed, align things so that 404 * preceding fragment, and, if needed, align things so that
401 * any overlaps are eliminated. 405 * any overlaps are eliminated.
@@ -446,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
446 qp->q.fragments = next; 450 qp->q.fragments = next;
447 451
448 qp->q.meat -= free_it->len; 452 qp->q.meat -= free_it->len;
449 frag_kfree_skb(qp->q.net, free_it, NULL); 453 frag_kfree_skb(qp->q.net, free_it);
450 } 454 }
451 } 455 }
452 456
@@ -454,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
454 458
455 /* Insert this fragment in the chain of fragments. */ 459 /* Insert this fragment in the chain of fragments. */
456 skb->next = next; 460 skb->next = next;
461 if (!next)
462 qp->q.fragments_tail = skb;
457 if (prev) 463 if (prev)
458 prev->next = skb; 464 prev->next = skb;
459 else 465 else
@@ -507,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
507 goto out_nomem; 513 goto out_nomem;
508 514
509 fp->next = head->next; 515 fp->next = head->next;
516 if (!fp->next)
517 qp->q.fragments_tail = fp;
510 prev->next = fp; 518 prev->next = fp;
511 519
512 skb_morph(head, qp->q.fragments); 520 skb_morph(head, qp->q.fragments);
@@ -556,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
556 564
557 skb_shinfo(head)->frag_list = head->next; 565 skb_shinfo(head)->frag_list = head->next;
558 skb_push(head, head->data - skb_network_header(head)); 566 skb_push(head, head->data - skb_network_header(head));
559 atomic_sub(head->truesize, &qp->q.net->mem);
560 567
561 for (fp=head->next; fp; fp = fp->next) { 568 for (fp=head->next; fp; fp = fp->next) {
562 head->data_len += fp->len; 569 head->data_len += fp->len;
@@ -566,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
566 else if (head->ip_summed == CHECKSUM_COMPLETE) 573 else if (head->ip_summed == CHECKSUM_COMPLETE)
567 head->csum = csum_add(head->csum, fp->csum); 574 head->csum = csum_add(head->csum, fp->csum);
568 head->truesize += fp->truesize; 575 head->truesize += fp->truesize;
569 atomic_sub(fp->truesize, &qp->q.net->mem);
570 } 576 }
577 atomic_sub(head->truesize, &qp->q.net->mem);
571 578
572 head->next = NULL; 579 head->next = NULL;
573 head->dev = dev; 580 head->dev = dev;
@@ -578,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
578 iph->tot_len = htons(len); 585 iph->tot_len = htons(len);
579 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 586 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
580 qp->q.fragments = NULL; 587 qp->q.fragments = NULL;
588 qp->q.fragments_tail = NULL;
581 return 0; 589 return 0;
582 590
583out_nomem: 591out_nomem:
@@ -624,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
624 kfree_skb(skb); 632 kfree_skb(skb);
625 return -ENOMEM; 633 return -ENOMEM;
626} 634}
635EXPORT_SYMBOL(ip_defrag);
627 636
628#ifdef CONFIG_SYSCTL 637#ifdef CONFIG_SYSCTL
629static int zero; 638static int zero;
@@ -777,5 +786,3 @@ void __init ipfrag_init(void)
777 ip4_frags.secret_interval = 10 * 60 * HZ; 786 ip4_frags.secret_interval = 10 * 60 * HZ;
778 inet_frags_init(&ip4_frags); 787 inet_frags_init(&ip4_frags);
779} 788}
780
781EXPORT_SYMBOL(ip_defrag);
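The `fragments_tail` additions above turn the common case of in-order fragment arrival into an O(1) tail append instead of an O(n) list walk per fragment. The insertion shape, reduced to its essentials (overlap trimming and memory accounting omitted; types and names are illustrative):

    #include <stddef.h>

    struct frag {
        int offset;
        struct frag *next;
    };

    struct fragq {
        struct frag *fragments;        /* sorted by offset */
        struct frag *fragments_tail;   /* new: O(1) in-order append */
    };

    static void frag_insert(struct fragq *q, struct frag *f)
    {
        struct frag *prev = q->fragments_tail, *next;

        if (!prev || prev->offset < f->offset) {
            next = NULL;               /* common case: append at tail */
        } else {
            prev = NULL;               /* rare: reordered, scan the list */
            for (next = q->fragments; next; next = next->next) {
                if (next->offset >= f->offset)
                    break;
                prev = next;
            }
        }
        f->next = next;
        if (!next)
            q->fragments_tail = f;     /* we are the new tail */
        if (prev)
            prev->next = f;
        else
            q->fragments = f;
    }

This is also why ip_frag_reinit() and ip_frag_reasm() now reset fragments_tail alongside fragments: the tail pointer must never outlive the list it indexes.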
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 32618e11076d..945b20a5ad50 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -731,6 +731,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
731 tos = 0; 731 tos = 0;
732 if (skb->protocol == htons(ETH_P_IP)) 732 if (skb->protocol == htons(ETH_P_IP))
733 tos = old_iph->tos; 733 tos = old_iph->tos;
734 else if (skb->protocol == htons(ETH_P_IPV6))
735 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
734 } 736 }
735 737
736 { 738 {
@@ -745,7 +747,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
745 goto tx_error; 747 goto tx_error;
746 } 748 }
747 } 749 }
748 tdev = rt->u.dst.dev; 750 tdev = rt->dst.dev;
749 751
750 if (tdev == dev) { 752 if (tdev == dev) {
751 ip_rt_put(rt); 753 ip_rt_put(rt);
@@ -755,7 +757,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
755 757
756 df = tiph->frag_off; 758 df = tiph->frag_off;
757 if (df) 759 if (df)
758 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; 760 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
759 else 761 else
760 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 762 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
761 763
@@ -803,7 +805,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
803 tunnel->err_count = 0; 805 tunnel->err_count = 0;
804 } 806 }
805 807
806 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; 808 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
807 809
808 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 810 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
809 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 811 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
@@ -830,7 +832,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
830 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 832 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
831 IPSKB_REROUTED); 833 IPSKB_REROUTED);
832 skb_dst_drop(skb); 834 skb_dst_drop(skb);
833 skb_dst_set(skb, &rt->u.dst); 835 skb_dst_set(skb, &rt->dst);
834 836
835 /* 837 /*
836 * Push down and install the IPIP header. 838 * Push down and install the IPIP header.
@@ -853,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
853 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
854#endif 856#endif
855 else 857 else
856 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); 858 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
857 } 859 }
858 860
859 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 861 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -915,7 +917,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
915 .proto = IPPROTO_GRE }; 917 .proto = IPPROTO_GRE };
916 struct rtable *rt; 918 struct rtable *rt;
917 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
918 tdev = rt->u.dst.dev; 920 tdev = rt->dst.dev;
919 ip_rt_put(rt); 921 ip_rt_put(rt);
920 } 922 }
921 923
@@ -1174,7 +1176,7 @@ static int ipgre_open(struct net_device *dev)
1174 struct rtable *rt; 1176 struct rtable *rt;
1175 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1177 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1176 return -EADDRNOTAVAIL; 1178 return -EADDRNOTAVAIL;
1177 dev = rt->u.dst.dev; 1179 dev = rt->dst.dev;
1178 ip_rt_put(rt); 1180 ip_rt_put(rt);
1179 if (__in_dev_get_rtnl(dev) == NULL) 1181 if (__in_dev_get_rtnl(dev) == NULL)
1180 return -EADDRNOTAVAIL; 1182 return -EADDRNOTAVAIL;
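One behavioural change in the ipgre_tunnel_xmit() hunk above: the outer TOS is now inherited from an inner IPv6 packet's traffic class (via ipv6_get_dsfield()) as well as from inner IPv4. The traffic class straddles the first two bytes of the IPv6 header; an illustrative extractor showing that layout (not the kernel helper itself):

    #include <stdint.h>

    struct ipv6hdr_prefix {
        uint8_t b0;     /* version (high nibble) | tclass bits 7..4 */
        uint8_t b1;     /* tclass bits 3..0 | flow label bits 19..16 */
    };

    static uint8_t ipv6_dsfield(const struct ipv6hdr_prefix *h)
    {
        /* drop the version nibble, splice the two tclass nibbles */
        return (uint8_t)((h->b0 << 4) | (h->b1 >> 4));
    }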
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d930dc5e4d85..d859bcc26cb7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,7 +146,7 @@
146#include <linux/netlink.h> 146#include <linux/netlink.h>
147 147
148/* 148/*
149 * Process Router Attention IP option 149 * Process Router Attention IP option (RFC 2113)
150 */ 150 */
151int ip_call_ra_chain(struct sk_buff *skb) 151int ip_call_ra_chain(struct sk_buff *skb)
152{ 152{
@@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
155 struct sock *last = NULL; 155 struct sock *last = NULL;
156 struct net_device *dev = skb->dev; 156 struct net_device *dev = skb->dev;
157 157
158 read_lock(&ip_ra_lock); 158 for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
159 for (ra = ip_ra_chain; ra; ra = ra->next) {
160 struct sock *sk = ra->sk; 159 struct sock *sk = ra->sk;
161 160
162 /* If socket is bound to an interface, only report 161 /* If socket is bound to an interface, only report
@@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
167 sk->sk_bound_dev_if == dev->ifindex) && 166 sk->sk_bound_dev_if == dev->ifindex) &&
168 net_eq(sock_net(sk), dev_net(dev))) { 167 net_eq(sock_net(sk), dev_net(dev))) {
169 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
170 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { 169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
171 read_unlock(&ip_ra_lock);
172 return 1; 170 return 1;
173 }
174 } 171 }
175 if (last) { 172 if (last) {
176 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 173 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
183 180
184 if (last) { 181 if (last) {
185 raw_rcv(last, skb); 182 raw_rcv(last, skb);
186 read_unlock(&ip_ra_lock);
187 return 1; 183 return 1;
188 } 184 }
189 read_unlock(&ip_ra_lock);
190 return 0; 185 return 0;
191} 186}
192 187
@@ -298,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
298 } 293 }
299 294
300 if (unlikely(opt->srr)) { 295 if (unlikely(opt->srr)) {
301 struct in_device *in_dev = in_dev_get(dev); 296 struct in_device *in_dev = __in_dev_get_rcu(dev);
297
302 if (in_dev) { 298 if (in_dev) {
303 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
304 if (IN_DEV_LOG_MARTIANS(in_dev) && 300 if (IN_DEV_LOG_MARTIANS(in_dev) &&
305 net_ratelimit()) 301 net_ratelimit())
306 printk(KERN_INFO "source route option %pI4 -> %pI4\n", 302 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
307 &iph->saddr, &iph->daddr); 303 &iph->saddr, &iph->daddr);
308 in_dev_put(in_dev);
309 goto drop; 304 goto drop;
310 } 305 }
311
312 in_dev_put(in_dev);
313 } 306 }
314 307
315 if (ip_options_rcv_srr(skb)) 308 if (ip_options_rcv_srr(skb))
@@ -340,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 else if (err == -ENETUNREACH) 333 else if (err == -ENETUNREACH)
341 IP_INC_STATS_BH(dev_net(skb->dev), 334 IP_INC_STATS_BH(dev_net(skb->dev),
342 IPSTATS_MIB_INNOROUTES); 335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER);
343 goto drop; 339 goto drop;
344 } 340 }
345 } 341 }
346 342
347#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_NET_CLS_ROUTE
348 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
349 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
350 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
351 st[idx&0xFF].o_packets++; 347 st[idx&0xFF].o_packets++;
352 st[idx&0xFF].o_bytes += skb->len; 348 st[idx&0xFF].o_bytes += skb->len;
@@ -360,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
360 356
361 rt = skb_rtable(skb); 357 rt = skb_rtable(skb);
362 if (rt->rt_type == RTN_MULTICAST) { 358 if (rt->rt_type == RTN_MULTICAST) {
363 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, 359 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
364 skb->len); 360 skb->len);
365 } else if (rt->rt_type == RTN_BROADCAST) 361 } else if (rt->rt_type == RTN_BROADCAST)
366 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, 362 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
367 skb->len); 363 skb->len);
368 364
369 return dst_input(skb); 365 return dst_input(skb);
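ip_call_ra_chain() above is the reader half of an rwlock-to-RCU conversion (the writer half is in the ip_sockglue.c hunks below): walkers traverse the chain through rcu_dereference() and never block the writer. A self-contained userspace analogue using liburcu (an assumption: liburcu installed, linked with -lurcu, and each thread registered via rcu_register_thread(); names are illustrative):

    #define _LGPL_SOURCE
    #include <urcu.h>            /* liburcu, default flavour */
    #include <stdio.h>
    #include <stdlib.h>

    struct ra_entry {
        int id;
        struct ra_entry *next;
    };

    static struct ra_entry *chain;   /* published list head */

    /* Reader: no lock taken, safe against a concurrent unlink. */
    static void walk_chain(void)
    {
        struct ra_entry *ra;

        rcu_read_lock();
        for (ra = rcu_dereference(chain); ra;
             ra = rcu_dereference(ra->next))
            printf("entry %d\n", ra->id);
        rcu_read_unlock();
    }

    /* Writer: unlink under an external lock (not shown), then wait
     * for all pre-existing readers before freeing. */
    static void remove_head(void)
    {
        struct ra_entry *ra = chain;

        if (ra) {
            rcu_assign_pointer(chain, ra->next);
            synchronize_rcu();
            free(ra);
        }
    }

Note how the converted loop also tolerates `ra->sk == NULL`: the writer clears the socket pointer before unlinking, so readers can encounter a half-removed entry.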
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9a4a6c96cb0d..6652bd9da676 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -89,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
89 iph->check = 0; 89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91} 91}
92EXPORT_SYMBOL(ip_send_check);
92 93
93int __ip_local_out(struct sk_buff *skb) 94int __ip_local_out(struct sk_buff *skb)
94{ 95{
@@ -151,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
151 iph->version = 4; 152 iph->version = 4;
152 iph->ihl = 5; 153 iph->ihl = 5;
153 iph->tos = inet->tos; 154 iph->tos = inet->tos;
154 if (ip_dont_fragment(sk, &rt->u.dst)) 155 if (ip_dont_fragment(sk, &rt->dst))
155 iph->frag_off = htons(IP_DF); 156 iph->frag_off = htons(IP_DF);
156 else 157 else
157 iph->frag_off = 0; 158 iph->frag_off = 0;
158 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 159 iph->ttl = ip_select_ttl(inet, &rt->dst);
159 iph->daddr = rt->rt_dst; 160 iph->daddr = rt->rt_dst;
160 iph->saddr = rt->rt_src; 161 iph->saddr = rt->rt_src;
161 iph->protocol = sk->sk_protocol; 162 iph->protocol = sk->sk_protocol;
162 ip_select_ident(iph, &rt->u.dst, sk); 163 ip_select_ident(iph, &rt->dst, sk);
163 164
164 if (opt && opt->optlen) { 165 if (opt && opt->optlen) {
165 iph->ihl += opt->optlen>>2; 166 iph->ihl += opt->optlen>>2;
@@ -172,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
172 /* Send it out. */ 173 /* Send it out. */
173 return ip_local_out(skb); 174 return ip_local_out(skb);
174} 175}
175
176EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 176EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
177 177
178static inline int ip_finish_output2(struct sk_buff *skb) 178static inline int ip_finish_output2(struct sk_buff *skb)
@@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb)
240{ 240{
241 struct sock *sk = skb->sk; 241 struct sock *sk = skb->sk;
242 struct rtable *rt = skb_rtable(skb); 242 struct rtable *rt = skb_rtable(skb);
243 struct net_device *dev = rt->u.dst.dev; 243 struct net_device *dev = rt->dst.dev;
244 244
245 /* 245 /*
246 * If the indicated interface is up and running, send the packet. 246 * If the indicated interface is up and running, send the packet.
@@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb)
359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) 359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
360 goto no_route; 360 goto no_route;
361 } 361 }
362 sk_setup_caps(sk, &rt->u.dst); 362 sk_setup_caps(sk, &rt->dst);
363 } 363 }
364 skb_dst_set_noref(skb, &rt->u.dst); 364 skb_dst_set_noref(skb, &rt->dst);
365 365
366packet_routed: 366packet_routed:
367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -372,11 +372,11 @@ packet_routed:
372 skb_reset_network_header(skb); 372 skb_reset_network_header(skb);
373 iph = ip_hdr(skb); 373 iph = ip_hdr(skb);
374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
375 if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df) 375 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
376 iph->frag_off = htons(IP_DF); 376 iph->frag_off = htons(IP_DF);
377 else 377 else
378 iph->frag_off = 0; 378 iph->frag_off = 0;
379 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 379 iph->ttl = ip_select_ttl(inet, &rt->dst);
380 iph->protocol = sk->sk_protocol; 380 iph->protocol = sk->sk_protocol;
381 iph->saddr = rt->rt_src; 381 iph->saddr = rt->rt_src;
382 iph->daddr = rt->rt_dst; 382 iph->daddr = rt->rt_dst;
@@ -387,7 +387,7 @@ packet_routed:
387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0); 387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
388 } 388 }
389 389
390 ip_select_ident_more(iph, &rt->u.dst, sk, 390 ip_select_ident_more(iph, &rt->dst, sk,
391 (skb_shinfo(skb)->gso_segs ?: 1) - 1); 391 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
392 392
393 skb->priority = sk->sk_priority; 393 skb->priority = sk->sk_priority;
@@ -403,6 +403,7 @@ no_route:
403 kfree_skb(skb); 403 kfree_skb(skb);
404 return -EHOSTUNREACH; 404 return -EHOSTUNREACH;
405} 405}
406EXPORT_SYMBOL(ip_queue_xmit);
406 407
407 408
408static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 409static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
@@ -411,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
411 to->priority = from->priority; 412 to->priority = from->priority;
412 to->protocol = from->protocol; 413 to->protocol = from->protocol;
413 skb_dst_drop(to); 414 skb_dst_drop(to);
414 skb_dst_set(to, dst_clone(skb_dst(from))); 415 skb_dst_copy(to, from);
415 to->dev = from->dev; 416 to->dev = from->dev;
416 to->mark = from->mark; 417 to->mark = from->mark;
417 418
@@ -442,7 +443,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
442int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 443int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
443{ 444{
444 struct iphdr *iph; 445 struct iphdr *iph;
445 int raw = 0;
446 int ptr; 446 int ptr;
447 struct net_device *dev; 447 struct net_device *dev;
448 struct sk_buff *skb2; 448 struct sk_buff *skb2;
@@ -452,7 +452,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
452 struct rtable *rt = skb_rtable(skb); 452 struct rtable *rt = skb_rtable(skb);
453 int err = 0; 453 int err = 0;
454 454
455 dev = rt->u.dst.dev; 455 dev = rt->dst.dev;
456 456
457 /* 457 /*
458 * Point into the IP datagram header. 458 * Point into the IP datagram header.
@@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
473 */ 473 */
474 474
475 hlen = iph->ihl * 4; 475 hlen = iph->ihl * 4;
476 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 476 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
477#ifdef CONFIG_BRIDGE_NETFILTER 477#ifdef CONFIG_BRIDGE_NETFILTER
478 if (skb->nf_bridge) 478 if (skb->nf_bridge)
479 mtu -= nf_bridge_mtu_reduction(skb); 479 mtu -= nf_bridge_mtu_reduction(skb);
@@ -580,13 +580,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
580 580
581slow_path: 581slow_path:
582 left = skb->len - hlen; /* Space per frame */ 582 left = skb->len - hlen; /* Space per frame */
583 ptr = raw + hlen; /* Where to start from */ 583 ptr = hlen; /* Where to start from */
584 584
585 /* for bridged IP traffic encapsulated inside e.g. a vlan header, 585
586 * we need to make room for the encapsulating header 586 * we need to make room for the encapsulating header
587 */ 587 */
588 pad = nf_bridge_pad(skb); 588 pad = nf_bridge_pad(skb);
589 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); 589 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, pad);
590 mtu -= pad; 590 mtu -= pad;
591 591
592 /* 592 /*
@@ -697,7 +697,6 @@ fail:
697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
698 return err; 698 return err;
699} 699}
700
701EXPORT_SYMBOL(ip_fragment); 700EXPORT_SYMBOL(ip_fragment);
702 701
703int 702int
@@ -716,6 +715,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
716 } 715 }
717 return 0; 716 return 0;
718} 717}
718EXPORT_SYMBOL(ip_generic_getfrag);
719 719
720static inline __wsum 720static inline __wsum
721csum_page(struct page *page, int offset, int copy) 721csum_page(struct page *page, int offset, int copy)
@@ -833,13 +833,13 @@ int ip_append_data(struct sock *sk,
833 */ 833 */
834 *rtp = NULL; 834 *rtp = NULL;
835 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 835 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
836 rt->u.dst.dev->mtu : 836 rt->dst.dev->mtu :
837 dst_mtu(rt->u.dst.path); 837 dst_mtu(rt->dst.path);
838 inet->cork.dst = &rt->u.dst; 838 inet->cork.dst = &rt->dst;
839 inet->cork.length = 0; 839 inet->cork.length = 0;
840 sk->sk_sndmsg_page = NULL; 840 sk->sk_sndmsg_page = NULL;
841 sk->sk_sndmsg_off = 0; 841 sk->sk_sndmsg_off = 0;
842 if ((exthdrlen = rt->u.dst.header_len) != 0) { 842 if ((exthdrlen = rt->dst.header_len) != 0) {
843 length += exthdrlen; 843 length += exthdrlen;
844 transhdrlen += exthdrlen; 844 transhdrlen += exthdrlen;
845 } 845 }
@@ -852,7 +852,7 @@ int ip_append_data(struct sock *sk,
852 exthdrlen = 0; 852 exthdrlen = 0;
853 mtu = inet->cork.fragsize; 853 mtu = inet->cork.fragsize;
854 } 854 }
855 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 855 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
856 856
857 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 857 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
858 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 858 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -869,14 +869,16 @@ int ip_append_data(struct sock *sk,
869 */ 869 */
870 if (transhdrlen && 870 if (transhdrlen &&
871 length + fragheaderlen <= mtu && 871 length + fragheaderlen <= mtu &&
872 rt->u.dst.dev->features & NETIF_F_V4_CSUM && 872 rt->dst.dev->features & NETIF_F_V4_CSUM &&
873 !exthdrlen) 873 !exthdrlen)
874 csummode = CHECKSUM_PARTIAL; 874 csummode = CHECKSUM_PARTIAL;
875 875
876 skb = skb_peek_tail(&sk->sk_write_queue);
877
876 inet->cork.length += length; 878 inet->cork.length += length;
877 if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && 879 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
878 (sk->sk_protocol == IPPROTO_UDP) && 880 (sk->sk_protocol == IPPROTO_UDP) &&
879 (rt->u.dst.dev->features & NETIF_F_UFO)) { 881 (rt->dst.dev->features & NETIF_F_UFO)) {
880 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 882 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
881 fragheaderlen, transhdrlen, mtu, 883 fragheaderlen, transhdrlen, mtu,
882 flags); 884 flags);
@@ -892,7 +894,7 @@ int ip_append_data(struct sock *sk,
892 * adding appropriate IP header. 894 * adding appropriate IP header.
893 */ 895 */
894 896
895 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 897 if (!skb)
896 goto alloc_new_skb; 898 goto alloc_new_skb;
897 899
898 while (length > 0) { 900 while (length > 0) {
@@ -924,7 +926,7 @@ alloc_new_skb:
924 fraglen = datalen + fragheaderlen; 926 fraglen = datalen + fragheaderlen;
925 927
926 if ((flags & MSG_MORE) && 928 if ((flags & MSG_MORE) &&
927 !(rt->u.dst.dev->features&NETIF_F_SG)) 929 !(rt->dst.dev->features&NETIF_F_SG))
928 alloclen = mtu; 930 alloclen = mtu;
929 else 931 else
930 alloclen = datalen + fragheaderlen; 932 alloclen = datalen + fragheaderlen;
@@ -935,7 +937,7 @@ alloc_new_skb:
935 * the last. 937 * the last.
936 */ 938 */
937 if (datalen == length + fraggap) 939 if (datalen == length + fraggap)
938 alloclen += rt->u.dst.trailer_len; 940 alloclen += rt->dst.trailer_len;
939 941
940 if (transhdrlen) { 942 if (transhdrlen) {
941 skb = sock_alloc_send_skb(sk, 943 skb = sock_alloc_send_skb(sk,
@@ -1008,7 +1010,7 @@ alloc_new_skb:
1008 if (copy > length) 1010 if (copy > length)
1009 copy = length; 1011 copy = length;
1010 1012
1011 if (!(rt->u.dst.dev->features&NETIF_F_SG)) { 1013 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1012 unsigned int off; 1014 unsigned int off;
1013 1015
1014 off = skb->len; 1016 off = skb->len;
@@ -1103,10 +1105,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1103 if (inet->cork.flags & IPCORK_OPT) 1105 if (inet->cork.flags & IPCORK_OPT)
1104 opt = inet->cork.opt; 1106 opt = inet->cork.opt;
1105 1107
1106 if (!(rt->u.dst.dev->features&NETIF_F_SG)) 1108 if (!(rt->dst.dev->features&NETIF_F_SG))
1107 return -EOPNOTSUPP; 1109 return -EOPNOTSUPP;
1108 1110
1109 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 1111 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1110 mtu = inet->cork.fragsize; 1112 mtu = inet->cork.fragsize;
1111 1113
1112 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1114 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
@@ -1121,8 +1123,9 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1121 return -EINVAL; 1123 return -EINVAL;
1122 1124
1123 inet->cork.length += size; 1125 inet->cork.length += size;
1124 if ((sk->sk_protocol == IPPROTO_UDP) && 1126 if ((size + skb->len > mtu) &&
1125 (rt->u.dst.dev->features & NETIF_F_UFO)) { 1127 (sk->sk_protocol == IPPROTO_UDP) &&
1128 (rt->dst.dev->features & NETIF_F_UFO)) {
1126 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 1129 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1127 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1130 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1128 } 1131 }
@@ -1274,8 +1277,8 @@ int ip_push_pending_frames(struct sock *sk)
1274 * If local_df is set too, we still allow to fragment this frame 1277 * If local_df is set too, we still allow to fragment this frame
1275 * locally. */ 1278 * locally. */
1276 if (inet->pmtudisc >= IP_PMTUDISC_DO || 1279 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1277 (skb->len <= dst_mtu(&rt->u.dst) && 1280 (skb->len <= dst_mtu(&rt->dst) &&
1278 ip_dont_fragment(sk, &rt->u.dst))) 1281 ip_dont_fragment(sk, &rt->dst)))
1279 df = htons(IP_DF); 1282 df = htons(IP_DF);
1280 1283
1281 if (inet->cork.flags & IPCORK_OPT) 1284 if (inet->cork.flags & IPCORK_OPT)
@@ -1284,7 +1287,7 @@ int ip_push_pending_frames(struct sock *sk)
1284 if (rt->rt_type == RTN_MULTICAST) 1287 if (rt->rt_type == RTN_MULTICAST)
1285 ttl = inet->mc_ttl; 1288 ttl = inet->mc_ttl;
1286 else 1289 else
1287 ttl = ip_select_ttl(inet, &rt->u.dst); 1290 ttl = ip_select_ttl(inet, &rt->dst);
1288 1291
1289 iph = (struct iphdr *)skb->data; 1292 iph = (struct iphdr *)skb->data;
1290 iph->version = 4; 1293 iph->version = 4;
@@ -1295,7 +1298,7 @@ int ip_push_pending_frames(struct sock *sk)
1295 } 1298 }
1296 iph->tos = inet->tos; 1299 iph->tos = inet->tos;
1297 iph->frag_off = df; 1300 iph->frag_off = df;
1298 ip_select_ident(iph, &rt->u.dst, sk); 1301 ip_select_ident(iph, &rt->dst, sk);
1299 iph->ttl = ttl; 1302 iph->ttl = ttl;
1300 iph->protocol = sk->sk_protocol; 1303 iph->protocol = sk->sk_protocol;
1301 iph->saddr = rt->rt_src; 1304 iph->saddr = rt->rt_src;
@@ -1308,7 +1311,7 @@ int ip_push_pending_frames(struct sock *sk)
1308 * on dst refcount 1311 * on dst refcount
1309 */ 1312 */
1310 inet->cork.dst = NULL; 1313 inet->cork.dst = NULL;
1311 skb_dst_set(skb, &rt->u.dst); 1314 skb_dst_set(skb, &rt->dst);
1312 1315
1313 if (iph->protocol == IPPROTO_ICMP) 1316 if (iph->protocol == IPPROTO_ICMP)
1314 icmp_out_count(net, ((struct icmphdr *) 1317 icmp_out_count(net, ((struct icmphdr *)
@@ -1445,7 +1448,3 @@ void __init ip_init(void)
1445 igmp_mc_proc_init(); 1448 igmp_mc_proc_init();
1446#endif 1449#endif
1447} 1450}
1448
1449EXPORT_SYMBOL(ip_generic_getfrag);
1450EXPORT_SYMBOL(ip_queue_xmit);
1451EXPORT_SYMBOL(ip_send_check);
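Besides the mechanical rt->u.dst renames, two substantive changes run through ip_output.c above: the EXPORT_SYMBOL() lines move next to their definitions, and ip_append_data() now keys the UFO path off the actual state of the queue tail (is it already a GSO skb?) rather than mere queue non-emptiness, so small non-GSO appends stay on the ordinary copy path. The new condition, isolated (parameter names are illustrative):

    #include <stdbool.h>

    /* Mirrors: ((length > mtu) || (skb && skb_is_gso(skb))) &&
     *          sk_protocol == IPPROTO_UDP && (features & NETIF_F_UFO) */
    static bool use_ufo_path(unsigned int length, unsigned int mtu,
                             bool tail_is_gso, bool is_udp,
                             bool dev_has_ufo)
    {
        return (length > mtu || tail_is_gso) && is_udp && dev_has_ufo;
    }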
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ce231780a2b1..6c40a8c46e79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -239,7 +239,16 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
239 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
240 */ 240 */
241struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain *ip_ra_chain;
242DEFINE_RWLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243
244
245static void ip_ra_destroy_rcu(struct rcu_head *head)
246{
247 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
248
249 sock_put(ra->saved_sk);
250 kfree(ra);
251}
243 252
244int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
245 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
@@ -251,35 +260,42 @@ int ip_ra_control(struct sock *sk, unsigned char on,
251 260
252 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 261 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
253 262
254 write_lock_bh(&ip_ra_lock); 263 spin_lock_bh(&ip_ra_lock);
255 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 264 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
256 if (ra->sk == sk) { 265 if (ra->sk == sk) {
257 if (on) { 266 if (on) {
258 write_unlock_bh(&ip_ra_lock); 267 spin_unlock_bh(&ip_ra_lock);
259 kfree(new_ra); 268 kfree(new_ra);
260 return -EADDRINUSE; 269 return -EADDRINUSE;
261 } 270 }
262 *rap = ra->next; 271 /* don't let ip_call_ra_chain() use sk again */
263 write_unlock_bh(&ip_ra_lock); 272 ra->sk = NULL;
273 rcu_assign_pointer(*rap, ra->next);
274 spin_unlock_bh(&ip_ra_lock);
264 275
265 if (ra->destructor) 276 if (ra->destructor)
266 ra->destructor(sk); 277 ra->destructor(sk);
267 sock_put(sk); 278 /*
268 kfree(ra); 279 * Delay sock_put(sk) and kfree(ra) by one rcu grace
280 * period. This guarantees ip_call_ra_chain() doesn't need
281 * to mess with socket refcounts.
282 */
283 ra->saved_sk = sk;
284 call_rcu(&ra->rcu, ip_ra_destroy_rcu);
269 return 0; 285 return 0;
270 } 286 }
271 } 287 }
272 if (new_ra == NULL) { 288 if (new_ra == NULL) {
273 write_unlock_bh(&ip_ra_lock); 289 spin_unlock_bh(&ip_ra_lock);
274 return -ENOBUFS; 290 return -ENOBUFS;
275 } 291 }
276 new_ra->sk = sk; 292 new_ra->sk = sk;
277 new_ra->destructor = destructor; 293 new_ra->destructor = destructor;
278 294
279 new_ra->next = ra; 295 new_ra->next = ra;
280 *rap = new_ra; 296 rcu_assign_pointer(*rap, new_ra);
281 sock_hold(sk); 297 sock_hold(sk);
282 write_unlock_bh(&ip_ra_lock); 298 spin_unlock_bh(&ip_ra_lock);
283 299
284 return 0; 300 return 0;
285} 301}
@@ -449,7 +465,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
449 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 465 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
450 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | 466 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
451 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | 467 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
452 (1<<IP_MINTTL))) || 468 (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
453 optname == IP_MULTICAST_TTL || 469 optname == IP_MULTICAST_TTL ||
454 optname == IP_MULTICAST_ALL || 470 optname == IP_MULTICAST_ALL ||
455 optname == IP_MULTICAST_LOOP || 471 optname == IP_MULTICAST_LOOP ||
@@ -572,6 +588,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
572 } 588 }
573 inet->hdrincl = val ? 1 : 0; 589 inet->hdrincl = val ? 1 : 0;
574 break; 590 break;
591 case IP_NODEFRAG:
592 if (sk->sk_type != SOCK_RAW) {
593 err = -ENOPROTOOPT;
594 break;
595 }
596 inet->nodefrag = val ? 1 : 0;
597 break;
575 case IP_MTU_DISCOVER: 598 case IP_MTU_DISCOVER:
576 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) 599 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
577 goto e_inval; 600 goto e_inval;
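The ip_ra_control() rework above is the writer half of the RCU conversion, and three steps carry the weight: clear ra->sk so concurrent walkers skip the entry, unlink with rcu_assign_pointer(), and push both the kfree() and the final sock_put() past a grace period, stashing the socket in ra->saved_sk because the callback only receives the rcu_head. A generic sketch of that stash-and-defer shape as a liburcu userspace analogue (an assumption: liburcu's call_rcu() and caa_container_of() available; `session`, `entry`, and session_put() are illustrative stand-ins, the latter declared but not defined here):

    #define _LGPL_SOURCE
    #include <urcu.h>
    #include <stdlib.h>

    struct session;                              /* stand-in for struct sock */
    extern void session_put(struct session *s);  /* stand-in for sock_put() */

    struct entry {
        struct session *owner;        /* cleared first: readers skip us */
        struct session *saved_owner;  /* kept alive for the callback */
        struct entry *next;
        struct rcu_head rcu;
    };

    static void entry_destroy_rcu(struct rcu_head *head)
    {
        struct entry *e = caa_container_of(head, struct entry, rcu);

        session_put(e->saved_owner);  /* dropped after the grace period */
        free(e);
    }

    /* Caller holds the writer lock; *slot currently points at e. */
    static void entry_unlink(struct entry **slot, struct entry *e)
    {
        e->saved_owner = e->owner;
        e->owner = NULL;                      /* readers must tolerate this */
        rcu_assign_pointer(*slot, e->next);
        call_rcu(&e->rcu, entry_destroy_rcu);
    }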
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b9d84e800cf4..3a6e1ec5e9ae 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -665,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
665 memcpy(e, ic_req_params, sizeof(ic_req_params)); 665 memcpy(e, ic_req_params, sizeof(ic_req_params));
666 e += sizeof(ic_req_params); 666 e += sizeof(ic_req_params);
667 667
668 if (ic_host_name_set) {
669 *e++ = 12; /* host-name */
670 len = strlen(utsname()->nodename);
671 *e++ = len;
672 memcpy(e, utsname()->nodename, len);
673 e += len;
674 }
668 if (*vendor_class_identifier) { 675 if (*vendor_class_identifier) {
669 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", 676 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
670 vendor_class_identifier); 677 vendor_class_identifier);
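The new ipconfig.c block emits DHCP option 12 (host name) in the same tag/length/value layout as the surrounding options. An illustrative encoder of those three steps (the caller must guarantee the name fits in 255 bytes and the buffer has room, just as the kernel code implicitly does):

    #include <stdint.h>
    #include <string.h>

    static uint8_t *dhcp_put_hostname(uint8_t *e, const char *name)
    {
        size_t len = strlen(name);

        *e++ = 12;                 /* DHCP option tag: host-name */
        *e++ = (uint8_t)len;       /* one-byte length */
        memcpy(e, name, len);      /* value bytes, no NUL terminator */
        return e + len;
    }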
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7fd636711037..ec036731a70b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -435,7 +435,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
435 goto tx_error_icmp; 435 goto tx_error_icmp;
436 } 436 }
437 } 437 }
438 tdev = rt->u.dst.dev; 438 tdev = rt->dst.dev;
439 439
440 if (tdev == dev) { 440 if (tdev == dev) {
441 ip_rt_put(rt); 441 ip_rt_put(rt);
@@ -446,7 +446,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
446 df |= old_iph->frag_off & htons(IP_DF); 446 df |= old_iph->frag_off & htons(IP_DF);
447 447
448 if (df) { 448 if (df) {
449 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); 449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 450
451 if (mtu < 68) { 451 if (mtu < 68) {
452 stats->collisions++; 452 stats->collisions++;
@@ -503,7 +503,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
503 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 503 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
504 IPSKB_REROUTED); 504 IPSKB_REROUTED);
505 skb_dst_drop(skb); 505 skb_dst_drop(skb);
506 skb_dst_set(skb, &rt->u.dst); 506 skb_dst_set(skb, &rt->dst);
507 507
508 /* 508 /*
509 * Push down and install the IPIP header. 509 * Push down and install the IPIP header.
@@ -552,7 +552,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
552 .proto = IPPROTO_IPIP }; 552 .proto = IPPROTO_IPIP };
553 struct rtable *rt; 553 struct rtable *rt;
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
555 tdev = rt->u.dst.dev; 555 tdev = rt->dst.dev;
556 ip_rt_put(rt); 556 ip_rt_put(rt);
557 } 557 }
558 dev->flags |= IFF_POINTOPOINT; 558 dev->flags |= IFF_POINTOPOINT;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 45889103b3e2..179fcab866fc 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -267,8 +267,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
267{ 267{
268 struct mr_table *mrt, *next; 268 struct mr_table *mrt, *next;
269 269
270 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) 270 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271 list_del(&mrt->list);
271 kfree(mrt); 272 kfree(mrt);
273 }
272 fib_rules_unregister(net->ipv4.mr_rules_ops); 274 fib_rules_unregister(net->ipv4.mr_rules_ops);
273} 275}
274#else 276#else
@@ -440,8 +442,10 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
440 int err; 442 int err;
441 443
442 err = ipmr_fib_lookup(net, &fl, &mrt); 444 err = ipmr_fib_lookup(net, &fl, &mrt);
443 if (err < 0) 445 if (err < 0) {
446 kfree_skb(skb);
444 return err; 447 return err;
448 }
445 449
446 read_lock(&mrt_lock); 450 read_lock(&mrt_lock);
447 dev->stats.tx_bytes += skb->len; 451 dev->stats.tx_bytes += skb->len;
@@ -1551,9 +1555,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1551 goto out_free; 1555 goto out_free;
1552 } 1556 }
1553 1557
1554 dev = rt->u.dst.dev; 1558 dev = rt->dst.dev;
1555 1559
1556 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { 1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1557 /* Do not fragment multicasts. Alas, IPv4 does not 1561 /* Do not fragment multicasts. Alas, IPv4 does not
1558 allow to send ICMP, so that packets will disappear 1562 allow to send ICMP, so that packets will disappear
1559 to blackhole. 1563 to blackhole.
@@ -1564,7 +1568,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1564 goto out_free; 1568 goto out_free;
1565 } 1569 }
1566 1570
1567 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; 1571 encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1568 1572
1569 if (skb_cow(skb, encap)) { 1573 if (skb_cow(skb, encap)) {
1570 ip_rt_put(rt); 1574 ip_rt_put(rt);
@@ -1575,7 +1579,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1575 vif->bytes_out += skb->len; 1579 vif->bytes_out += skb->len;
1576 1580
1577 skb_dst_drop(skb); 1581 skb_dst_drop(skb);
1578 skb_dst_set(skb, &rt->u.dst); 1582 skb_dst_set(skb, &rt->dst);
1579 ip_decrease_ttl(ip_hdr(skb)); 1583 ip_decrease_ttl(ip_hdr(skb));
1580 1584
1581 /* FIXME: forward and output firewalls used to be called here. 1585 /* FIXME: forward and output firewalls used to be called here.
@@ -1726,8 +1730,10 @@ int ip_mr_input(struct sk_buff *skb)
1726 goto dont_forward; 1730 goto dont_forward;
1727 1731
1728 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1729 if (err < 0) 1733 if (err < 0) {
1734 kfree_skb(skb);
1730 return err; 1735 return err;
1736 }
1731 1737
1732 if (!local) { 1738 if (!local) {
1733 if (IPCB(skb)->opt.router_alert) { 1739 if (IPCB(skb)->opt.router_alert) {
@@ -1911,7 +1917,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1911 struct rtattr *mp_head; 1917 struct rtattr *mp_head;
1912 1918
1913 /* If cache is unresolved, don't try to parse IIF and OIF */ 1919 /* If cache is unresolved, don't try to parse IIF and OIF */
1914 if (c->mfc_parent > MAXVIFS) 1920 if (c->mfc_parent >= MAXVIFS)
1915 return -ENOENT; 1921 return -ENOENT;
1916 1922
1917 if (VIF_EXISTS(mrt, c->mfc_parent)) 1923 if (VIF_EXISTS(mrt, c->mfc_parent))
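Of the ipmr.c fixes above, two plug skb leaks on ipmr_fib_lookup() error paths, and the one-character `>` to `>=` change is a classic off-by-one: a parent index into a MAXVIFS-sized VIF table is only valid for 0..MAXVIFS-1, so the unresolved-cache sentinel test must reject the value MAXVIFS itself. In miniature:

    #define MAXVIFS 32

    /* Valid indexes into vif_table[MAXVIFS] are 0 .. MAXVIFS-1; testing
     * "idx > MAXVIFS" lets MAXVIFS through and reads one element past
     * the end of the array. */
    static int vif_index_valid(unsigned int idx)
    {
        return idx < MAXVIFS;
    }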
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 07de855e2175..d88a46c54fd1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
43 43
44 /* Drop old route. */ 44 /* Drop old route. */
45 skb_dst_drop(skb); 45 skb_dst_drop(skb);
46 skb_dst_set(skb, &rt->u.dst); 46 skb_dst_set(skb, &rt->dst);
47 } else { 47 } else {
48 /* non-local src, find valid iif to satisfy 48 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 49 * rp-filter when calling ip_route_input. */
@@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
53 53
54 orefdst = skb->_skb_refdst; 54 orefdst = skb->_skb_refdst;
55 if (ip_route_input(skb, iph->daddr, iph->saddr, 55 if (ip_route_input(skb, iph->daddr, iph->saddr,
56 RT_TOS(iph->tos), rt->u.dst.dev) != 0) { 56 RT_TOS(iph->tos), rt->dst.dev) != 0) {
57 dst_release(&rt->u.dst); 57 dst_release(&rt->dst);
58 return -1; 58 return -1;
59 } 59 }
60 dst_release(&rt->u.dst); 60 dst_release(&rt->dst);
61 refdst_drop(orefdst); 61 refdst_drop(orefdst);
62 } 62 }
63 63
@@ -212,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
212 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, 212 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
213 skb->len - dataoff, 0); 213 skb->len - dataoff, 0);
214 skb->ip_summed = CHECKSUM_NONE; 214 skb->ip_summed = CHECKSUM_NONE;
215 csum = __skb_checksum_complete_head(skb, dataoff + len); 215 return __skb_checksum_complete_head(skb, dataoff + len);
216 if (!csum)
217 skb->ip_summed = CHECKSUM_UNNECESSARY;
218 } 216 }
219 return csum; 217 return csum;
220} 218}
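The nf_ip_checksum_partial() change above stops promoting a partially verified packet to CHECKSUM_UNNECESSARY: only `len` bytes past `dataoff` were folded in, so a later consumer that trusts the flag would skip verifying the rest of the packet. The general hazard, in miniature (illustrative types, a byte sum standing in for the Internet checksum):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct pkt {
        const uint8_t *data;
        size_t len;
        bool csum_verified;        /* analogue of CHECKSUM_UNNECESSARY */
    };

    static uint32_t sum_bytes(const uint8_t *p, size_t n)
    {
        uint32_t s = 0;
        while (n--)
            s += *p++;
        return s;
    }

    /* Verify only the first n bytes.  Caching success in csum_verified
     * is safe only when the whole packet was covered -- caching it for
     * a prefix check is exactly the bug the hunk above removes. */
    static bool check_prefix(struct pkt *pkt, size_t n, uint32_t expect)
    {
        bool ok = sum_bytes(pkt->data, n) == expect;

        if (ok && n == pkt->len)
            pkt->csum_verified = true;
        return ok;
    }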
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1ac01b128621..16c0ba0a2728 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -758,7 +758,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
758 * about). 758 * about).
759 */ 759 */
760 countersize = sizeof(struct xt_counters) * private->number; 760 countersize = sizeof(struct xt_counters) * private->number;
761 counters = vmalloc_node(countersize, numa_node_id()); 761 counters = vmalloc(countersize);
762 762
763 if (counters == NULL) 763 if (counters == NULL)
764 return ERR_PTR(-ENOMEM); 764 return ERR_PTR(-ENOMEM);
@@ -1005,8 +1005,7 @@ static int __do_replace(struct net *net, const char *name,
1005 struct arpt_entry *iter; 1005 struct arpt_entry *iter;
1006 1006
1007 ret = 0; 1007 ret = 0;
1008 counters = vmalloc_node(num_counters * sizeof(struct xt_counters), 1008 counters = vmalloc(num_counters * sizeof(struct xt_counters));
1009 numa_node_id());
1010 if (!counters) { 1009 if (!counters) {
1011 ret = -ENOMEM; 1010 ret = -ENOMEM;
1012 goto out; 1011 goto out;
@@ -1159,7 +1158,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1159 if (len != size + num_counters * sizeof(struct xt_counters)) 1158 if (len != size + num_counters * sizeof(struct xt_counters))
1160 return -EINVAL; 1159 return -EINVAL;
1161 1160
1162 paddc = vmalloc_node(len - size, numa_node_id()); 1161 paddc = vmalloc(len - size);
1163 if (!paddc) 1162 if (!paddc)
1164 return -ENOMEM; 1163 return -ENOMEM;
1165 1164
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index a4e5fc5df4bf..d2c1311cb28d 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -42,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
42 42
43static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; 43static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
44static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; 44static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
45static DEFINE_RWLOCK(queue_lock); 45static DEFINE_SPINLOCK(queue_lock);
46static int peer_pid __read_mostly; 46static int peer_pid __read_mostly;
47static unsigned int copy_range __read_mostly; 47static unsigned int copy_range __read_mostly;
48static unsigned int queue_total; 48static unsigned int queue_total;
@@ -72,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
72 break; 72 break;
73 73
74 case IPQ_COPY_PACKET: 74 case IPQ_COPY_PACKET:
75 copy_mode = mode; 75 if (range > 0xFFFF)
76 range = 0xFFFF;
76 copy_range = range; 77 copy_range = range;
77 if (copy_range > 0xFFFF) 78 copy_mode = mode;
78 copy_range = 0xFFFF;
79 break; 79 break;
80 80
81 default: 81 default:
@@ -101,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
101{ 101{
102 struct nf_queue_entry *entry = NULL, *i; 102 struct nf_queue_entry *entry = NULL, *i;
103 103
104 write_lock_bh(&queue_lock); 104 spin_lock_bh(&queue_lock);
105 105
106 list_for_each_entry(i, &queue_list, list) { 106 list_for_each_entry(i, &queue_list, list) {
107 if ((unsigned long)i == id) { 107 if ((unsigned long)i == id) {
@@ -115,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
115 queue_total--; 115 queue_total--;
116 } 116 }
117 117
118 write_unlock_bh(&queue_lock); 118 spin_unlock_bh(&queue_lock);
119 return entry; 119 return entry;
120} 120}
121 121
@@ -136,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
136static void 136static void
137ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 137ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
138{ 138{
139 write_lock_bh(&queue_lock); 139 spin_lock_bh(&queue_lock);
140 __ipq_flush(cmpfn, data); 140 __ipq_flush(cmpfn, data);
141 write_unlock_bh(&queue_lock); 141 spin_unlock_bh(&queue_lock);
142} 142}
143 143
144static struct sk_buff * 144static struct sk_buff *
@@ -152,9 +152,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
152 struct nlmsghdr *nlh; 152 struct nlmsghdr *nlh;
153 struct timeval tv; 153 struct timeval tv;
154 154
155 read_lock_bh(&queue_lock); 155 switch (ACCESS_ONCE(copy_mode)) {
156
157 switch (copy_mode) {
158 case IPQ_COPY_META: 156 case IPQ_COPY_META:
159 case IPQ_COPY_NONE: 157 case IPQ_COPY_NONE:
160 size = NLMSG_SPACE(sizeof(*pmsg)); 158 size = NLMSG_SPACE(sizeof(*pmsg));
@@ -162,26 +160,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
162 160
163 case IPQ_COPY_PACKET: 161 case IPQ_COPY_PACKET:
164 if (entry->skb->ip_summed == CHECKSUM_PARTIAL && 162 if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
165 (*errp = skb_checksum_help(entry->skb))) { 163 (*errp = skb_checksum_help(entry->skb)))
166 read_unlock_bh(&queue_lock);
167 return NULL; 164 return NULL;
168 } 165
169 if (copy_range == 0 || copy_range > entry->skb->len) 166 data_len = ACCESS_ONCE(copy_range);
167 if (data_len == 0 || data_len > entry->skb->len)
170 data_len = entry->skb->len; 168 data_len = entry->skb->len;
171 else
172 data_len = copy_range;
173 169
174 size = NLMSG_SPACE(sizeof(*pmsg) + data_len); 170 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
175 break; 171 break;
176 172
177 default: 173 default:
178 *errp = -EINVAL; 174 *errp = -EINVAL;
179 read_unlock_bh(&queue_lock);
180 return NULL; 175 return NULL;
181 } 176 }
182 177
183 read_unlock_bh(&queue_lock);
184
185 skb = alloc_skb(size, GFP_ATOMIC); 178 skb = alloc_skb(size, GFP_ATOMIC);
186 if (!skb) 179 if (!skb)
187 goto nlmsg_failure; 180 goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
242 if (nskb == NULL) 235 if (nskb == NULL)
243 return status; 236 return status;
244 237
245 write_lock_bh(&queue_lock); 238 spin_lock_bh(&queue_lock);
246 239
247 if (!peer_pid) 240 if (!peer_pid)
248 goto err_out_free_nskb; 241 goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
266 259
267 __ipq_enqueue_entry(entry); 260 __ipq_enqueue_entry(entry);
268 261
269 write_unlock_bh(&queue_lock); 262 spin_unlock_bh(&queue_lock);
270 return status; 263 return status;
271 264
272err_out_free_nskb: 265err_out_free_nskb:
273 kfree_skb(nskb); 266 kfree_skb(nskb);
274 267
275err_out_unlock: 268err_out_unlock:
276 write_unlock_bh(&queue_lock); 269 spin_unlock_bh(&queue_lock);
277 return status; 270 return status;
278} 271}
279 272
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
342{ 335{
343 int status; 336 int status;
344 337
345 write_lock_bh(&queue_lock); 338 spin_lock_bh(&queue_lock);
346 status = __ipq_set_mode(mode, range); 339 status = __ipq_set_mode(mode, range);
347 write_unlock_bh(&queue_lock); 340 spin_unlock_bh(&queue_lock);
348 return status; 341 return status;
349} 342}
350 343
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
440 if (security_netlink_recv(skb, CAP_NET_ADMIN)) 433 if (security_netlink_recv(skb, CAP_NET_ADMIN))
441 RCV_SKB_FAIL(-EPERM); 434 RCV_SKB_FAIL(-EPERM);
442 435
443 write_lock_bh(&queue_lock); 436 spin_lock_bh(&queue_lock);
444 437
445 if (peer_pid) { 438 if (peer_pid) {
446 if (peer_pid != pid) { 439 if (peer_pid != pid) {
447 write_unlock_bh(&queue_lock); 440 spin_unlock_bh(&queue_lock);
448 RCV_SKB_FAIL(-EBUSY); 441 RCV_SKB_FAIL(-EBUSY);
449 } 442 }
450 } else { 443 } else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
452 peer_pid = pid; 445 peer_pid = pid;
453 } 446 }
454 447
455 write_unlock_bh(&queue_lock); 448 spin_unlock_bh(&queue_lock);
456 449
457 status = ipq_receive_peer(NLMSG_DATA(nlh), type, 450 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
458 nlmsglen - NLMSG_LENGTH(0)); 451 nlmsglen - NLMSG_LENGTH(0));
@@ -497,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
497 struct netlink_notify *n = ptr; 490 struct netlink_notify *n = ptr;
498 491
499 if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { 492 if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
500 write_lock_bh(&queue_lock); 493 spin_lock_bh(&queue_lock);
501 if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) 494 if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
502 __ipq_reset(); 495 __ipq_reset();
503 write_unlock_bh(&queue_lock); 496 spin_unlock_bh(&queue_lock);
504 } 497 }
505 return NOTIFY_DONE; 498 return NOTIFY_DONE;
506} 499}
@@ -527,7 +520,7 @@ static ctl_table ipq_table[] = {
527#ifdef CONFIG_PROC_FS 520#ifdef CONFIG_PROC_FS
528static int ip_queue_show(struct seq_file *m, void *v) 521static int ip_queue_show(struct seq_file *m, void *v)
529{ 522{
530 read_lock_bh(&queue_lock); 523 spin_lock_bh(&queue_lock);
531 524
532 seq_printf(m, 525 seq_printf(m,
533 "Peer PID : %d\n" 526 "Peer PID : %d\n"
@@ -545,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
545 queue_dropped, 538 queue_dropped,
546 queue_user_dropped); 539 queue_user_dropped);
547 540
548 read_unlock_bh(&queue_lock); 541 spin_unlock_bh(&queue_lock);
549 return 0; 542 return 0;
550} 543}
551 544
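
The ip_queue.c conversion works because the only lockless reader, ipq_build_packet_message(), now takes a single ACCESS_ONCE() snapshot of copy_mode and copy_range; with no read-side critical section left, the rwlock degenerates into a plain writer-side spinlock. A minimal user-space sketch of the same pattern, with pthreads and a volatile cast standing in for the kernel primitives (all names below are illustrative, not from the patch):

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-in for the kernel's ACCESS_ONCE(): one untorn, unmerged load. */
    #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

    static pthread_spinlock_t conf_lock;   /* serializes writers only */
    static unsigned int copy_mode;         /* word-sized, loads don't tear */
    static unsigned int copy_range;

    static void set_mode(unsigned int mode, unsigned int range)
    {
        pthread_spin_lock(&conf_lock);     /* writers still exclude each other */
        copy_mode = mode;
        copy_range = range;
        pthread_spin_unlock(&conf_lock);
    }

    static unsigned int effective_len(unsigned int pkt_len)
    {
        /* Reader takes no lock: a one-shot snapshot of each word is enough,
         * exactly like the ACCESS_ONCE(copy_range) load in the hunk above. */
        unsigned int range = ACCESS_ONCE(copy_range);

        if (range == 0 || range > pkt_len)
            return pkt_len;
        return range;
    }

    int main(void)
    {
        pthread_spin_init(&conf_lock, PTHREAD_PROCESS_PRIVATE);
        set_mode(2, 96);                   /* think IPQ_COPY_PACKET, range 96 */
        printf("copy %u of 1500 bytes\n", effective_len(1500));
        return 0;
    }

The relaxation this buys is that a reader may observe mode and range from two different configurations; the patch accepts the same skew.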
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 63958f3394a5..b38c11810c65 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -336,7 +336,7 @@ ipt_do_table(struct sk_buff *skb,
336 cpu = smp_processor_id(); 336 cpu = smp_processor_id();
337 table_base = private->entries[cpu]; 337 table_base = private->entries[cpu];
338 jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; 338 jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
339 stackptr = &private->stackptr[cpu]; 339 stackptr = per_cpu_ptr(private->stackptr, cpu);
340 origptr = *stackptr; 340 origptr = *stackptr;
341 341
342 e = get_entry(table_base, private->hook_entry[hook]); 342 e = get_entry(table_base, private->hook_entry[hook]);
@@ -928,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
928 (other than comefrom, which userspace doesn't care 928 (other than comefrom, which userspace doesn't care
929 about). */ 929 about). */
930 countersize = sizeof(struct xt_counters) * private->number; 930 countersize = sizeof(struct xt_counters) * private->number;
931 counters = vmalloc_node(countersize, numa_node_id()); 931 counters = vmalloc(countersize);
932 932
933 if (counters == NULL) 933 if (counters == NULL)
934 return ERR_PTR(-ENOMEM); 934 return ERR_PTR(-ENOMEM);
@@ -1352,7 +1352,7 @@ do_add_counters(struct net *net, const void __user *user,
1352 if (len != size + num_counters * sizeof(struct xt_counters)) 1352 if (len != size + num_counters * sizeof(struct xt_counters))
1353 return -EINVAL; 1353 return -EINVAL;
1354 1354
1355 paddc = vmalloc_node(len - size, numa_node_id()); 1355 paddc = vmalloc(len - size);
1356 if (!paddc) 1356 if (!paddc)
1357 return -ENOMEM; 1357 return -ENOMEM;
1358 1358
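
Two independent cleanups in the ip_tables.c hunks: the counter buffers drop vmalloc_node() for plain vmalloc(), since node-local placement buys nothing for a buffer that is immediately copied to or from user space, and stackptr must now be dereferenced with per_cpu_ptr() because it is a true per-cpu allocation rather than a flat array indexed by CPU. Per-cpu data behaves roughly like this user-space analogue, with sched_getcpu() standing in for per_cpu_ptr() (illustrative only; the kernel does offset arithmetic instead of array indexing):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct pcpu_stack {
        unsigned int stackptr;             /* one traversal depth per CPU */
    };

    static struct pcpu_stack *pcpu;        /* one slot per possible CPU */

    /* Poor man's per_cpu_ptr(): index the array by the current CPU id. */
    static struct pcpu_stack *this_cpu_slot(void)
    {
        int cpu = sched_getcpu();

        if (cpu < 0)
            cpu = 0;                       /* fallback for odd libcs */
        return &pcpu[cpu];
    }

    int main(void)
    {
        long ncpu = sysconf(_SC_NPROCESSORS_CONF);

        pcpu = calloc(ncpu > 0 ? ncpu : 1, sizeof(*pcpu));
        if (!pcpu)
            return 1;

        this_cpu_slot()->stackptr++;       /* like *per_cpu_ptr(stackptr, cpu) */
        printf("cpu %d depth %u\n", sched_getcpu(), this_cpu_slot()->stackptr);
        free(pcpu);
        return 0;
    }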
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index f91c94b9a790..64d0875f5192 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -53,12 +53,13 @@ struct clusterip_config {
53#endif 53#endif
54 enum clusterip_hashmode hash_mode; /* which hashing mode */ 54 enum clusterip_hashmode hash_mode; /* which hashing mode */
55 u_int32_t hash_initval; /* hash initialization */ 55 u_int32_t hash_initval; /* hash initialization */
56 struct rcu_head rcu;
56}; 57};
57 58
58static LIST_HEAD(clusterip_configs); 59static LIST_HEAD(clusterip_configs);
59 60
60/* clusterip_lock protects the clusterip_configs list */ 61/* clusterip_lock protects the clusterip_configs list */
61static DEFINE_RWLOCK(clusterip_lock); 62static DEFINE_SPINLOCK(clusterip_lock);
62 63
63#ifdef CONFIG_PROC_FS 64#ifdef CONFIG_PROC_FS
64static const struct file_operations clusterip_proc_fops; 65static const struct file_operations clusterip_proc_fops;
@@ -71,11 +72,17 @@ clusterip_config_get(struct clusterip_config *c)
71 atomic_inc(&c->refcount); 72 atomic_inc(&c->refcount);
72} 73}
73 74
75
76static void clusterip_config_rcu_free(struct rcu_head *head)
77{
78 kfree(container_of(head, struct clusterip_config, rcu));
79}
80
74static inline void 81static inline void
75clusterip_config_put(struct clusterip_config *c) 82clusterip_config_put(struct clusterip_config *c)
76{ 83{
77 if (atomic_dec_and_test(&c->refcount)) 84 if (atomic_dec_and_test(&c->refcount))
78 kfree(c); 85 call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
79} 86}
80 87
81/* decrease the count of entries using/referencing this config. If last 88/* decrease the count of entries using/referencing this config. If last
@@ -84,10 +91,11 @@ clusterip_config_put(struct clusterip_config *c)
84static inline void 91static inline void
85clusterip_config_entry_put(struct clusterip_config *c) 92clusterip_config_entry_put(struct clusterip_config *c)
86{ 93{
87 write_lock_bh(&clusterip_lock); 94 local_bh_disable();
88 if (atomic_dec_and_test(&c->entries)) { 95 if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
89 list_del(&c->list); 96 list_del_rcu(&c->list);
90 write_unlock_bh(&clusterip_lock); 97 spin_unlock(&clusterip_lock);
98 local_bh_enable();
91 99
92 dev_mc_del(c->dev, c->clustermac); 100 dev_mc_del(c->dev, c->clustermac);
93 dev_put(c->dev); 101 dev_put(c->dev);
@@ -100,7 +108,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
100#endif 108#endif
101 return; 109 return;
102 } 110 }
103 write_unlock_bh(&clusterip_lock); 111 local_bh_enable();
104} 112}
105 113
106static struct clusterip_config * 114static struct clusterip_config *
@@ -108,7 +116,7 @@ __clusterip_config_find(__be32 clusterip)
108{ 116{
109 struct clusterip_config *c; 117 struct clusterip_config *c;
110 118
111 list_for_each_entry(c, &clusterip_configs, list) { 119 list_for_each_entry_rcu(c, &clusterip_configs, list) {
112 if (c->clusterip == clusterip) 120 if (c->clusterip == clusterip)
113 return c; 121 return c;
114 } 122 }
@@ -121,16 +129,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
121{ 129{
122 struct clusterip_config *c; 130 struct clusterip_config *c;
123 131
124 read_lock_bh(&clusterip_lock); 132 rcu_read_lock_bh();
125 c = __clusterip_config_find(clusterip); 133 c = __clusterip_config_find(clusterip);
126 if (!c) { 134 if (c) {
127 read_unlock_bh(&clusterip_lock); 135 if (unlikely(!atomic_inc_not_zero(&c->refcount)))
128 return NULL; 136 c = NULL;
137 else if (entry)
138 atomic_inc(&c->entries);
129 } 139 }
130 atomic_inc(&c->refcount); 140 rcu_read_unlock_bh();
131 if (entry)
132 atomic_inc(&c->entries);
133 read_unlock_bh(&clusterip_lock);
134 141
135 return c; 142 return c;
136} 143}
@@ -181,9 +188,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
181 } 188 }
182#endif 189#endif
183 190
184 write_lock_bh(&clusterip_lock); 191 spin_lock_bh(&clusterip_lock);
185 list_add(&c->list, &clusterip_configs); 192 list_add_rcu(&c->list, &clusterip_configs);
186 write_unlock_bh(&clusterip_lock); 193 spin_unlock_bh(&clusterip_lock);
187 194
188 return c; 195 return c;
189} 196}
@@ -733,6 +740,9 @@ static void __exit clusterip_tg_exit(void)
733#endif 740#endif
734 nf_unregister_hook(&cip_arp_ops); 741 nf_unregister_hook(&cip_arp_ops);
735 xt_unregister_target(&clusterip_tg_reg); 742 xt_unregister_target(&clusterip_tg_reg);
743
744 /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
745 rcu_barrier_bh();
736} 746}
737 747
738module_init(clusterip_tg_init); 748module_init(clusterip_tg_init);
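
The CLUSTERIP conversion is the textbook rwlock-to-RCU recipe: readers traverse clusterip_configs under rcu_read_lock_bh(), so they can race with an object whose last reference is being dropped; atomic_inc_not_zero() is what keeps that race safe, and the actual kfree() is deferred through call_rcu_bh(), with rcu_barrier_bh() at module unload so no callback outlives the module text. The refcount half of the pattern, reduced to standard C11 atomics (a sketch with hypothetical names):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct config {
        atomic_int refcount;               /* 0 means: being freed, hands off */
    };

    /* Equivalent of the kernel's atomic_inc_not_zero(): take a reference
     * only if somebody else still holds one. */
    static bool get_config(struct config *c)
    {
        int old = atomic_load(&c->refcount);

        while (old != 0) {
            if (atomic_compare_exchange_weak(&c->refcount, &old, old + 1))
                return true;               /* ref taken; object stays alive */
        }
        return false;                      /* lost the race with the freer */
    }

    static bool put_config(struct config *c)
    {
        /* Dropping the last reference is where the patch queues
         * call_rcu_bh(&c->rcu, clusterip_config_rcu_free). */
        return atomic_fetch_sub(&c->refcount, 1) == 1;
    }

    int main(void)
    {
        struct config c;

        atomic_init(&c.refcount, 1);
        printf("got: %d\n", get_config(&c));             /* 1 -> refcount 2 */
        put_config(&c);                                  /* 2 -> 1 */
        if (put_config(&c))                              /* 1 -> 0: last ref */
            printf("last ref dropped; free deferred via call_rcu_bh\n");
        printf("got after death: %d\n", get_config(&c)); /* 0 */
        return 0;
    }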
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 5234f4f3499a..915fc17d7ce2 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/if_arp.h>
16#include <linux/ip.h> 17#include <linux/ip.h>
17#include <net/icmp.h> 18#include <net/icmp.h>
18#include <net/udp.h> 19#include <net/udp.h>
@@ -363,6 +364,42 @@ static void dump_packet(const struct nf_loginfo *info,
363 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 364 /* maxlen = 230+ 91 + 230 + 252 = 803 */
364} 365}
365 366
367static void dump_mac_header(const struct nf_loginfo *info,
368 const struct sk_buff *skb)
369{
370 struct net_device *dev = skb->dev;
371 unsigned int logflags = 0;
372
373 if (info->type == NF_LOG_TYPE_LOG)
374 logflags = info->u.log.logflags;
375
376 if (!(logflags & IPT_LOG_MACDECODE))
377 goto fallback;
378
379 switch (dev->type) {
380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto));
384 return;
385 default:
386 break;
387 }
388
389fallback:
390 printk("MAC=");
391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i;
395
396 printk("%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p);
399 }
400 printk(" ");
401}
402
366static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
367 .type = NF_LOG_TYPE_LOG, 404 .type = NF_LOG_TYPE_LOG,
368 .u = { 405 .u = {
@@ -404,20 +441,9 @@ ipt_log_packet(u_int8_t pf,
404 } 441 }
405#endif 442#endif
406 443
407 if (in && !out) { 444 /* MAC logging for input path only. */
408 /* MAC logging for input chain only. */ 445 if (in && !out)
409 printk("MAC="); 446 dump_mac_header(loginfo, skb);
410 if (skb->dev && skb->dev->hard_header_len &&
411 skb->mac_header != skb->network_header) {
412 int i;
413 const unsigned char *p = skb_mac_header(skb);
414 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
415 printk("%02x%c", *p,
416 i==skb->dev->hard_header_len - 1
417 ? ' ':':');
418 } else
419 printk(" ");
420 }
421 447
422 dump_packet(loginfo, skb, 0); 448 dump_packet(loginfo, skb, 0);
423 printk("\n"); 449 printk("\n");
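
The extracted dump_mac_header() emits exactly the bytes the removed open-coded loop did, except when IPT_LOG_MACDECODE is set and the device is Ethernet, where it names the fields instead of hex-dumping the raw header. The two output formats side by side, as a stand-alone sketch over a 14-byte Ethernet header (the buffer contents are made up for the example):

    #include <stdio.h>

    /* Fallback format: the old colon-separated raw dump of the MAC header. */
    static void dump_raw(const unsigned char *p, unsigned int hard_header_len)
    {
        unsigned int i;

        printf("MAC=");
        printf("%02x", p[0]);
        for (i = 1; i < hard_header_len; i++)
            printf(":%02x", p[i]);
        printf(" ");
    }

    /* MACDECODE format: name the Ethernet fields (wire order: dst, src, proto). */
    static void dump_decoded(const unsigned char *p)
    {
        printf("MACSRC=%02x:%02x:%02x:%02x:%02x:%02x ",
               p[6], p[7], p[8], p[9], p[10], p[11]);
        printf("MACDST=%02x:%02x:%02x:%02x:%02x:%02x ",
               p[0], p[1], p[2], p[3], p[4], p[5]);
        printf("MACPROTO=%04x ", (p[12] << 8) | p[13]);
    }

    int main(void)
    {
        /* dst 00:11:22:33:44:55, src 66:77:88:99:aa:bb, proto 0x0800 (IPv4) */
        const unsigned char hdr[14] = {
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
            0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0x08, 0x00
        };

        dump_raw(hdr, sizeof(hdr));
        printf("\n");
        dump_decoded(hdr);
        printf("\n");
        return 0;
    }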
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index f43867d1697f..6cdb298f1035 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
48 48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING || 50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT); 51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
52 ct = nf_ct_get(skb, &ctinfo); 53 ct = nf_ct_get(skb, &ctinfo);
53 54
54 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
77 .table = "nat", 78 .table = "nat",
78 .hooks = (1 << NF_INET_PRE_ROUTING) | 79 .hooks = (1 << NF_INET_PRE_ROUTING) |
79 (1 << NF_INET_POST_ROUTING) | 80 (1 << NF_INET_POST_ROUTING) |
80 (1 << NF_INET_LOCAL_OUT), 81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
81 .checkentry = netmap_tg_check, 83 .checkentry = netmap_tg_check,
82 .me = THIS_MODULE 84 .me = THIS_MODULE
83}; 85};
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f5f4a888e4ec..bbbd2736c549 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -109,7 +109,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
109 addr_type = RTN_LOCAL; 109 addr_type = RTN_LOCAL;
110 110
111 /* ip_route_me_harder expects skb->dst to be set */ 111 /* ip_route_me_harder expects skb->dst to be set */
112 skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); 112 skb_dst_set_noref(nskb, skb_dst(oldskb));
113 113
114 if (ip_route_me_harder(nskb, addr_type)) 114 if (ip_route_me_harder(nskb, addr_type))
115 goto free_nskb; 115 goto free_nskb;
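
skb_dst_set_noref() lets the RST skb borrow oldskb's route for the duration of send_reset() instead of paying a dst_clone()/dst_release() pair; this is safe only because nskb is transmitted or freed before oldskb's reference can go away. A toy model of owned versus borrowed references (the real kernel encodes the noref bit in the dst pointer itself; everything below is illustrative):

    #include <stdio.h>

    struct dst {
        int refcnt;
    };

    struct skb {
        struct dst *dst;
        int dst_is_noref;                  /* borrowed: don't put on free */
    };

    /* Old way: take a full reference that must be released later. */
    static void skb_dst_set(struct skb *skb, struct dst *dst)
    {
        dst->refcnt++;
        skb->dst = dst;
        skb->dst_is_noref = 0;
    }

    /* New way: borrow the caller's reference for this call chain only. */
    static void skb_dst_set_noref(struct skb *skb, struct dst *dst)
    {
        skb->dst = dst;                    /* no refcount traffic on fast path */
        skb->dst_is_noref = 1;
    }

    static void skb_free(struct skb *skb)
    {
        if (skb->dst && !skb->dst_is_noref)
            skb->dst->refcnt--;            /* only owned refs are dropped */
        skb->dst = NULL;
    }

    int main(void)
    {
        struct dst d = { .refcnt = 1 };    /* oldskb's reference */
        struct skb nskb = { 0 };

        skb_dst_set_noref(&nskb, &d);      /* as in send_reset() above */
        skb_free(&nskb);
        printf("refcnt still %d, no atomic ops spent\n", d.refcnt);
        (void)skb_dst_set;                 /* kept for comparison */
        return 0;
    }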
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cb763ae9ed90..eab8de32f200 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,6 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
66 const struct net_device *out, 66 const struct net_device *out,
67 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
68{ 68{
69 struct inet_sock *inet = inet_sk(skb->sk);
70
71 if (inet && inet->nodefrag)
72 return NF_ACCEPT;
73
69#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 74#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
70#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) 75#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
71 /* Previously seen (loopback)? Ignore. Do this before 76 /* Previously seen (loopback)? Ignore. Do this before
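
The early return gives locally generated traffic a per-socket opt-out from conntrack defragmentation, driven by inet->nodefrag. That flag is set from user space through the IP_NODEFRAG socket option introduced alongside this hook change; a minimal sketch, assuming a libc old enough that the constant (22 in linux/in.h of this period) must be supplied by hand:

    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef IP_NODEFRAG
    #define IP_NODEFRAG 22                 /* from linux/in.h; assumption */
    #endif

    int main(void)
    {
        int one = 1;
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_UDP);

        if (fd < 0) {                      /* raw sockets need CAP_NET_RAW */
            perror("socket");
            return 1;
        }
        /* Ask the conntrack defrag hook to leave this socket's
         * traffic fragmented (ipv4_conntrack_defrag returns NF_ACCEPT). */
        if (setsockopt(fd, IPPROTO_IP, IP_NODEFRAG, &one, sizeof(one)) < 0)
            perror("setsockopt(IP_NODEFRAG)");
        return 0;
    }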
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4f8bddb760c9..c7719b283ada 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -742,7 +742,7 @@ static int __init nf_nat_init(void)
742 spin_unlock_bh(&nf_nat_lock); 742 spin_unlock_bh(&nf_nat_lock);
743 743
744 /* Initialize fake conntrack so that NAT will skip it */ 744 /* Initialize fake conntrack so that NAT will skip it */
745 nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; 745 nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
746 746
747 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); 747 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
748 748
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 98ed78281aee..ebbd319f62f5 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -28,7 +28,8 @@
28 28
29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ 29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
30 (1 << NF_INET_POST_ROUTING) | \ 30 (1 << NF_INET_POST_ROUTING) | \
31 (1 << NF_INET_LOCAL_OUT)) 31 (1 << NF_INET_LOCAL_OUT) | \
32 (1 << NF_INET_LOCAL_IN))
32 33
33static const struct xt_table nat_table = { 34static const struct xt_table nat_table = {
34 .name = "nat", 35 .name = "nat",
@@ -45,7 +46,8 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
45 enum ip_conntrack_info ctinfo; 46 enum ip_conntrack_info ctinfo;
46 const struct nf_nat_multi_range_compat *mr = par->targinfo; 47 const struct nf_nat_multi_range_compat *mr = par->targinfo;
47 48
48 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
49 51
50 ct = nf_ct_get(skb, &ctinfo); 52 ct = nf_ct_get(skb, &ctinfo);
51 53
@@ -99,7 +101,7 @@ static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
99 return 0; 101 return 0;
100} 102}
101 103
102unsigned int 104static unsigned int
103alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
104{ 106{
105 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
@@ -141,7 +143,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
141 .target = ipt_snat_target, 143 .target = ipt_snat_target,
142 .targetsize = sizeof(struct nf_nat_multi_range_compat), 144 .targetsize = sizeof(struct nf_nat_multi_range_compat),
143 .table = "nat", 145 .table = "nat",
144 .hooks = 1 << NF_INET_POST_ROUTING, 146 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
145 .checkentry = ipt_snat_checkentry, 147 .checkentry = ipt_snat_checkentry,
146 .family = AF_INET, 148 .family = AF_INET,
147}; 149};
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index beb25819c9c9..95481fee8bdb 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
98 return NF_ACCEPT; 98 return NF_ACCEPT;
99 99
100 /* Don't try to NAT if this packet is not conntracked */ 100 /* Don't try to NAT if this packet is not conntracked */
101 if (ct == &nf_conntrack_untracked) 101 if (nf_ct_is_untracked(ct))
102 return NF_ACCEPT; 102 return NF_ACCEPT;
103 103
104 nat = nfct_nat(ct); 104 nat = nfct_nat(ct);
@@ -131,13 +131,7 @@ nf_nat_fn(unsigned int hooknum,
131 if (!nf_nat_initialized(ct, maniptype)) { 131 if (!nf_nat_initialized(ct, maniptype)) {
132 unsigned int ret; 132 unsigned int ret;
133 133
134 if (hooknum == NF_INET_LOCAL_IN) 134 ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
135 /* LOCAL_IN hook doesn't have a chain! */
136 ret = alloc_null_binding(ct, hooknum);
137 else
138 ret = nf_nat_rule_find(skb, hooknum, in, out,
139 ct);
140
141 if (ret != NF_ACCEPT) 135 if (ret != NF_ACCEPT)
142 return ret; 136 return ret;
143 } else 137 } else
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3dc9914c1dce..4ae1f203f7cb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = {
252 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), 252 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
255 SNMP_MIB_SENTINEL 256 SNMP_MIB_SENTINEL
256}; 257};
257 258
@@ -342,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
342 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, 343 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
343 sysctl_ip_default_ttl); 344 sysctl_ip_default_ttl);
344 345
346 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
345 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 347 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
346 seq_printf(seq, " %lu", 348 seq_printf(seq, " %llu",
347 snmp_fold_field((void __percpu **)net->mib.ip_statistics, 349 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
348 snmp4_ipstats_list[i].entry)); 350 snmp4_ipstats_list[i].entry,
351 offsetof(struct ipstats_mib, syncp)));
349 352
350 icmp_put(seq); /* RFC 2011 compatibility */ 353 icmp_put(seq); /* RFC 2011 compatibility */
351 icmpmsg_put(seq); 354 icmpmsg_put(seq);
@@ -431,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
431 434
432 seq_puts(seq, "\nIpExt:"); 435 seq_puts(seq, "\nIpExt:");
433 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 436 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
434 seq_printf(seq, " %lu", 437 seq_printf(seq, " %llu",
435 snmp_fold_field((void __percpu **)net->mib.ip_statistics, 438 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
436 snmp4_ipextstats_list[i].entry)); 439 snmp4_ipextstats_list[i].entry,
440 offsetof(struct ipstats_mib, syncp)));
437 441
438 seq_putc(seq, '\n'); 442 seq_putc(seq, '\n');
439 return 0; 443 return 0;
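
snmp_fold_field64() exists because the IP counters moved to u64 even on 32-bit kernels, where a 64-bit load can tear; each per-cpu mib block therefore carries a u64_stats_sync sequence counter (the syncp whose offset is passed in), and the fold retries whenever a writer was mid-update. The read-side discipline, sketched for a single counter with GCC atomics (field names invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    struct mib {
        unsigned int seq;                  /* even: idle, odd: write in flight */
        uint64_t bytes;                    /* may tear on 32-bit loads */
    };

    static void writer_add(struct mib *m, uint64_t delta)
    {
        __atomic_fetch_add(&m->seq, 1, __ATOMIC_ACQ_REL);   /* -> odd */
        m->bytes += delta;
        __atomic_fetch_add(&m->seq, 1, __ATOMIC_ACQ_REL);   /* -> even */
    }

    /* Like u64_stats_fetch_begin/retry: loop until a stable snapshot. */
    static uint64_t reader_fold(const struct mib *m)
    {
        unsigned int start;
        uint64_t v;

        do {
            start = __atomic_load_n(&m->seq, __ATOMIC_ACQUIRE);
            v = m->bytes;
        } while ((start & 1) ||
                 __atomic_load_n(&m->seq, __ATOMIC_ACQUIRE) != start);
        return v;
    }

    int main(void)
    {
        struct mib m = { 0, 0 };

        writer_add(&m, 1500);
        printf("folded: %llu\n", (unsigned long long)reader_fold(&m));
        return 0;
    }

On 64-bit kernels the syncp collapses to nothing and the fold is a plain per-cpu sum, which is why the seq games cost nothing where loads cannot tear.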
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 542f22fc98b3..f2d297351405 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -52,6 +52,7 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
52 52
53 return ret; 53 return ret;
54} 54}
55EXPORT_SYMBOL(inet_add_protocol);
55 56
56/* 57/*
57 * Remove a protocol from the hash tables. 58 * Remove a protocol from the hash tables.
@@ -76,6 +77,4 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
76 77
77 return ret; 78 return ret;
78} 79}
79
80EXPORT_SYMBOL(inet_add_protocol);
81EXPORT_SYMBOL(inet_del_protocol); 80EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c7a1639388a..009a7b2aa1ef 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -314,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
314} 314}
315 315
316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
317 struct rtable *rt, 317 struct rtable **rtp,
318 unsigned int flags) 318 unsigned int flags)
319{ 319{
320 struct inet_sock *inet = inet_sk(sk); 320 struct inet_sock *inet = inet_sk(sk);
@@ -323,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
323 struct sk_buff *skb; 323 struct sk_buff *skb;
324 unsigned int iphlen; 324 unsigned int iphlen;
325 int err; 325 int err;
326 struct rtable *rt = *rtp;
326 327
327 if (length > rt->u.dst.dev->mtu) { 328 if (length > rt->dst.dev->mtu) {
328 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
329 rt->u.dst.dev->mtu); 330 rt->dst.dev->mtu);
330 return -EMSGSIZE; 331 return -EMSGSIZE;
331 } 332 }
332 if (flags&MSG_PROBE) 333 if (flags&MSG_PROBE)
333 goto out; 334 goto out;
334 335
335 skb = sock_alloc_send_skb(sk, 336 skb = sock_alloc_send_skb(sk,
336 length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, 337 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
337 flags & MSG_DONTWAIT, &err); 338 flags & MSG_DONTWAIT, &err);
338 if (skb == NULL) 339 if (skb == NULL)
339 goto error; 340 goto error;
340 skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); 341 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
341 342
342 skb->priority = sk->sk_priority; 343 skb->priority = sk->sk_priority;
343 skb->mark = sk->sk_mark; 344 skb->mark = sk->sk_mark;
344 skb_dst_set(skb, dst_clone(&rt->u.dst)); 345 skb_dst_set(skb, &rt->dst);
346 *rtp = NULL;
345 347
346 skb_reset_network_header(skb); 348 skb_reset_network_header(skb);
347 iph = ip_hdr(skb); 349 iph = ip_hdr(skb);
@@ -373,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
373 iph->check = 0; 375 iph->check = 0;
374 iph->tot_len = htons(length); 376 iph->tot_len = htons(length);
375 if (!iph->id) 377 if (!iph->id)
376 ip_select_ident(iph, &rt->u.dst, NULL); 378 ip_select_ident(iph, &rt->dst, NULL);
377 379
378 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 380 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
379 } 381 }
@@ -382,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
382 skb_transport_header(skb))->type); 384 skb_transport_header(skb))->type);
383 385
384 err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, 386 err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
385 rt->u.dst.dev, dst_output); 387 rt->dst.dev, dst_output);
386 if (err > 0) 388 if (err > 0)
387 err = net_xmit_errno(err); 389 err = net_xmit_errno(err);
388 if (err) 390 if (err)
@@ -576,7 +578,7 @@ back_from_confirm:
576 578
577 if (inet->hdrincl) 579 if (inet->hdrincl)
578 err = raw_send_hdrinc(sk, msg->msg_iov, len, 580 err = raw_send_hdrinc(sk, msg->msg_iov, len,
579 rt, msg->msg_flags); 581 &rt, msg->msg_flags);
580 582
581 else { 583 else {
582 if (!ipc.addr) 584 if (!ipc.addr)
@@ -604,7 +606,7 @@ out:
604 return len; 606 return len;
605 607
606do_confirm: 608do_confirm:
607 dst_confirm(&rt->u.dst); 609 dst_confirm(&rt->dst);
608 if (!(msg->msg_flags & MSG_PROBE) || len) 610 if (!(msg->msg_flags & MSG_PROBE) || len)
609 goto back_from_confirm; 611 goto back_from_confirm;
610 err = 0; 612 err = 0;
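
Switching raw_send_hdrinc() to struct rtable **rtp turns "clone the route into the skb" into "move the caller's reference into the skb": skb_dst_set(skb, &rt->dst) consumes the existing reference and *rtp = NULL tells raw_sendmsg() not to ip_rt_put() it a second time. The ownership-transfer idiom in miniature (toy refcounting, invented names):

    #include <stdio.h>
    #include <stdlib.h>

    struct route {
        int refcnt;
    };

    static void route_put(struct route *rt)
    {
        if (rt && --rt->refcnt == 0)
            free(rt);
    }

    /* Callee steals the reference: caller's pointer is NULLed on success. */
    static void send_hdrinc(struct route **rtp)
    {
        struct route *rt = *rtp;

        /* skb_dst_set(skb, &rt->dst): the reference moves into the skb. */
        *rtp = NULL;                       /* caller must not put it again */
        route_put(rt);                     /* stand-in for the skb's release */
    }

    int main(void)
    {
        struct route *rt = calloc(1, sizeof(*rt));

        if (!rt)
            return 1;
        rt->refcnt = 1;                    /* as returned by route lookup */
        send_hdrinc(&rt);
        route_put(rt);                     /* safe no-op: rt is NULL now */
        printf("no double put, no extra atomic inc\n");
        return 0;
    }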
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 560acc677ce4..562ce92de2a6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -253,8 +253,7 @@ static unsigned rt_hash_mask __read_mostly;
253static unsigned int rt_hash_log __read_mostly; 253static unsigned int rt_hash_log __read_mostly;
254 254
255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256#define RT_CACHE_STAT_INC(field) \ 256#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
257 (__raw_get_cpu_var(rt_cache_stat).field++)
258 257
259static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, 258static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
260 int genid) 259 int genid)
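
__this_cpu_inc() lets the compiler emit a single per-cpu increment (one add instruction on x86) where the old __raw_get_cpu_var() form first computed this CPU's address and then incremented through it. The closest portable analogue is bumping a thread-local counter, as in this sketch (the field names match struct rt_cache_stat, but the code is user-space illustration only):

    #include <stdio.h>

    struct rt_cache_stat {
        unsigned long in_hit;
        unsigned long in_slow_tot;
    };

    /* One instance per thread, like DEFINE_PER_CPU gives one per CPU. */
    static __thread struct rt_cache_stat rt_cache_stat;

    #define RT_CACHE_STAT_INC(field) (rt_cache_stat.field++)

    int main(void)
    {
        RT_CACHE_STAT_INC(in_hit);         /* no address juggling needed */
        RT_CACHE_STAT_INC(in_slow_tot);
        printf("hit=%lu slow=%lu\n",
               rt_cache_stat.in_hit, rt_cache_stat.in_slow_tot);
        return 0;
    }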
@@ -287,10 +286,10 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 rcu_read_lock_bh(); 286 rcu_read_lock_bh();
288 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
289 while (r) { 288 while (r) {
290 if (dev_net(r->u.dst.dev) == seq_file_net(seq) && 289 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
291 r->rt_genid == st->genid) 290 r->rt_genid == st->genid)
292 return r; 291 return r;
293 r = rcu_dereference_bh(r->u.dst.rt_next); 292 r = rcu_dereference_bh(r->dst.rt_next);
294 } 293 }
295 rcu_read_unlock_bh(); 294 rcu_read_unlock_bh();
296 } 295 }
@@ -302,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
302{ 301{
303 struct rt_cache_iter_state *st = seq->private; 302 struct rt_cache_iter_state *st = seq->private;
304 303
305 r = r->u.dst.rt_next; 304 r = r->dst.rt_next;
306 while (!r) { 305 while (!r) {
307 rcu_read_unlock_bh(); 306 rcu_read_unlock_bh();
308 do { 307 do {
@@ -320,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
320{ 319{
321 struct rt_cache_iter_state *st = seq->private; 320 struct rt_cache_iter_state *st = seq->private;
322 while ((r = __rt_cache_get_next(seq, r)) != NULL) { 321 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
323 if (dev_net(r->u.dst.dev) != seq_file_net(seq)) 322 if (dev_net(r->dst.dev) != seq_file_net(seq))
324 continue; 323 continue;
325 if (r->rt_genid == st->genid) 324 if (r->rt_genid == st->genid)
326 break; 325 break;
@@ -378,19 +377,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
378 377
379 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" 378 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
380 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 379 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
381 r->u.dst.dev ? r->u.dst.dev->name : "*", 380 r->dst.dev ? r->dst.dev->name : "*",
382 (__force u32)r->rt_dst, 381 (__force u32)r->rt_dst,
383 (__force u32)r->rt_gateway, 382 (__force u32)r->rt_gateway,
384 r->rt_flags, atomic_read(&r->u.dst.__refcnt), 383 r->rt_flags, atomic_read(&r->dst.__refcnt),
385 r->u.dst.__use, 0, (__force u32)r->rt_src, 384 r->dst.__use, 0, (__force u32)r->rt_src,
386 (dst_metric(&r->u.dst, RTAX_ADVMSS) ? 385 (dst_metric(&r->dst, RTAX_ADVMSS) ?
387 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), 386 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
388 dst_metric(&r->u.dst, RTAX_WINDOW), 387 dst_metric(&r->dst, RTAX_WINDOW),
389 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + 388 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
390 dst_metric(&r->u.dst, RTAX_RTTVAR)), 389 dst_metric(&r->dst, RTAX_RTTVAR)),
391 r->fl.fl4_tos, 390 r->fl.fl4_tos,
392 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, 391 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
393 r->u.dst.hh ? (r->u.dst.hh->hh_output == 392 r->dst.hh ? (r->dst.hh->hh_output ==
394 dev_queue_xmit) : 0, 393 dev_queue_xmit) : 0,
395 r->rt_spec_dst, &len); 394 r->rt_spec_dst, &len);
396 395
@@ -609,13 +608,13 @@ static inline int ip_rt_proc_init(void)
609 608
610static inline void rt_free(struct rtable *rt) 609static inline void rt_free(struct rtable *rt)
611{ 610{
612 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 611 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
613} 612}
614 613
615static inline void rt_drop(struct rtable *rt) 614static inline void rt_drop(struct rtable *rt)
616{ 615{
617 ip_rt_put(rt); 616 ip_rt_put(rt);
618 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 617 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
619} 618}
620 619
621static inline int rt_fast_clean(struct rtable *rth) 620static inline int rt_fast_clean(struct rtable *rth)
@@ -623,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth)
623 /* Kill broadcast/multicast entries very aggressively, if they 622
624 collide in hash table with more useful entries */ 623 collide in hash table with more useful entries */
625 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
626 rth->fl.iif && rth->u.dst.rt_next; 625 rth->fl.iif && rth->dst.rt_next;
627} 626}
628 627
629static inline int rt_valuable(struct rtable *rth) 628static inline int rt_valuable(struct rtable *rth)
630{ 629{
631 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
632 rth->u.dst.expires; 631 rth->dst.expires;
633} 632}
634 633
635static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -637,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
637 unsigned long age; 636 unsigned long age;
638 int ret = 0; 637 int ret = 0;
639 638
640 if (atomic_read(&rth->u.dst.__refcnt)) 639 if (atomic_read(&rth->dst.__refcnt))
641 goto out; 640 goto out;
642 641
643 ret = 1; 642 ret = 1;
644 if (rth->u.dst.expires && 643 if (rth->dst.expires &&
645 time_after_eq(jiffies, rth->u.dst.expires)) 644 time_after_eq(jiffies, rth->dst.expires))
646 goto out; 645 goto out;
647 646
648 age = jiffies - rth->u.dst.lastuse; 647 age = jiffies - rth->dst.lastuse;
649 ret = 0; 648 ret = 0;
650 if ((age <= tmo1 && !rt_fast_clean(rth)) || 649 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
651 (age <= tmo2 && rt_valuable(rth))) 650 (age <= tmo2 && rt_valuable(rth)))
@@ -661,7 +660,7 @@ out: return ret;
661 */ 660 */
662static inline u32 rt_score(struct rtable *rt) 661static inline u32 rt_score(struct rtable *rt)
663{ 662{
664 u32 score = jiffies - rt->u.dst.lastuse; 663 u32 score = jiffies - rt->dst.lastuse;
665 664
666 score = ~score & ~(3<<30); 665 score = ~score & ~(3<<30);
667 666
@@ -701,12 +700,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
701 700
702static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 701static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
703{ 702{
704 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); 703 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
705} 704}
706 705
707static inline int rt_is_expired(struct rtable *rth) 706static inline int rt_is_expired(struct rtable *rth)
708{ 707{
709 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); 708 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
710} 709}
711 710
712/* 711/*
@@ -735,7 +734,7 @@ static void rt_do_flush(int process_context)
735 rth = rt_hash_table[i].chain; 734 rth = rt_hash_table[i].chain;
736 735
737 /* defer releasing the head of the list after spin_unlock */ 736 /* defer releasing the head of the list after spin_unlock */
738 for (tail = rth; tail; tail = tail->u.dst.rt_next) 737 for (tail = rth; tail; tail = tail->dst.rt_next)
739 if (!rt_is_expired(tail)) 738 if (!rt_is_expired(tail))
740 break; 739 break;
741 if (rth != tail) 740 if (rth != tail)
@@ -744,9 +743,9 @@ static void rt_do_flush(int process_context)
744 /* call rt_free on entries after the tail requiring flush */ 743 /* call rt_free on entries after the tail requiring flush */
745 prev = &rt_hash_table[i].chain; 744 prev = &rt_hash_table[i].chain;
746 for (p = *prev; p; p = next) { 745 for (p = *prev; p; p = next) {
747 next = p->u.dst.rt_next; 746 next = p->dst.rt_next;
748 if (!rt_is_expired(p)) { 747 if (!rt_is_expired(p)) {
749 prev = &p->u.dst.rt_next; 748 prev = &p->dst.rt_next;
750 } else { 749 } else {
751 *prev = next; 750 *prev = next;
752 rt_free(p); 751 rt_free(p);
@@ -761,7 +760,7 @@ static void rt_do_flush(int process_context)
761 spin_unlock_bh(rt_hash_lock_addr(i)); 760 spin_unlock_bh(rt_hash_lock_addr(i));
762 761
763 for (; rth != tail; rth = next) { 762 for (; rth != tail; rth = next) {
764 next = rth->u.dst.rt_next; 763 next = rth->dst.rt_next;
765 rt_free(rth); 764 rt_free(rth);
766 } 765 }
767 } 766 }
@@ -792,7 +791,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
792 while (aux != rth) { 791 while (aux != rth) {
793 if (compare_hash_inputs(&aux->fl, &rth->fl)) 792 if (compare_hash_inputs(&aux->fl, &rth->fl))
794 return 0; 793 return 0;
795 aux = aux->u.dst.rt_next; 794 aux = aux->dst.rt_next;
796 } 795 }
797 return ONE; 796 return ONE;
798} 797}
@@ -832,18 +831,18 @@ static void rt_check_expire(void)
832 length = 0; 831 length = 0;
833 spin_lock_bh(rt_hash_lock_addr(i)); 832 spin_lock_bh(rt_hash_lock_addr(i));
834 while ((rth = *rthp) != NULL) { 833 while ((rth = *rthp) != NULL) {
835 prefetch(rth->u.dst.rt_next); 834 prefetch(rth->dst.rt_next);
836 if (rt_is_expired(rth)) { 835 if (rt_is_expired(rth)) {
837 *rthp = rth->u.dst.rt_next; 836 *rthp = rth->dst.rt_next;
838 rt_free(rth); 837 rt_free(rth);
839 continue; 838 continue;
840 } 839 }
841 if (rth->u.dst.expires) { 840 if (rth->dst.expires) {
842 /* Entry is expired even if it is in use */ 841 /* Entry is expired even if it is in use */
843 if (time_before_eq(jiffies, rth->u.dst.expires)) { 842 if (time_before_eq(jiffies, rth->dst.expires)) {
844nofree: 843nofree:
845 tmo >>= 1; 844 tmo >>= 1;
846 rthp = &rth->u.dst.rt_next; 845 rthp = &rth->dst.rt_next;
847 /* 846 /*
848 * We only count entries on 847 * We only count entries on
849 * a chain with equal hash inputs once 848 * a chain with equal hash inputs once
@@ -859,7 +858,7 @@ nofree:
859 goto nofree; 858 goto nofree;
860 859
861 /* Cleanup aged off entries. */ 860 /* Cleanup aged off entries. */
862 *rthp = rth->u.dst.rt_next; 861 *rthp = rth->dst.rt_next;
863 rt_free(rth); 862 rt_free(rth);
864 } 863 }
865 spin_unlock_bh(rt_hash_lock_addr(i)); 864 spin_unlock_bh(rt_hash_lock_addr(i));
@@ -1000,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops)
1000 if (!rt_is_expired(rth) && 999 if (!rt_is_expired(rth) &&
1001 !rt_may_expire(rth, tmo, expire)) { 1000 !rt_may_expire(rth, tmo, expire)) {
1002 tmo >>= 1; 1001 tmo >>= 1;
1003 rthp = &rth->u.dst.rt_next; 1002 rthp = &rth->dst.rt_next;
1004 continue; 1003 continue;
1005 } 1004 }
1006 *rthp = rth->u.dst.rt_next; 1005 *rthp = rth->dst.rt_next;
1007 rt_free(rth); 1006 rt_free(rth);
1008 goal--; 1007 goal--;
1009 } 1008 }
@@ -1069,7 +1068,7 @@ static int slow_chain_length(const struct rtable *head)
1069 1068
1070 while (rth) { 1069 while (rth) {
1071 length += has_noalias(head, rth); 1070 length += has_noalias(head, rth);
1072 rth = rth->u.dst.rt_next; 1071 rth = rth->dst.rt_next;
1073 } 1072 }
1074 return length >> FRACT_BITS; 1073 return length >> FRACT_BITS;
1075} 1074}
@@ -1091,7 +1090,7 @@ restart:
1091 candp = NULL; 1090 candp = NULL;
1092 now = jiffies; 1091 now = jiffies;
1093 1092
1094 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1093 if (!rt_caching(dev_net(rt->dst.dev))) {
1095 /* 1094 /*
1096 * If we're not caching, just tell the caller we 1095 * If we're not caching, just tell the caller we
1097 * were successful and don't touch the route. The 1096 * were successful and don't touch the route. The
@@ -1109,7 +1108,7 @@ restart:
1109 */ 1108 */
1110 1109
1111 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1112 int err = arp_bind_neighbour(&rt->u.dst); 1111 int err = arp_bind_neighbour(&rt->dst);
1113 if (err) { 1112 if (err) {
1114 if (net_ratelimit()) 1113 if (net_ratelimit())
1115 printk(KERN_WARNING 1114 printk(KERN_WARNING
@@ -1128,19 +1127,19 @@ restart:
1128 spin_lock_bh(rt_hash_lock_addr(hash)); 1127 spin_lock_bh(rt_hash_lock_addr(hash));
1129 while ((rth = *rthp) != NULL) { 1128 while ((rth = *rthp) != NULL) {
1130 if (rt_is_expired(rth)) { 1129 if (rt_is_expired(rth)) {
1131 *rthp = rth->u.dst.rt_next; 1130 *rthp = rth->dst.rt_next;
1132 rt_free(rth); 1131 rt_free(rth);
1133 continue; 1132 continue;
1134 } 1133 }
1135 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1136 /* Put it first */ 1135 /* Put it first */
1137 *rthp = rth->u.dst.rt_next; 1136 *rthp = rth->dst.rt_next;
1138 /* 1137 /*
1139 * Since lookup is lockfree, the deletion 1138 * Since lookup is lockfree, the deletion
1140 * must be visible to another weakly ordered CPU before 1139 * must be visible to another weakly ordered CPU before
1141 * the insertion at the start of the hash chain. 1140 * the insertion at the start of the hash chain.
1142 */ 1141 */
1143 rcu_assign_pointer(rth->u.dst.rt_next, 1142 rcu_assign_pointer(rth->dst.rt_next,
1144 rt_hash_table[hash].chain); 1143 rt_hash_table[hash].chain);
1145 /* 1144 /*
1146 * Since lookup is lockfree, the update writes 1145 * Since lookup is lockfree, the update writes
@@ -1148,18 +1147,18 @@ restart:
1148 */ 1147 */
1149 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1150 1149
1151 dst_use(&rth->u.dst, now); 1150 dst_use(&rth->dst, now);
1152 spin_unlock_bh(rt_hash_lock_addr(hash)); 1151 spin_unlock_bh(rt_hash_lock_addr(hash));
1153 1152
1154 rt_drop(rt); 1153 rt_drop(rt);
1155 if (rp) 1154 if (rp)
1156 *rp = rth; 1155 *rp = rth;
1157 else 1156 else
1158 skb_dst_set(skb, &rth->u.dst); 1157 skb_dst_set(skb, &rth->dst);
1159 return 0; 1158 return 0;
1160 } 1159 }
1161 1160
1162 if (!atomic_read(&rth->u.dst.__refcnt)) { 1161 if (!atomic_read(&rth->dst.__refcnt)) {
1163 u32 score = rt_score(rth); 1162 u32 score = rt_score(rth);
1164 1163
1165 if (score <= min_score) { 1164 if (score <= min_score) {
@@ -1171,7 +1170,7 @@ restart:
1171 1170
1172 chain_length++; 1171 chain_length++;
1173 1172
1174 rthp = &rth->u.dst.rt_next; 1173 rthp = &rth->dst.rt_next;
1175 } 1174 }
1176 1175
1177 if (cand) { 1176 if (cand) {
@@ -1182,17 +1181,17 @@ restart:
1182 * only 2 entries per bucket. We will see. 1181 * only 2 entries per bucket. We will see.
1183 */ 1182 */
1184 if (chain_length > ip_rt_gc_elasticity) { 1183 if (chain_length > ip_rt_gc_elasticity) {
1185 *candp = cand->u.dst.rt_next; 1184 *candp = cand->dst.rt_next;
1186 rt_free(cand); 1185 rt_free(cand);
1187 } 1186 }
1188 } else { 1187 } else {
1189 if (chain_length > rt_chain_length_max && 1188 if (chain_length > rt_chain_length_max &&
1190 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { 1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1191 struct net *net = dev_net(rt->u.dst.dev); 1190 struct net *net = dev_net(rt->dst.dev);
1192 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1193 if (!rt_caching(net)) { 1192 if (!rt_caching(net)) {
1194 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1193 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1195 rt->u.dst.dev->name, num); 1194 rt->dst.dev->name, num);
1196 } 1195 }
1197 rt_emergency_hash_rebuild(net); 1196 rt_emergency_hash_rebuild(net);
1198 spin_unlock_bh(rt_hash_lock_addr(hash)); 1197 spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1207,7 +1206,7 @@ restart:
1207 route or unicast forwarding path. 1206 route or unicast forwarding path.
1208 */ 1207 */
1209 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1208 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1210 int err = arp_bind_neighbour(&rt->u.dst); 1209 int err = arp_bind_neighbour(&rt->dst);
1211 if (err) { 1210 if (err) {
1212 spin_unlock_bh(rt_hash_lock_addr(hash)); 1211 spin_unlock_bh(rt_hash_lock_addr(hash));
1213 1212
@@ -1238,14 +1237,14 @@ restart:
1238 } 1237 }
1239 } 1238 }
1240 1239
1241 rt->u.dst.rt_next = rt_hash_table[hash].chain; 1240 rt->dst.rt_next = rt_hash_table[hash].chain;
1242 1241
1243#if RT_CACHE_DEBUG >= 2 1242#if RT_CACHE_DEBUG >= 2
1244 if (rt->u.dst.rt_next) { 1243 if (rt->dst.rt_next) {
1245 struct rtable *trt; 1244 struct rtable *trt;
1246 printk(KERN_DEBUG "rt_cache @%02x: %pI4", 1245 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1247 hash, &rt->rt_dst); 1246 hash, &rt->rt_dst);
1248 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) 1247 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1249 printk(" . %pI4", &trt->rt_dst); 1248 printk(" . %pI4", &trt->rt_dst);
1250 printk("\n"); 1249 printk("\n");
1251 } 1250 }
@@ -1263,7 +1262,7 @@ skip_hashing:
1263 if (rp) 1262 if (rp)
1264 *rp = rt; 1263 *rp = rt;
1265 else 1264 else
1266 skb_dst_set(skb, &rt->u.dst); 1265 skb_dst_set(skb, &rt->dst);
1267 return 0; 1266 return 0;
1268} 1267}
1269 1268
@@ -1325,6 +1324,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1325 1324
1326 ip_select_fb_ident(iph); 1325 ip_select_fb_ident(iph);
1327} 1326}
1327EXPORT_SYMBOL(__ip_select_ident);
1328 1328
1329static void rt_del(unsigned hash, struct rtable *rt) 1329static void rt_del(unsigned hash, struct rtable *rt)
1330{ 1330{
@@ -1335,20 +1335,21 @@ static void rt_del(unsigned hash, struct rtable *rt)
1335 ip_rt_put(rt); 1335 ip_rt_put(rt);
1336 while ((aux = *rthp) != NULL) { 1336 while ((aux = *rthp) != NULL) {
1337 if (aux == rt || rt_is_expired(aux)) { 1337 if (aux == rt || rt_is_expired(aux)) {
1338 *rthp = aux->u.dst.rt_next; 1338 *rthp = aux->dst.rt_next;
1339 rt_free(aux); 1339 rt_free(aux);
1340 continue; 1340 continue;
1341 } 1341 }
1342 rthp = &aux->u.dst.rt_next; 1342 rthp = &aux->dst.rt_next;
1343 } 1343 }
1344 spin_unlock_bh(rt_hash_lock_addr(hash)); 1344 spin_unlock_bh(rt_hash_lock_addr(hash));
1345} 1345}
1346 1346
1347/* called in rcu_read_lock() section */
1347void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1348void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1348 __be32 saddr, struct net_device *dev) 1349 __be32 saddr, struct net_device *dev)
1349{ 1350{
1350 int i, k; 1351 int i, k;
1351 struct in_device *in_dev = in_dev_get(dev); 1352 struct in_device *in_dev = __in_dev_get_rcu(dev);
1352 struct rtable *rth, **rthp; 1353 struct rtable *rth, **rthp;
1353 __be32 skeys[2] = { saddr, 0 }; 1354 __be32 skeys[2] = { saddr, 0 };
1354 int ikeys[2] = { dev->ifindex, 0 }; 1355 int ikeys[2] = { dev->ifindex, 0 };
@@ -1384,7 +1385,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1384 1385
1385 rthp=&rt_hash_table[hash].chain; 1386 rthp=&rt_hash_table[hash].chain;
1386 1387
1387 rcu_read_lock();
1388 while ((rth = rcu_dereference(*rthp)) != NULL) { 1388 while ((rth = rcu_dereference(*rthp)) != NULL) {
1389 struct rtable *rt; 1389 struct rtable *rt;
1390 1390
@@ -1393,44 +1393,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1393 rth->fl.oif != ikeys[k] || 1393 rth->fl.oif != ikeys[k] ||
1394 rth->fl.iif != 0 || 1394 rth->fl.iif != 0 ||
1395 rt_is_expired(rth) || 1395 rt_is_expired(rth) ||
1396 !net_eq(dev_net(rth->u.dst.dev), net)) { 1396 !net_eq(dev_net(rth->dst.dev), net)) {
1397 rthp = &rth->u.dst.rt_next; 1397 rthp = &rth->dst.rt_next;
1398 continue; 1398 continue;
1399 } 1399 }
1400 1400
1401 if (rth->rt_dst != daddr || 1401 if (rth->rt_dst != daddr ||
1402 rth->rt_src != saddr || 1402 rth->rt_src != saddr ||
1403 rth->u.dst.error || 1403 rth->dst.error ||
1404 rth->rt_gateway != old_gw || 1404 rth->rt_gateway != old_gw ||
1405 rth->u.dst.dev != dev) 1405 rth->dst.dev != dev)
1406 break; 1406 break;
1407 1407
1408 dst_hold(&rth->u.dst); 1408 dst_hold(&rth->dst);
1409 rcu_read_unlock();
1410 1409
1411 rt = dst_alloc(&ipv4_dst_ops); 1410 rt = dst_alloc(&ipv4_dst_ops);
1412 if (rt == NULL) { 1411 if (rt == NULL) {
1413 ip_rt_put(rth); 1412 ip_rt_put(rth);
1414 in_dev_put(in_dev);
1415 return; 1413 return;
1416 } 1414 }
1417 1415
1418 /* Copy all the information. */ 1416 /* Copy all the information. */
1419 *rt = *rth; 1417 *rt = *rth;
1420 rt->u.dst.__use = 1; 1418 rt->dst.__use = 1;
1421 atomic_set(&rt->u.dst.__refcnt, 1); 1419 atomic_set(&rt->dst.__refcnt, 1);
1422 rt->u.dst.child = NULL; 1420 rt->dst.child = NULL;
1423 if (rt->u.dst.dev) 1421 if (rt->dst.dev)
1424 dev_hold(rt->u.dst.dev); 1422 dev_hold(rt->dst.dev);
1425 if (rt->idev) 1423 if (rt->idev)
1426 in_dev_hold(rt->idev); 1424 in_dev_hold(rt->idev);
1427 rt->u.dst.obsolete = -1; 1425 rt->dst.obsolete = -1;
1428 rt->u.dst.lastuse = jiffies; 1426 rt->dst.lastuse = jiffies;
1429 rt->u.dst.path = &rt->u.dst; 1427 rt->dst.path = &rt->dst;
1430 rt->u.dst.neighbour = NULL; 1428 rt->dst.neighbour = NULL;
1431 rt->u.dst.hh = NULL; 1429 rt->dst.hh = NULL;
1432#ifdef CONFIG_XFRM 1430#ifdef CONFIG_XFRM
1433 rt->u.dst.xfrm = NULL; 1431 rt->dst.xfrm = NULL;
1434#endif 1432#endif
1435 rt->rt_genid = rt_genid(net); 1433 rt->rt_genid = rt_genid(net);
1436 rt->rt_flags |= RTCF_REDIRECTED; 1434 rt->rt_flags |= RTCF_REDIRECTED;
@@ -1439,23 +1437,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1439 rt->rt_gateway = new_gw; 1437 rt->rt_gateway = new_gw;
1440 1438
1441 /* Redirect received -> path was valid */ 1439 /* Redirect received -> path was valid */
1442 dst_confirm(&rth->u.dst); 1440 dst_confirm(&rth->dst);
1443 1441
1444 if (rt->peer) 1442 if (rt->peer)
1445 atomic_inc(&rt->peer->refcnt); 1443 atomic_inc(&rt->peer->refcnt);
1446 1444
1447 if (arp_bind_neighbour(&rt->u.dst) || 1445 if (arp_bind_neighbour(&rt->dst) ||
1448 !(rt->u.dst.neighbour->nud_state & 1446 !(rt->dst.neighbour->nud_state &
1449 NUD_VALID)) { 1447 NUD_VALID)) {
1450 if (rt->u.dst.neighbour) 1448 if (rt->dst.neighbour)
1451 neigh_event_send(rt->u.dst.neighbour, NULL); 1449 neigh_event_send(rt->dst.neighbour, NULL);
1452 ip_rt_put(rth); 1450 ip_rt_put(rth);
1453 rt_drop(rt); 1451 rt_drop(rt);
1454 goto do_next; 1452 goto do_next;
1455 } 1453 }
1456 1454
1457 netevent.old = &rth->u.dst; 1455 netevent.old = &rth->dst;
1458 netevent.new = &rt->u.dst; 1456 netevent.new = &rt->dst;
1459 call_netevent_notifiers(NETEVENT_REDIRECT, 1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1460 &netevent); 1458 &netevent);
1461 1459
@@ -1464,12 +1462,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1464 ip_rt_put(rt); 1462 ip_rt_put(rt);
1465 goto do_next; 1463 goto do_next;
1466 } 1464 }
1467 rcu_read_unlock();
1468 do_next: 1465 do_next:
1469 ; 1466 ;
1470 } 1467 }
1471 } 1468 }
1472 in_dev_put(in_dev);
1473 return; 1469 return;
1474 1470
1475reject_redirect: 1471reject_redirect:
@@ -1480,7 +1476,7 @@ reject_redirect:
1480 &old_gw, dev->name, &new_gw, 1476 &old_gw, dev->name, &new_gw,
1481 &saddr, &daddr); 1477 &saddr, &daddr);
1482#endif 1478#endif
1483 in_dev_put(in_dev); 1479 ;
1484} 1480}
1485 1481
1486static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1482static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
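
The ip_rt_redirect() hunks move the RCU read-side critical section out to the caller (hence the new "called in rcu_read_lock() section" comment): the function now uses __in_dev_get_rcu() and drops every in_dev_put(), relying on whoever invoked it to hold rcu_read_lock() across the call. The contract change in miniature, with a pthread rwlock playing the part of the RCU read side (names invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t rcu_ish = PTHREAD_RWLOCK_INITIALIZER;
    static int *shared_in_dev;             /* the protected object */

    /* Old style: take and drop the read side (and a refcount) locally. */
    static void handler_old(void)
    {
        pthread_rwlock_rdlock(&rcu_ish);
        if (shared_in_dev)                 /* in_dev_get()/in_dev_put() pair */
            (void)*shared_in_dev;
        pthread_rwlock_unlock(&rcu_ish);
    }

    /* New style: documented as "called in rcu_read_lock() section"; the
     * caller already holds the read side, so no get/put pair is needed. */
    static void handler_rcu(void)
    {
        if (shared_in_dev)                 /* __in_dev_get_rcu() analogue */
            (void)*shared_in_dev;
    }

    int main(void)
    {
        int dev = 42;

        shared_in_dev = &dev;
        handler_old();

        pthread_rwlock_rdlock(&rcu_ish);   /* caller-held section */
        handler_rcu();
        pthread_rwlock_unlock(&rcu_ish);
        puts("ok");
        return 0;
    }

Shifting the section outward removes per-call lock and refcount traffic from the redirect path; the cost is an invariant the compiler cannot check, which is why the patch spells it out in a comment above the function.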
@@ -1493,8 +1489,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1493 ip_rt_put(rt); 1489 ip_rt_put(rt);
1494 ret = NULL; 1490 ret = NULL;
1495 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1496 (rt->u.dst.expires && 1492 (rt->dst.expires &&
1497 time_after_eq(jiffies, rt->u.dst.expires))) { 1493 time_after_eq(jiffies, rt->dst.expires))) {
1498 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1499 rt->fl.oif, 1495 rt->fl.oif,
1500 rt_genid(dev_net(dst->dev))); 1496 rt_genid(dev_net(dst->dev)));
@@ -1532,7 +1528,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1532 int log_martians; 1528 int log_martians;
1533 1529
1534 rcu_read_lock(); 1530 rcu_read_lock();
1535 in_dev = __in_dev_get_rcu(rt->u.dst.dev); 1531 in_dev = __in_dev_get_rcu(rt->dst.dev);
1536 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { 1532 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1537 rcu_read_unlock(); 1533 rcu_read_unlock();
1538 return; 1534 return;
@@ -1543,30 +1539,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1543 /* No redirected packets during ip_rt_redirect_silence; 1539 /* No redirected packets during ip_rt_redirect_silence;
1544 * reset the algorithm. 1540 * reset the algorithm.
1545 */ 1541 */
1546 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) 1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1547 rt->u.dst.rate_tokens = 0; 1543 rt->dst.rate_tokens = 0;
1548 1544
1549 /* Too many ignored redirects; do not send anything 1545 /* Too many ignored redirects; do not send anything
1550 * set u.dst.rate_last to the last seen redirected packet. 1546 * set dst.rate_last to the last seen redirected packet.
1551 */ 1547 */
1552 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { 1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1553 rt->u.dst.rate_last = jiffies; 1549 rt->dst.rate_last = jiffies;
1554 return; 1550 return;
1555 } 1551 }
1556 1552
1557 /* Check for load limit; set rate_last to the latest sent 1553 /* Check for load limit; set rate_last to the latest sent
1558 * redirect. 1554 * redirect.
1559 */ 1555 */
1560 if (rt->u.dst.rate_tokens == 0 || 1556 if (rt->dst.rate_tokens == 0 ||
1561 time_after(jiffies, 1557 time_after(jiffies,
1562 (rt->u.dst.rate_last + 1558 (rt->dst.rate_last +
1563 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { 1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1564 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1565 rt->u.dst.rate_last = jiffies; 1561 rt->dst.rate_last = jiffies;
1566 ++rt->u.dst.rate_tokens; 1562 ++rt->dst.rate_tokens;
1567#ifdef CONFIG_IP_ROUTE_VERBOSE 1563#ifdef CONFIG_IP_ROUTE_VERBOSE
1568 if (log_martians && 1564 if (log_martians &&
1569 rt->u.dst.rate_tokens == ip_rt_redirect_number && 1565 rt->dst.rate_tokens == ip_rt_redirect_number &&
1570 net_ratelimit()) 1566 net_ratelimit())
1571 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1572 &rt->rt_src, rt->rt_iif, 1568 &rt->rt_src, rt->rt_iif,
@@ -1581,7 +1577,7 @@ static int ip_error(struct sk_buff *skb)
1581 unsigned long now; 1577 unsigned long now;
1582 int code; 1578 int code;
1583 1579
1584 switch (rt->u.dst.error) { 1580 switch (rt->dst.error) {
1585 case EINVAL: 1581 case EINVAL:
1586 default: 1582 default:
1587 goto out; 1583 goto out;
@@ -1590,7 +1586,7 @@ static int ip_error(struct sk_buff *skb)
1590 break; 1586 break;
1591 case ENETUNREACH: 1587 case ENETUNREACH:
1592 code = ICMP_NET_UNREACH; 1588 code = ICMP_NET_UNREACH;
1593 IP_INC_STATS_BH(dev_net(rt->u.dst.dev), 1589 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1594 IPSTATS_MIB_INNOROUTES); 1590 IPSTATS_MIB_INNOROUTES);
1595 break; 1591 break;
1596 case EACCES: 1592 case EACCES:
@@ -1599,12 +1595,12 @@ static int ip_error(struct sk_buff *skb)
1599 } 1595 }
1600 1596
1601 now = jiffies; 1597 now = jiffies;
1602 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; 1598 rt->dst.rate_tokens += now - rt->dst.rate_last;
1603 if (rt->u.dst.rate_tokens > ip_rt_error_burst) 1599 if (rt->dst.rate_tokens > ip_rt_error_burst)
1604 rt->u.dst.rate_tokens = ip_rt_error_burst; 1600 rt->dst.rate_tokens = ip_rt_error_burst;
1605 rt->u.dst.rate_last = now; 1601 rt->dst.rate_last = now;
1606 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { 1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1607 rt->u.dst.rate_tokens -= ip_rt_error_cost; 1603 rt->dst.rate_tokens -= ip_rt_error_cost;
1608 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1609 } 1605 }
1610 1606
@@ -1649,7 +1645,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1649 1645
1650 rcu_read_lock(); 1646 rcu_read_lock();
1651 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1652 rth = rcu_dereference(rth->u.dst.rt_next)) { 1648 rth = rcu_dereference(rth->dst.rt_next)) {
1653 unsigned short mtu = new_mtu; 1649 unsigned short mtu = new_mtu;
1654 1650
1655 if (rth->fl.fl4_dst != daddr || 1651 if (rth->fl.fl4_dst != daddr ||
@@ -1658,8 +1654,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1658 rth->rt_src != iph->saddr || 1654 rth->rt_src != iph->saddr ||
1659 rth->fl.oif != ikeys[k] || 1655 rth->fl.oif != ikeys[k] ||
1660 rth->fl.iif != 0 || 1656 rth->fl.iif != 0 ||
1661 dst_metric_locked(&rth->u.dst, RTAX_MTU) || 1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1662 !net_eq(dev_net(rth->u.dst.dev), net) || 1658 !net_eq(dev_net(rth->dst.dev), net) ||
1663 rt_is_expired(rth)) 1659 rt_is_expired(rth))
1664 continue; 1660 continue;
1665 1661
@@ -1667,22 +1663,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1667 1663
1668 /* BSD 4.2 compatibility hack :-( */ 1664 /* BSD 4.2 compatibility hack :-( */
1669 if (mtu == 0 && 1665 if (mtu == 0 &&
1670 old_mtu >= dst_mtu(&rth->u.dst) && 1666 old_mtu >= dst_mtu(&rth->dst) &&
1671 old_mtu >= 68 + (iph->ihl << 2)) 1667 old_mtu >= 68 + (iph->ihl << 2))
1672 old_mtu -= iph->ihl << 2; 1668 old_mtu -= iph->ihl << 2;
1673 1669
1674 mtu = guess_mtu(old_mtu); 1670 mtu = guess_mtu(old_mtu);
1675 } 1671 }
1676 if (mtu <= dst_mtu(&rth->u.dst)) { 1672 if (mtu <= dst_mtu(&rth->dst)) {
1677 if (mtu < dst_mtu(&rth->u.dst)) { 1673 if (mtu < dst_mtu(&rth->dst)) {
1678 dst_confirm(&rth->u.dst); 1674 dst_confirm(&rth->dst);
1679 if (mtu < ip_rt_min_pmtu) { 1675 if (mtu < ip_rt_min_pmtu) {
1680 mtu = ip_rt_min_pmtu; 1676 mtu = ip_rt_min_pmtu;
1681 rth->u.dst.metrics[RTAX_LOCK-1] |= 1677 rth->dst.metrics[RTAX_LOCK-1] |=
1682 (1 << RTAX_MTU); 1678 (1 << RTAX_MTU);
1683 } 1679 }
1684 rth->u.dst.metrics[RTAX_MTU-1] = mtu; 1680 rth->dst.metrics[RTAX_MTU-1] = mtu;
1685 dst_set_expires(&rth->u.dst, 1681 dst_set_expires(&rth->dst,
1686 ip_rt_mtu_expires); 1682 ip_rt_mtu_expires);
1687 } 1683 }
1688 est_mtu = mtu; 1684 est_mtu = mtu;
@@ -1755,7 +1751,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
1755 1751
1756 rt = skb_rtable(skb); 1752 rt = skb_rtable(skb);
1757 if (rt) 1753 if (rt)
1758 dst_set_expires(&rt->u.dst, 0); 1754 dst_set_expires(&rt->dst, 0);
1759} 1755}
1760 1756
1761static int ip_rt_bug(struct sk_buff *skb) 1757static int ip_rt_bug(struct sk_buff *skb)
@@ -1783,11 +1779,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1783 1779
1784 if (rt->fl.iif == 0) 1780 if (rt->fl.iif == 0)
1785 src = rt->rt_src; 1781 src = rt->rt_src;
1786 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { 1782 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
1787 src = FIB_RES_PREFSRC(res); 1783 src = FIB_RES_PREFSRC(res);
1788 fib_res_put(&res); 1784 fib_res_put(&res);
1789 } else 1785 } else
1790 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, 1786 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1791 RT_SCOPE_UNIVERSE); 1787 RT_SCOPE_UNIVERSE);
1792 memcpy(addr, &src, 4); 1788 memcpy(addr, &src, 4);
1793} 1789}
@@ -1795,10 +1791,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1795#ifdef CONFIG_NET_CLS_ROUTE 1791#ifdef CONFIG_NET_CLS_ROUTE
1796static void set_class_tag(struct rtable *rt, u32 tag) 1792static void set_class_tag(struct rtable *rt, u32 tag)
1797{ 1793{
1798 if (!(rt->u.dst.tclassid & 0xFFFF)) 1794 if (!(rt->dst.tclassid & 0xFFFF))
1799 rt->u.dst.tclassid |= tag & 0xFFFF; 1795 rt->dst.tclassid |= tag & 0xFFFF;
1800 if (!(rt->u.dst.tclassid & 0xFFFF0000)) 1796 if (!(rt->dst.tclassid & 0xFFFF0000))
1801 rt->u.dst.tclassid |= tag & 0xFFFF0000; 1797 rt->dst.tclassid |= tag & 0xFFFF0000;
1802} 1798}
1803#endif 1799#endif
1804 1800
@@ -1810,30 +1806,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1810 if (FIB_RES_GW(*res) && 1806 if (FIB_RES_GW(*res) &&
1811 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1807 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1812 rt->rt_gateway = FIB_RES_GW(*res); 1808 rt->rt_gateway = FIB_RES_GW(*res);
1813 memcpy(rt->u.dst.metrics, fi->fib_metrics, 1809 memcpy(rt->dst.metrics, fi->fib_metrics,
1814 sizeof(rt->u.dst.metrics)); 1810 sizeof(rt->dst.metrics));
1815 if (fi->fib_mtu == 0) { 1811 if (fi->fib_mtu == 0) {
1816 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; 1812 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1817 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && 1813 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1818 rt->rt_gateway != rt->rt_dst && 1814 rt->rt_gateway != rt->rt_dst &&
1819 rt->u.dst.dev->mtu > 576) 1815 rt->dst.dev->mtu > 576)
1820 rt->u.dst.metrics[RTAX_MTU-1] = 576; 1816 rt->dst.metrics[RTAX_MTU-1] = 576;
1821 } 1817 }
1822#ifdef CONFIG_NET_CLS_ROUTE 1818#ifdef CONFIG_NET_CLS_ROUTE
1823 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1819 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1824#endif 1820#endif
1825 } else 1821 } else
1826 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; 1822 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1827 1823
1828 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) 1824 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1829 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1825 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1830 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) 1826 if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1831 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; 1827 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1832 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) 1828 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1833 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, 1829 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1834 ip_rt_min_advmss); 1830 ip_rt_min_advmss);
1835 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) 1831 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1836 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; 1832 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1837 1833
1838#ifdef CONFIG_NET_CLS_ROUTE 1834#ifdef CONFIG_NET_CLS_ROUTE
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1835#ifdef CONFIG_IP_MULTIPLE_TABLES
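
rt_set_nexthop() above fills in metric defaults the FIB entry did not supply: the device MTU (capped at 576 when the MTU metric is locked and the route goes through a gateway), a hoplimit from the ip_default_ttl sysctl, an IP_MAX_MTU cap, and an advertised MSS of device MTU minus 40 header bytes, bounded below by ip_rt_min_advmss and above by 65535 - 40. A small sketch of just the ADVMSS arithmetic, with illustrative parameter names:

    #include <stdio.h>

    /* default advertised MSS from a device MTU, mirroring the RTAX_ADVMSS
     * logic above: mtu minus 40 bytes of IPv4+TCP headers, floored at the
     * ip_rt_min_advmss sysctl and capped at 65535 - 40 */
    static unsigned default_advmss(unsigned dev_mtu, unsigned min_advmss)
    {
        unsigned advmss = dev_mtu > 40 ? dev_mtu - 40 : 0;

        if (advmss < min_advmss)
            advmss = min_advmss;
        if (advmss > 65535 - 40)
            advmss = 65535 - 40;
        return advmss;
    }

    int main(void)
    {
        printf("%u\n", default_advmss(1500, 256));  /* -> 1460 */
        printf("%u\n", default_advmss(68, 256));    /* -> 256 (floor) */
        return 0;
    }
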
@@ -1844,14 +1840,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1844 rt->rt_type = res->type; 1840 rt->rt_type = res->type;
1845} 1841}
1846 1842
1843/* called in rcu_read_lock() section */
1847static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1844static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1848 u8 tos, struct net_device *dev, int our) 1845 u8 tos, struct net_device *dev, int our)
1849{ 1846{
1850 unsigned hash; 1847 unsigned int hash;
1851 struct rtable *rth; 1848 struct rtable *rth;
1852 __be32 spec_dst; 1849 __be32 spec_dst;
1853 struct in_device *in_dev = in_dev_get(dev); 1850 struct in_device *in_dev = __in_dev_get_rcu(dev);
1854 u32 itag = 0; 1851 u32 itag = 0;
1852 int err;
1855 1853
1856 /* Primary sanity checks. */ 1854 /* Primary sanity checks. */
1857 1855
@@ -1866,21 +1864,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1866 if (!ipv4_is_local_multicast(daddr)) 1864 if (!ipv4_is_local_multicast(daddr))
1867 goto e_inval; 1865 goto e_inval;
1868 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1869 } else if (fib_validate_source(saddr, 0, tos, 0, 1867 } else {
1870 dev, &spec_dst, &itag, 0) < 0) 1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1871 goto e_inval; 1869 &itag, 0);
1872 1870 if (err < 0)
1871 goto e_err;
1872 }
1873 rth = dst_alloc(&ipv4_dst_ops); 1873 rth = dst_alloc(&ipv4_dst_ops);
1874 if (!rth) 1874 if (!rth)
1875 goto e_nobufs; 1875 goto e_nobufs;
1876 1876
1877 rth->u.dst.output = ip_rt_bug; 1877 rth->dst.output = ip_rt_bug;
1878 rth->u.dst.obsolete = -1; 1878 rth->dst.obsolete = -1;
1879 1879
1880 atomic_set(&rth->u.dst.__refcnt, 1); 1880 atomic_set(&rth->dst.__refcnt, 1);
1881 rth->u.dst.flags= DST_HOST; 1881 rth->dst.flags= DST_HOST;
1882 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 1882 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883 rth->u.dst.flags |= DST_NOPOLICY; 1883 rth->dst.flags |= DST_NOPOLICY;
1884 rth->fl.fl4_dst = daddr; 1884 rth->fl.fl4_dst = daddr;
1885 rth->rt_dst = daddr; 1885 rth->rt_dst = daddr;
1886 rth->fl.fl4_tos = tos; 1886 rth->fl.fl4_tos = tos;
@@ -1888,13 +1888,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1888 rth->fl.fl4_src = saddr; 1888 rth->fl.fl4_src = saddr;
1889 rth->rt_src = saddr; 1889 rth->rt_src = saddr;
1890#ifdef CONFIG_NET_CLS_ROUTE 1890#ifdef CONFIG_NET_CLS_ROUTE
1891 rth->u.dst.tclassid = itag; 1891 rth->dst.tclassid = itag;
1892#endif 1892#endif
1893 rth->rt_iif = 1893 rth->rt_iif =
1894 rth->fl.iif = dev->ifindex; 1894 rth->fl.iif = dev->ifindex;
1895 rth->u.dst.dev = init_net.loopback_dev; 1895 rth->dst.dev = init_net.loopback_dev;
1896 dev_hold(rth->u.dst.dev); 1896 dev_hold(rth->dst.dev);
1897 rth->idev = in_dev_get(rth->u.dst.dev); 1897 rth->idev = in_dev_get(rth->dst.dev);
1898 rth->fl.oif = 0; 1898 rth->fl.oif = 0;
1899 rth->rt_gateway = daddr; 1899 rth->rt_gateway = daddr;
1900 rth->rt_spec_dst= spec_dst; 1900 rth->rt_spec_dst= spec_dst;
@@ -1902,27 +1902,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1902 rth->rt_flags = RTCF_MULTICAST; 1902 rth->rt_flags = RTCF_MULTICAST;
1903 rth->rt_type = RTN_MULTICAST; 1903 rth->rt_type = RTN_MULTICAST;
1904 if (our) { 1904 if (our) {
1905 rth->u.dst.input= ip_local_deliver; 1905 rth->dst.input= ip_local_deliver;
1906 rth->rt_flags |= RTCF_LOCAL; 1906 rth->rt_flags |= RTCF_LOCAL;
1907 } 1907 }
1908 1908
1909#ifdef CONFIG_IP_MROUTE 1909#ifdef CONFIG_IP_MROUTE
1910 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) 1910 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911 rth->u.dst.input = ip_mr_input; 1911 rth->dst.input = ip_mr_input;
1912#endif 1912#endif
1913 RT_CACHE_STAT_INC(in_slow_mc); 1913 RT_CACHE_STAT_INC(in_slow_mc);
1914 1914
1915 in_dev_put(in_dev);
1916 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1915 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1917 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1916 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1918 1917
1919e_nobufs: 1918e_nobufs:
1920 in_dev_put(in_dev);
1921 return -ENOBUFS; 1919 return -ENOBUFS;
1922
1923e_inval: 1920e_inval:
1924 in_dev_put(in_dev);
1925 return -EINVAL; 1921 return -EINVAL;
1922e_err:
1923 return err;
1926} 1924}
1927 1925
1928 1926
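
From here on ip_route_input_mc() is documented as running inside the caller's rcu_read_lock() section, so the refcounted in_dev_get()/in_dev_put() pair gives way to __in_dev_get_rcu() and the puts on every exit path disappear; fib_validate_source() failures are also propagated through the new e_err label instead of being folded into -EINVAL. The underlying idea is that an RCU reader may dereference a shared pointer without taking a reference, as long as it stays inside its read-side section. A much-simplified user-space analogue with C11 acquire/release atomics (real RCU also defers reclamation, which this sketch omits):

    #include <stdatomic.h>
    #include <stdio.h>

    struct in_device_stub { int ifindex; };   /* illustrative stand-in */

    static _Atomic(struct in_device_stub *) shared_indev;

    /* analogue of rcu_assign_pointer(): publish with release ordering */
    static void publish(struct in_device_stub *p)
    {
        atomic_store_explicit(&shared_indev, p, memory_order_release);
    }

    /* analogue of __in_dev_get_rcu(): an acquire load, no reference taken;
     * the result is only valid inside the caller's read-side section */
    static struct in_device_stub *get_rcu(void)
    {
        return atomic_load_explicit(&shared_indev, memory_order_acquire);
    }

    int main(void)
    {
        static struct in_device_stub dev = { .ifindex = 2 };
        struct in_device_stub *p;

        publish(&dev);
        p = get_rcu();
        if (p)
            printf("ifindex=%d\n", p->ifindex);
        return 0;
    }
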
@@ -1956,22 +1954,22 @@ static void ip_handle_martian_source(struct net_device *dev,
1956#endif 1954#endif
1957} 1955}
1958 1956
1957/* called in rcu_read_lock() section */
1959static int __mkroute_input(struct sk_buff *skb, 1958static int __mkroute_input(struct sk_buff *skb,
1960 struct fib_result *res, 1959 struct fib_result *res,
1961 struct in_device *in_dev, 1960 struct in_device *in_dev,
1962 __be32 daddr, __be32 saddr, u32 tos, 1961 __be32 daddr, __be32 saddr, u32 tos,
1963 struct rtable **result) 1962 struct rtable **result)
1964{ 1963{
1965
1966 struct rtable *rth; 1964 struct rtable *rth;
1967 int err; 1965 int err;
1968 struct in_device *out_dev; 1966 struct in_device *out_dev;
1969 unsigned flags = 0; 1967 unsigned int flags = 0;
1970 __be32 spec_dst; 1968 __be32 spec_dst;
1971 u32 itag; 1969 u32 itag;
1972 1970
1973 /* get a working reference to the output device */ 1971 /* get a working reference to the output device */
1974 out_dev = in_dev_get(FIB_RES_DEV(*res)); 1972 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1975 if (out_dev == NULL) { 1973 if (out_dev == NULL) {
1976 if (net_ratelimit()) 1974 if (net_ratelimit())
1977 printk(KERN_CRIT "Bug in ip_route_input" \ 1975 printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1986,7 +1984,6 @@ static int __mkroute_input(struct sk_buff *skb,
1986 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1984 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1987 saddr); 1985 saddr);
1988 1986
1989 err = -EINVAL;
1990 goto cleanup; 1987 goto cleanup;
1991 } 1988 }
1992 1989
@@ -2020,12 +2017,12 @@ static int __mkroute_input(struct sk_buff *skb,
2020 goto cleanup; 2017 goto cleanup;
2021 } 2018 }
2022 2019
2023 atomic_set(&rth->u.dst.__refcnt, 1); 2020 atomic_set(&rth->dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST; 2021 rth->dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2022 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->u.dst.flags |= DST_NOPOLICY; 2023 rth->dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM)) 2024 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->u.dst.flags |= DST_NOXFRM; 2025 rth->dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr; 2026 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr; 2027 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos; 2028 rth->fl.fl4_tos = tos;
@@ -2035,16 +2032,16 @@ static int __mkroute_input(struct sk_buff *skb,
2035 rth->rt_gateway = daddr; 2032 rth->rt_gateway = daddr;
2036 rth->rt_iif = 2033 rth->rt_iif =
2037 rth->fl.iif = in_dev->dev->ifindex; 2034 rth->fl.iif = in_dev->dev->ifindex;
2038 rth->u.dst.dev = (out_dev)->dev; 2035 rth->dst.dev = (out_dev)->dev;
2039 dev_hold(rth->u.dst.dev); 2036 dev_hold(rth->dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev); 2037 rth->idev = in_dev_get(rth->dst.dev);
2041 rth->fl.oif = 0; 2038 rth->fl.oif = 0;
2042 rth->rt_spec_dst= spec_dst; 2039 rth->rt_spec_dst= spec_dst;
2043 2040
2044 rth->u.dst.obsolete = -1; 2041 rth->dst.obsolete = -1;
2045 rth->u.dst.input = ip_forward; 2042 rth->dst.input = ip_forward;
2046 rth->u.dst.output = ip_output; 2043 rth->dst.output = ip_output;
2047 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); 2044 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2048 2045
2049 rt_set_nexthop(rth, res, itag); 2046 rt_set_nexthop(rth, res, itag);
2050 2047
@@ -2053,8 +2050,6 @@ static int __mkroute_input(struct sk_buff *skb,
2053 *result = rth; 2050 *result = rth;
2054 err = 0; 2051 err = 0;
2055 cleanup: 2052 cleanup:
2056 /* release the working reference to the output device */
2057 in_dev_put(out_dev);
2058 return err; 2053 return err;
2059} 2054}
2060 2055
@@ -2080,7 +2075,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2080 2075
2081 /* put it into the cache */ 2076 /* put it into the cache */
2082 hash = rt_hash(daddr, saddr, fl->iif, 2077 hash = rt_hash(daddr, saddr, fl->iif,
2083 rt_genid(dev_net(rth->u.dst.dev))); 2078 rt_genid(dev_net(rth->dst.dev)));
2084 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2079 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2085} 2080}
2086 2081
@@ -2098,7 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098 u8 tos, struct net_device *dev) 2093 u8 tos, struct net_device *dev)
2099{ 2094{
2100 struct fib_result res; 2095 struct fib_result res;
2101 struct in_device *in_dev = in_dev_get(dev); 2096 struct in_device *in_dev = __in_dev_get_rcu(dev);
2102 struct flowi fl = { .nl_u = { .ip4_u = 2097 struct flowi fl = { .nl_u = { .ip4_u =
2103 { .daddr = daddr, 2098 { .daddr = daddr,
2104 .saddr = saddr, 2099 .saddr = saddr,
@@ -2158,13 +2153,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2158 goto brd_input; 2153 goto brd_input;
2159 2154
2160 if (res.type == RTN_LOCAL) { 2155 if (res.type == RTN_LOCAL) {
2161 int result; 2156 err = fib_validate_source(saddr, daddr, tos,
2162 result = fib_validate_source(saddr, daddr, tos,
2163 net->loopback_dev->ifindex, 2157 net->loopback_dev->ifindex,
2164 dev, &spec_dst, &itag, skb->mark); 2158 dev, &spec_dst, &itag, skb->mark);
2165 if (result < 0) 2159 if (err < 0)
2166 goto martian_source; 2160 goto martian_source_keep_err;
2167 if (result) 2161 if (err)
2168 flags |= RTCF_DIRECTSRC; 2162 flags |= RTCF_DIRECTSRC;
2169 spec_dst = daddr; 2163 spec_dst = daddr;
2170 goto local_input; 2164 goto local_input;
@@ -2177,7 +2171,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2177 2171
2178 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2179done: 2173done:
2180 in_dev_put(in_dev);
2181 if (free_res) 2174 if (free_res)
2182 fib_res_put(&res); 2175 fib_res_put(&res);
2183out: return err; 2176out: return err;
@@ -2192,7 +2185,7 @@ brd_input:
2192 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2185 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2193 &itag, skb->mark); 2186 &itag, skb->mark);
2194 if (err < 0) 2187 if (err < 0)
2195 goto martian_source; 2188 goto martian_source_keep_err;
2196 if (err) 2189 if (err)
2197 flags |= RTCF_DIRECTSRC; 2190 flags |= RTCF_DIRECTSRC;
2198 } 2191 }
@@ -2205,14 +2198,14 @@ local_input:
2205 if (!rth) 2198 if (!rth)
2206 goto e_nobufs; 2199 goto e_nobufs;
2207 2200
2208 rth->u.dst.output= ip_rt_bug; 2201 rth->dst.output= ip_rt_bug;
2209 rth->u.dst.obsolete = -1; 2202 rth->dst.obsolete = -1;
2210 rth->rt_genid = rt_genid(net); 2203 rth->rt_genid = rt_genid(net);
2211 2204
2212 atomic_set(&rth->u.dst.__refcnt, 1); 2205 atomic_set(&rth->dst.__refcnt, 1);
2213 rth->u.dst.flags= DST_HOST; 2206 rth->dst.flags= DST_HOST;
2214 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2207 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2215 rth->u.dst.flags |= DST_NOPOLICY; 2208 rth->dst.flags |= DST_NOPOLICY;
2216 rth->fl.fl4_dst = daddr; 2209 rth->fl.fl4_dst = daddr;
2217 rth->rt_dst = daddr; 2210 rth->rt_dst = daddr;
2218 rth->fl.fl4_tos = tos; 2211 rth->fl.fl4_tos = tos;
@@ -2220,20 +2213,20 @@ local_input:
2220 rth->fl.fl4_src = saddr; 2213 rth->fl.fl4_src = saddr;
2221 rth->rt_src = saddr; 2214 rth->rt_src = saddr;
2222#ifdef CONFIG_NET_CLS_ROUTE 2215#ifdef CONFIG_NET_CLS_ROUTE
2223 rth->u.dst.tclassid = itag; 2216 rth->dst.tclassid = itag;
2224#endif 2217#endif
2225 rth->rt_iif = 2218 rth->rt_iif =
2226 rth->fl.iif = dev->ifindex; 2219 rth->fl.iif = dev->ifindex;
2227 rth->u.dst.dev = net->loopback_dev; 2220 rth->dst.dev = net->loopback_dev;
2228 dev_hold(rth->u.dst.dev); 2221 dev_hold(rth->dst.dev);
2229 rth->idev = in_dev_get(rth->u.dst.dev); 2222 rth->idev = in_dev_get(rth->dst.dev);
2230 rth->rt_gateway = daddr; 2223 rth->rt_gateway = daddr;
2231 rth->rt_spec_dst= spec_dst; 2224 rth->rt_spec_dst= spec_dst;
2232 rth->u.dst.input= ip_local_deliver; 2225 rth->dst.input= ip_local_deliver;
2233 rth->rt_flags = flags|RTCF_LOCAL; 2226 rth->rt_flags = flags|RTCF_LOCAL;
2234 if (res.type == RTN_UNREACHABLE) { 2227 if (res.type == RTN_UNREACHABLE) {
2235 rth->u.dst.input= ip_error; 2228 rth->dst.input= ip_error;
2236 rth->u.dst.error= -err; 2229 rth->dst.error= -err;
2237 rth->rt_flags &= ~RTCF_LOCAL; 2230 rth->rt_flags &= ~RTCF_LOCAL;
2238 } 2231 }
2239 rth->rt_type = res.type; 2232 rth->rt_type = res.type;
@@ -2273,8 +2266,10 @@ e_nobufs:
2273 goto done; 2266 goto done;
2274 2267
2275martian_source: 2268martian_source:
2269 err = -EINVAL;
2270martian_source_keep_err:
2276 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2277 goto e_inval; 2272 goto done;
2278} 2273}
2279 2274
2280int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
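
The martian_source rework above is about error fidelity: every martian source used to collapse to -EINVAL, whereas now the value returned by fib_validate_source() is preserved through the martian_source_keep_err label, while the plain martian_source label sets the -EINVAL default first for callers that have no errno of their own. The two-label pattern in miniature (label names from the diff, bodies illustrative):

    #include <errno.h>
    #include <stdio.h>

    /* illustrative validator: 0 on success, negative errno on failure */
    static int validate(int src) { return src < 0 ? -EXDEV : 0; }

    static int route_input(int src, int have_route)
    {
        int err = validate(src);

        if (err < 0)
            goto martian_source_keep_err;   /* keep the validator's errno */
        if (!have_route)
            goto martian_source;            /* no errno of its own */
        return 0;

    martian_source:
        err = -EINVAL;                      /* default for the other callers */
    martian_source_keep_err:
        fprintf(stderr, "martian source, err=%d\n", err);
        return err;
    }

    int main(void)
    {
        return route_input(-1, 1) == -EXDEV ? 0 : 1;
    }
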
@@ -2284,32 +2279,34 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2284 unsigned hash; 2279 unsigned hash;
2285 int iif = dev->ifindex; 2280 int iif = dev->ifindex;
2286 struct net *net; 2281 struct net *net;
2282 int res;
2287 2283
2288 net = dev_net(dev); 2284 net = dev_net(dev);
2289 2285
2286 rcu_read_lock();
2287
2290 if (!rt_caching(net)) 2288 if (!rt_caching(net))
2291 goto skip_cache; 2289 goto skip_cache;
2292 2290
2293 tos &= IPTOS_RT_MASK; 2291 tos &= IPTOS_RT_MASK;
2294 hash = rt_hash(daddr, saddr, iif, rt_genid(net)); 2292 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2295 2293
2296 rcu_read_lock();
2297 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2294 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2298 rth = rcu_dereference(rth->u.dst.rt_next)) { 2295 rth = rcu_dereference(rth->dst.rt_next)) {
2299 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2296 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2300 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2297 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2301 (rth->fl.iif ^ iif) | 2298 (rth->fl.iif ^ iif) |
2302 rth->fl.oif | 2299 rth->fl.oif |
2303 (rth->fl.fl4_tos ^ tos)) == 0 && 2300 (rth->fl.fl4_tos ^ tos)) == 0 &&
2304 rth->fl.mark == skb->mark && 2301 rth->fl.mark == skb->mark &&
2305 net_eq(dev_net(rth->u.dst.dev), net) && 2302 net_eq(dev_net(rth->dst.dev), net) &&
2306 !rt_is_expired(rth)) { 2303 !rt_is_expired(rth)) {
2307 if (noref) { 2304 if (noref) {
2308 dst_use_noref(&rth->u.dst, jiffies); 2305 dst_use_noref(&rth->dst, jiffies);
2309 skb_dst_set_noref(skb, &rth->u.dst); 2306 skb_dst_set_noref(skb, &rth->dst);
2310 } else { 2307 } else {
2311 dst_use(&rth->u.dst, jiffies); 2308 dst_use(&rth->dst, jiffies);
2312 skb_dst_set(skb, &rth->u.dst); 2309 skb_dst_set(skb, &rth->dst);
2313 } 2310 }
2314 RT_CACHE_STAT_INC(in_hit); 2311 RT_CACHE_STAT_INC(in_hit);
2315 rcu_read_unlock(); 2312 rcu_read_unlock();
@@ -2317,7 +2314,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2317 } 2314 }
2318 RT_CACHE_STAT_INC(in_hlist_search); 2315 RT_CACHE_STAT_INC(in_hlist_search);
2319 } 2316 }
2320 rcu_read_unlock();
2321 2317
2322skip_cache: 2318skip_cache:
2323 /* Multicast recognition logic is moved from route cache to here. 2319 /* Multicast recognition logic is moved from route cache to here.
@@ -2332,12 +2328,11 @@ skip_cache:
2332 route cache entry is created eventually. 2328 route cache entry is created eventually.
2333 */ 2329 */
2334 if (ipv4_is_multicast(daddr)) { 2330 if (ipv4_is_multicast(daddr)) {
2335 struct in_device *in_dev; 2331 struct in_device *in_dev = __in_dev_get_rcu(dev);
2336 2332
2337 rcu_read_lock(); 2333 if (in_dev) {
2338 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2339 int our = ip_check_mc(in_dev, daddr, saddr, 2334 int our = ip_check_mc(in_dev, daddr, saddr,
2340 ip_hdr(skb)->protocol); 2335 ip_hdr(skb)->protocol);
2341 if (our 2336 if (our
2342#ifdef CONFIG_IP_MROUTE 2337#ifdef CONFIG_IP_MROUTE
2343 || 2338 ||
@@ -2345,15 +2340,18 @@ skip_cache:
2345 IN_DEV_MFORWARD(in_dev)) 2340 IN_DEV_MFORWARD(in_dev))
2346#endif 2341#endif
2347 ) { 2342 ) {
2343 int res = ip_route_input_mc(skb, daddr, saddr,
2344 tos, dev, our);
2348 rcu_read_unlock(); 2345 rcu_read_unlock();
2349 return ip_route_input_mc(skb, daddr, saddr, 2346 return res;
2350 tos, dev, our);
2351 } 2347 }
2352 } 2348 }
2353 rcu_read_unlock(); 2349 rcu_read_unlock();
2354 return -EINVAL; 2350 return -EINVAL;
2355 } 2351 }
2356 return ip_route_input_slow(skb, daddr, saddr, tos, dev); 2352 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2353 rcu_read_unlock();
2354 return res;
2357} 2355}
2358EXPORT_SYMBOL(ip_route_input_common); 2356EXPORT_SYMBOL(ip_route_input_common);
2359 2357
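
ip_route_input_common() now takes rcu_read_lock() once at function entry instead of only around the cache walk, so the multicast check and the slow path run under the same read-side section; every return path unlocks, and anything computed under the lock is copied into a local (res) before the unlock so nothing RCU-protected is touched afterwards. The shape of that, with the lock reduced to stubs:

    #include <stdio.h>

    static void rcu_read_lock_stub(void)   { /* enter read-side section */ }
    static void rcu_read_unlock_stub(void) { /* leave read-side section */ }

    static int slow_path(void) { return 42; }   /* illustrative */

    static int lookup(void)
    {
        int res;

        rcu_read_lock_stub();
        /* cache walk and multicast checks would happen here */
        res = slow_path();       /* computed while still under the lock */
        rcu_read_unlock_stub();  /* unlock only after res is a local copy */
        return res;
    }

    int main(void)
    {
        printf("%d\n", lookup());
        return 0;
    }
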
@@ -2415,12 +2413,12 @@ static int __mkroute_output(struct rtable **result,
2415 goto cleanup; 2413 goto cleanup;
2416 } 2414 }
2417 2415
2418 atomic_set(&rth->u.dst.__refcnt, 1); 2416 atomic_set(&rth->dst.__refcnt, 1);
2419 rth->u.dst.flags= DST_HOST; 2417 rth->dst.flags= DST_HOST;
2420 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2418 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2421 rth->u.dst.flags |= DST_NOXFRM; 2419 rth->dst.flags |= DST_NOXFRM;
2422 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2420 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2423 rth->u.dst.flags |= DST_NOPOLICY; 2421 rth->dst.flags |= DST_NOPOLICY;
2424 2422
2425 rth->fl.fl4_dst = oldflp->fl4_dst; 2423 rth->fl.fl4_dst = oldflp->fl4_dst;
2426 rth->fl.fl4_tos = tos; 2424 rth->fl.fl4_tos = tos;
@@ -2432,35 +2430,35 @@ static int __mkroute_output(struct rtable **result,
2432 rth->rt_iif = oldflp->oif ? : dev_out->ifindex; 2430 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2433 /* get references to the devices that are to be held by the routing 2431 /* get references to the devices that are to be held by the routing
2434 cache entry */ 2432 cache entry */
2435 rth->u.dst.dev = dev_out; 2433 rth->dst.dev = dev_out;
2436 dev_hold(dev_out); 2434 dev_hold(dev_out);
2437 rth->idev = in_dev_get(dev_out); 2435 rth->idev = in_dev_get(dev_out);
2438 rth->rt_gateway = fl->fl4_dst; 2436 rth->rt_gateway = fl->fl4_dst;
2439 rth->rt_spec_dst= fl->fl4_src; 2437 rth->rt_spec_dst= fl->fl4_src;
2440 2438
2441 rth->u.dst.output=ip_output; 2439 rth->dst.output=ip_output;
2442 rth->u.dst.obsolete = -1; 2440 rth->dst.obsolete = -1;
2443 rth->rt_genid = rt_genid(dev_net(dev_out)); 2441 rth->rt_genid = rt_genid(dev_net(dev_out));
2444 2442
2445 RT_CACHE_STAT_INC(out_slow_tot); 2443 RT_CACHE_STAT_INC(out_slow_tot);
2446 2444
2447 if (flags & RTCF_LOCAL) { 2445 if (flags & RTCF_LOCAL) {
2448 rth->u.dst.input = ip_local_deliver; 2446 rth->dst.input = ip_local_deliver;
2449 rth->rt_spec_dst = fl->fl4_dst; 2447 rth->rt_spec_dst = fl->fl4_dst;
2450 } 2448 }
2451 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2449 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2452 rth->rt_spec_dst = fl->fl4_src; 2450 rth->rt_spec_dst = fl->fl4_src;
2453 if (flags & RTCF_LOCAL && 2451 if (flags & RTCF_LOCAL &&
2454 !(dev_out->flags & IFF_LOOPBACK)) { 2452 !(dev_out->flags & IFF_LOOPBACK)) {
2455 rth->u.dst.output = ip_mc_output; 2453 rth->dst.output = ip_mc_output;
2456 RT_CACHE_STAT_INC(out_slow_mc); 2454 RT_CACHE_STAT_INC(out_slow_mc);
2457 } 2455 }
2458#ifdef CONFIG_IP_MROUTE 2456#ifdef CONFIG_IP_MROUTE
2459 if (res->type == RTN_MULTICAST) { 2457 if (res->type == RTN_MULTICAST) {
2460 if (IN_DEV_MFORWARD(in_dev) && 2458 if (IN_DEV_MFORWARD(in_dev) &&
2461 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2459 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2462 rth->u.dst.input = ip_mr_input; 2460 rth->dst.input = ip_mr_input;
2463 rth->u.dst.output = ip_mc_output; 2461 rth->dst.output = ip_mc_output;
2464 } 2462 }
2465 } 2463 }
2466#endif 2464#endif
@@ -2715,7 +2713,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2715 2713
2716 rcu_read_lock_bh(); 2714 rcu_read_lock_bh();
2717 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2715 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2718 rth = rcu_dereference_bh(rth->u.dst.rt_next)) { 2716 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2719 if (rth->fl.fl4_dst == flp->fl4_dst && 2717 if (rth->fl.fl4_dst == flp->fl4_dst &&
2720 rth->fl.fl4_src == flp->fl4_src && 2718 rth->fl.fl4_src == flp->fl4_src &&
2721 rth->fl.iif == 0 && 2719 rth->fl.iif == 0 &&
@@ -2723,9 +2721,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2723 rth->fl.mark == flp->mark && 2721 rth->fl.mark == flp->mark &&
2724 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2722 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2725 (IPTOS_RT_MASK | RTO_ONLINK)) && 2723 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2726 net_eq(dev_net(rth->u.dst.dev), net) && 2724 net_eq(dev_net(rth->dst.dev), net) &&
2727 !rt_is_expired(rth)) { 2725 !rt_is_expired(rth)) {
2728 dst_use(&rth->u.dst, jiffies); 2726 dst_use(&rth->dst, jiffies);
2729 RT_CACHE_STAT_INC(out_hit); 2727 RT_CACHE_STAT_INC(out_hit);
2730 rcu_read_unlock_bh(); 2728 rcu_read_unlock_bh();
2731 *rp = rth; 2729 *rp = rth;
@@ -2738,7 +2736,6 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2738slow_output: 2736slow_output:
2739 return ip_route_output_slow(net, rp, flp); 2737 return ip_route_output_slow(net, rp, flp);
2740} 2738}
2741
2742EXPORT_SYMBOL_GPL(__ip_route_output_key); 2739EXPORT_SYMBOL_GPL(__ip_route_output_key);
2743 2740
2744static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2741static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -2762,15 +2759,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2762 dst_alloc(&ipv4_dst_blackhole_ops); 2759 dst_alloc(&ipv4_dst_blackhole_ops);
2763 2760
2764 if (rt) { 2761 if (rt) {
2765 struct dst_entry *new = &rt->u.dst; 2762 struct dst_entry *new = &rt->dst;
2766 2763
2767 atomic_set(&new->__refcnt, 1); 2764 atomic_set(&new->__refcnt, 1);
2768 new->__use = 1; 2765 new->__use = 1;
2769 new->input = dst_discard; 2766 new->input = dst_discard;
2770 new->output = dst_discard; 2767 new->output = dst_discard;
2771 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); 2768 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2772 2769
2773 new->dev = ort->u.dst.dev; 2770 new->dev = ort->dst.dev;
2774 if (new->dev) 2771 if (new->dev)
2775 dev_hold(new->dev); 2772 dev_hold(new->dev);
2776 2773
@@ -2794,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2794 dst_free(new); 2791 dst_free(new);
2795 } 2792 }
2796 2793
2797 dst_release(&(*rp)->u.dst); 2794 dst_release(&(*rp)->dst);
2798 *rp = rt; 2795 *rp = rt;
2799 return (rt ? 0 : -ENOMEM); 2796 return (rt ? 0 : -ENOMEM);
2800} 2797}
@@ -2822,13 +2819,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2822 2819
2823 return 0; 2820 return 0;
2824} 2821}
2825
2826EXPORT_SYMBOL_GPL(ip_route_output_flow); 2822EXPORT_SYMBOL_GPL(ip_route_output_flow);
2827 2823
2828int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) 2824int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2829{ 2825{
2830 return ip_route_output_flow(net, rp, flp, NULL, 0); 2826 return ip_route_output_flow(net, rp, flp, NULL, 0);
2831} 2827}
2828EXPORT_SYMBOL(ip_route_output_key);
2832 2829
2833static int rt_fill_info(struct net *net, 2830static int rt_fill_info(struct net *net,
2834 struct sk_buff *skb, u32 pid, u32 seq, int event, 2831 struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2864,11 +2861,11 @@ static int rt_fill_info(struct net *net,
2864 r->rtm_src_len = 32; 2861 r->rtm_src_len = 32;
2865 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2862 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2866 } 2863 }
2867 if (rt->u.dst.dev) 2864 if (rt->dst.dev)
2868 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); 2865 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2869#ifdef CONFIG_NET_CLS_ROUTE 2866#ifdef CONFIG_NET_CLS_ROUTE
2870 if (rt->u.dst.tclassid) 2867 if (rt->dst.tclassid)
2871 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); 2868 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2872#endif 2869#endif
2873 if (rt->fl.iif) 2870 if (rt->fl.iif)
2874 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2871 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2878,12 +2875,13 @@ static int rt_fill_info(struct net *net,
2878 if (rt->rt_dst != rt->rt_gateway) 2875 if (rt->rt_dst != rt->rt_gateway)
2879 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2876 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2880 2877
2881 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) 2878 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2882 goto nla_put_failure; 2879 goto nla_put_failure;
2883 2880
2884 error = rt->u.dst.error; 2881 error = rt->dst.error;
2885 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; 2882 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2886 if (rt->peer) { 2883 if (rt->peer) {
2884 inet_peer_refcheck(rt->peer);
2887 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2885 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2888 if (rt->peer->tcp_ts_stamp) { 2886 if (rt->peer->tcp_ts_stamp) {
2889 ts = rt->peer->tcp_ts; 2887 ts = rt->peer->tcp_ts;
@@ -2914,7 +2912,7 @@ static int rt_fill_info(struct net *net,
2914 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2912 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2915 } 2913 }
2916 2914
2917 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, 2915 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2918 expires, error) < 0) 2916 expires, error) < 0)
2919 goto nla_put_failure; 2917 goto nla_put_failure;
2920 2918
@@ -2979,8 +2977,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2979 local_bh_enable(); 2977 local_bh_enable();
2980 2978
2981 rt = skb_rtable(skb); 2979 rt = skb_rtable(skb);
2982 if (err == 0 && rt->u.dst.error) 2980 if (err == 0 && rt->dst.error)
2983 err = -rt->u.dst.error; 2981 err = -rt->dst.error;
2984 } else { 2982 } else {
2985 struct flowi fl = { 2983 struct flowi fl = {
2986 .nl_u = { 2984 .nl_u = {
@@ -2998,7 +2996,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2998 if (err) 2996 if (err)
2999 goto errout_free; 2997 goto errout_free;
3000 2998
3001 skb_dst_set(skb, &rt->u.dst); 2999 skb_dst_set(skb, &rt->dst);
3002 if (rtm->rtm_flags & RTM_F_NOTIFY) 3000 if (rtm->rtm_flags & RTM_F_NOTIFY)
3003 rt->rt_flags |= RTCF_NOTIFY; 3001 rt->rt_flags |= RTCF_NOTIFY;
3004 3002
@@ -3034,12 +3032,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3034 continue; 3032 continue;
3035 rcu_read_lock_bh(); 3033 rcu_read_lock_bh();
3036 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; 3034 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3037 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { 3035 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3038 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) 3036 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3039 continue; 3037 continue;
3040 if (rt_is_expired(rt)) 3038 if (rt_is_expired(rt))
3041 continue; 3039 continue;
3042 skb_dst_set_noref(skb, &rt->u.dst); 3040 skb_dst_set_noref(skb, &rt->dst);
3043 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 3041 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3044 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 3042 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3045 1, NLM_F_MULTI) <= 0) { 3043 1, NLM_F_MULTI) <= 0) {
@@ -3365,6 +3363,3 @@ void __init ip_static_sysctl_init(void)
3365 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3363 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3366} 3364}
3367#endif 3365#endif
3368
3369EXPORT_SYMBOL(__ip_select_ident);
3370EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 5c24db4a3c91..650cace2180d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
18#include <net/tcp.h> 18#include <net/tcp.h>
19#include <net/route.h> 19#include <net/route.h>
20 20
21/* Timestamps: lowest 9 bits store TCP options */ 21/* Timestamps: lowest bits store TCP options */
22#define TSBITS 9 22#define TSBITS 6
23#define TSMASK (((__u32)1 << TSBITS) - 1) 23#define TSMASK (((__u32)1 << TSBITS) - 1)
24 24
25extern int sysctl_tcp_syncookies; 25extern int sysctl_tcp_syncookies;
@@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
58 58
59/* 59/*
60 * when syncookies are in effect and tcp timestamps are enabled we encode 60 * when syncookies are in effect and tcp timestamps are enabled we encode
61 * tcp options in the lowest 9 bits of the timestamp value that will be 61 * tcp options in the lower bits of the timestamp value that will be
62 * sent in the syn-ack. 62 * sent in the syn-ack.
63 * Since subsequent timestamps use the normal tcp_time_stamp value, we 63 * Since subsequent timestamps use the normal tcp_time_stamp value, we
64 * must make sure that the resulting initial timestamp is <= tcp_time_stamp. 64 * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
@@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
70 u32 options = 0; 70 u32 options = 0;
71 71
72 ireq = inet_rsk(req); 72 ireq = inet_rsk(req);
73 if (ireq->wscale_ok) { 73
74 options = ireq->snd_wscale; 74 options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
75 options |= ireq->rcv_wscale << 4; 75 options |= ireq->sack_ok << 4;
76 } 76 options |= ireq->ecn_ok << 5;
77 options |= ireq->sack_ok << 8;
78 77
79 ts = ts_now & ~TSMASK; 78 ts = ts_now & ~TSMASK;
80 ts |= options; 79 ts |= options;
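
With TSBITS cut from 9 to 6, the SYN-ACK timestamp now encodes: bits 0-3 the peer's send window scale (0xf when window scaling was not offered), bit 4 SACK, bit 5 ECN. rcv_wscale no longer needs a slot because cookie_v4_check() below recomputes it deterministically via tcp_select_initial_window(). An encode sketch matching that layout (user-space, names illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define TSBITS 6
    #define TSMASK ((UINT32_C(1) << TSBITS) - 1)

    /* pack SYN options into the low 6 timestamp bits, per the new layout */
    static uint32_t cookie_ts(uint32_t ts_now, int wscale_ok,
                              unsigned snd_wscale, int sack_ok, int ecn_ok)
    {
        uint32_t options = wscale_ok ? (snd_wscale & 0xf) : 0xf;

        options |= (uint32_t)(sack_ok != 0) << 4;
        options |= (uint32_t)(ecn_ok != 0) << 5;
        return (ts_now & ~TSMASK) | options;
    }

    int main(void)
    {
        uint32_t ts = cookie_ts(0x12345678, 1, 7, 1, 0);

        printf("ts=%#x options=%#x\n", ts, ts & TSMASK);   /* options 0x17 */
        return 0;
    }
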
@@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
138} 137}
139 138
140/* 139/*
141 * This table has to be sorted and terminated with (__u16)-1. 140 * MSS Values are taken from the 2009 paper
142 * XXX generate a better table. 141 * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
143 * Unresolved Issues: HIPPI with a 64k MSS is not well supported. 142 * - values 1440 to 1460 accounted for 80% of observed mss values
143 * - values outside the 536-1460 range are rare (<0.2%).
144 *
145 * Table must be sorted.
144 */ 146 */
145static __u16 const msstab[] = { 147static __u16 const msstab[] = {
146 64 - 1, 148 64,
147 256 - 1, 149 512,
148 512 - 1, 150 536,
149 536 - 1, 151 1024,
150 1024 - 1, 152 1440,
151 1440 - 1, 153 1460,
152 1460 - 1, 154 4312,
153 4312 - 1, 155 8960,
154 (__u16)-1
155}; 156};
156/* The number doesn't include the -1 terminator */
157#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
158 157
159/* 158/*
160 * Generate a syncookie. mssp points to the mss, which is returned 159 * Generate a syncookie. mssp points to the mss, which is returned
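
The new msstab stores the MSS values themselves (the old entries were the value minus one, plus a -1 terminator), with entries chosen from the cited Alcock/Nelson measurements. Encoding walks the table downward for the largest entry not exceeding the peer's MSS, and decoding simply bounds-checks the recovered index against ARRAY_SIZE(msstab). A sketch of the lookup:

    #include <stdint.h>
    #include <stdio.h>

    static const uint16_t msstab[] =
        { 64, 512, 536, 1024, 1440, 1460, 4312, 8960 };
    #define ARRAY_SZ(a) (sizeof(a) / sizeof((a)[0]))

    /* largest table entry <= mss; index 0 is the fallback */
    static unsigned mss_to_index(unsigned mss)
    {
        unsigned mssind;

        for (mssind = ARRAY_SZ(msstab) - 1; mssind; mssind--)
            if (mss >= msstab[mssind])
                break;
        return mssind;
    }

    int main(void)
    {
        printf("1500 -> idx %u, mss %u\n", mss_to_index(1500),
               msstab[mss_to_index(1500)]);   /* idx 5, mss 1460 */
        printf("100  -> idx %u, mss %u\n", mss_to_index(100),
               msstab[mss_to_index(100)]);    /* idx 0, mss 64 */
        return 0;
    }
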
@@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
169 168
170 tcp_synq_overflow(sk); 169 tcp_synq_overflow(sk);
171 170
172 /* XXX sort msstab[] by probability? Binary search? */ 171 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
173 for (mssind = 0; mss > msstab[mssind + 1]; mssind++) 172 if (mss >= msstab[mssind])
174 ; 173 break;
175 *mssp = msstab[mssind] + 1; 174 *mssp = msstab[mssind];
176 175
177 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); 176 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
178 177
@@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
202 jiffies / (HZ * 60), 201 jiffies / (HZ * 60),
203 COUNTER_TRIES); 202 COUNTER_TRIES);
204 203
205 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 204 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
206} 205}
207 206
208static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 207static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
227 * additional tcp options in the timestamp. 226 * additional tcp options in the timestamp.
228 * This extracts these options from the timestamp echo. 227 * This extracts these options from the timestamp echo.
229 * 228 *
230 * The lowest 4 bits are for snd_wscale 229 * The lowest 4 bits store snd_wscale.
231 * The next 4 lsb are for rcv_wscale 230 * next 2 bits indicate SACK and ECN support.
232 * The next lsb is for sack_ok 231 *
232 * Return false if we decode an option that should not be present.
233 */ 233 */
234void cookie_check_timestamp(struct tcp_options_received *tcp_opt) 234bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
235{ 235{
236 /* echoed timestamp, 9 lowest bits contain options */ 236 /* echoed timestamp, lowest bits contain options */
237 u32 options = tcp_opt->rcv_tsecr & TSMASK; 237 u32 options = tcp_opt->rcv_tsecr & TSMASK;
238 238
239 tcp_opt->snd_wscale = options & 0xf; 239 if (!tcp_opt->saw_tstamp) {
240 options >>= 4; 240 tcp_clear_options(tcp_opt);
241 tcp_opt->rcv_wscale = options & 0xf; 241 return true;
242 }
243
244 if (!sysctl_tcp_timestamps)
245 return false;
242 246
243 tcp_opt->sack_ok = (options >> 4) & 0x1; 247 tcp_opt->sack_ok = (options >> 4) & 0x1;
248 *ecn_ok = (options >> 5) & 1;
249 if (*ecn_ok && !sysctl_tcp_ecn)
250 return false;
251
252 if (tcp_opt->sack_ok && !sysctl_tcp_sack)
253 return false;
244 254
245 if (tcp_opt->sack_ok) 255 if ((options & 0xf) == 0xf)
246 tcp_sack_reset(tcp_opt); 256 return true; /* no window scaling */
247 257
248 if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) 258 tcp_opt->wscale_ok = 1;
249 tcp_opt->wscale_ok = 1; 259 tcp_opt->snd_wscale = options & 0xf;
260 return sysctl_tcp_window_scaling != 0;
250} 261}
251EXPORT_SYMBOL(cookie_check_timestamp); 262EXPORT_SYMBOL(cookie_check_timestamp);
252 263
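
cookie_check_timestamp() now validates as well as decodes: if no timestamp was echoed the options are cleared and the cookie is still acceptable, while each decoded option is rejected when the corresponding sysctl (timestamps, ECN, SACK, window scaling) is off, and 0xf in the low nibble marks "no window scaling". A user-space decode sketch with the sysctls passed as plain flags; it assumes a timestamp was seen, skipping the clear-and-accept branch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TSMASK 0x3fu    /* low 6 bits, matching TSBITS = 6 */

    struct opts { bool sack_ok, wscale_ok; unsigned snd_wscale; };

    /* decode options from the echoed timestamp; sys_* flags mimic sysctls */
    static bool check_ts(uint32_t tsecr, struct opts *o, bool *ecn_ok,
                         bool sys_ts, bool sys_ecn, bool sys_sack, bool sys_ws)
    {
        uint32_t options = tsecr & TSMASK;

        if (!sys_ts)
            return false;

        o->sack_ok = (options >> 4) & 1;
        *ecn_ok    = (options >> 5) & 1;
        if (*ecn_ok && !sys_ecn)
            return false;
        if (o->sack_ok && !sys_sack)
            return false;

        if ((options & 0xf) == 0xf)
            return true;                /* 0xf: no window scaling negotiated */
        o->wscale_ok  = true;
        o->snd_wscale = options & 0xf;
        return sys_ws;
    }

    int main(void)
    {
        struct opts o = { 0 };
        bool ecn = false;
        bool ok = check_ts(0x12345657, &o, &ecn, true, true, true, true);

        printf("ok=%d sack=%d wscale=%u ecn=%d\n",
               ok, o.sack_ok, o.snd_wscale, ecn);  /* ok=1 sack=1 wscale=7 */
        return 0;
    }
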
@@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
265 int mss; 276 int mss;
266 struct rtable *rt; 277 struct rtable *rt;
267 __u8 rcv_wscale; 278 __u8 rcv_wscale;
279 bool ecn_ok;
268 280
269 if (!sysctl_tcp_syncookies || !th->ack) 281 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
270 goto out; 282 goto out;
271 283
272 if (tcp_synq_no_recent_overflow(sk) || 284 if (tcp_synq_no_recent_overflow(sk) ||
@@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
281 memset(&tcp_opt, 0, sizeof(tcp_opt)); 293 memset(&tcp_opt, 0, sizeof(tcp_opt));
282 tcp_parse_options(skb, &tcp_opt, &hash_location, 0); 294 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
283 295
284 if (tcp_opt.saw_tstamp) 296 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
285 cookie_check_timestamp(&tcp_opt); 297 goto out;
286 298
287 ret = NULL; 299 ret = NULL;
288 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ 300 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
298 ireq->rmt_port = th->source; 310 ireq->rmt_port = th->source;
299 ireq->loc_addr = ip_hdr(skb)->daddr; 311 ireq->loc_addr = ip_hdr(skb)->daddr;
300 ireq->rmt_addr = ip_hdr(skb)->saddr; 312 ireq->rmt_addr = ip_hdr(skb)->saddr;
301 ireq->ecn_ok = 0; 313 ireq->ecn_ok = ecn_ok;
302 ireq->snd_wscale = tcp_opt.snd_wscale; 314 ireq->snd_wscale = tcp_opt.snd_wscale;
303 ireq->rcv_wscale = tcp_opt.rcv_wscale;
304 ireq->sack_ok = tcp_opt.sack_ok; 315 ireq->sack_ok = tcp_opt.sack_ok;
305 ireq->wscale_ok = tcp_opt.wscale_ok; 316 ireq->wscale_ok = tcp_opt.wscale_ok;
306 ireq->tstamp_ok = tcp_opt.saw_tstamp; 317 ireq->tstamp_ok = tcp_opt.saw_tstamp;
@@ -347,22 +358,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
347 { .sport = th->dest, 358 { .sport = th->dest,
348 .dport = th->source } } }; 359 .dport = th->source } } };
349 security_req_classify_flow(req, &fl); 360 security_req_classify_flow(req, &fl);
350 if (ip_route_output_key(&init_net, &rt, &fl)) { 361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
351 reqsk_free(req); 362 reqsk_free(req);
352 goto out; 363 goto out;
353 } 364 }
354 } 365 }
355 366
356 /* Try to redo what tcp_v4_send_synack did. */ 367 /* Try to redo what tcp_v4_send_synack did. */
357 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); 368 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
358 369
359 tcp_select_initial_window(tcp_full_space(sk), req->mss, 370 tcp_select_initial_window(tcp_full_space(sk), req->mss,
360 &req->rcv_wnd, &req->window_clamp, 371 &req->rcv_wnd, &req->window_clamp,
361 ireq->wscale_ok, &rcv_wscale, 372 ireq->wscale_ok, &rcv_wscale,
362 dst_metric(&rt->u.dst, RTAX_INITRWND)); 373 dst_metric(&rt->dst, RTAX_INITRWND));
363 374
364 ireq->rcv_wscale = rcv_wscale; 375 ireq->rcv_wscale = rcv_wscale;
365 376
366 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 377 ret = get_cookie_sock(sk, skb, req, &rt->dst);
367out: return ret; 378out: return ret;
368} 379}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6596b4feeddc..86b9f67abede 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -315,7 +315,6 @@ struct tcp_splice_state {
315 * is strict, actions are advisory and have some latency. 315 * is strict, actions are advisory and have some latency.
316 */ 316 */
317int tcp_memory_pressure __read_mostly; 317int tcp_memory_pressure __read_mostly;
318
319EXPORT_SYMBOL(tcp_memory_pressure); 318EXPORT_SYMBOL(tcp_memory_pressure);
320 319
321void tcp_enter_memory_pressure(struct sock *sk) 320void tcp_enter_memory_pressure(struct sock *sk)
@@ -325,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
325 tcp_memory_pressure = 1; 324 tcp_memory_pressure = 1;
326 } 325 }
327} 326}
328
329EXPORT_SYMBOL(tcp_enter_memory_pressure); 327EXPORT_SYMBOL(tcp_enter_memory_pressure);
330 328
331/* Convert seconds to retransmits based on initial and max timeout */ 329/* Convert seconds to retransmits based on initial and max timeout */
@@ -460,6 +458,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
460 } 458 }
461 return mask; 459 return mask;
462} 460}
461EXPORT_SYMBOL(tcp_poll);
463 462
464int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 463int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
465{ 464{
@@ -508,10 +507,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
508 507
509 return put_user(answ, (int __user *)arg); 508 return put_user(answ, (int __user *)arg);
510} 509}
510EXPORT_SYMBOL(tcp_ioctl);
511 511
512static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 512static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
513{ 513{
514 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 514 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
515 tp->pushed_seq = tp->write_seq; 515 tp->pushed_seq = tp->write_seq;
516} 516}
517 517
@@ -527,7 +527,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
527 527
528 skb->csum = 0; 528 skb->csum = 0;
529 tcb->seq = tcb->end_seq = tp->write_seq; 529 tcb->seq = tcb->end_seq = tp->write_seq;
530 tcb->flags = TCPCB_FLAG_ACK; 530 tcb->flags = TCPHDR_ACK;
531 tcb->sacked = 0; 531 tcb->sacked = 0;
532 skb_header_release(skb); 532 skb_header_release(skb);
533 tcp_add_write_queue_tail(sk, skb); 533 tcp_add_write_queue_tail(sk, skb);
@@ -608,6 +608,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
608 ssize_t spliced; 608 ssize_t spliced;
609 int ret; 609 int ret;
610 610
611 sock_rps_record_flow(sk);
611 /* 612 /*
612 * We can't seek on a socket input 613 * We can't seek on a socket input
613 */ 614 */
@@ -675,6 +676,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
675 676
676 return ret; 677 return ret;
677} 678}
679EXPORT_SYMBOL(tcp_splice_read);
678 680
679struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) 681struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
680{ 682{
@@ -815,7 +817,7 @@ new_segment:
815 skb_shinfo(skb)->gso_segs = 0; 817 skb_shinfo(skb)->gso_segs = 0;
816 818
817 if (!copied) 819 if (!copied)
818 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 820 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
819 821
820 copied += copy; 822 copied += copy;
821 poffset += copy; 823 poffset += copy;
@@ -856,15 +858,15 @@ out_err:
856 return sk_stream_error(sk, flags, err); 858 return sk_stream_error(sk, flags, err);
857} 859}
858 860
859ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, 861int tcp_sendpage(struct sock *sk, struct page *page, int offset,
860 size_t size, int flags) 862 size_t size, int flags)
861{ 863{
862 ssize_t res; 864 ssize_t res;
863 struct sock *sk = sock->sk;
864 865
865 if (!(sk->sk_route_caps & NETIF_F_SG) || 866 if (!(sk->sk_route_caps & NETIF_F_SG) ||
866 !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) 867 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
867 return sock_no_sendpage(sock, page, offset, size, flags); 868 return sock_no_sendpage(sk->sk_socket, page, offset, size,
869 flags);
868 870
869 lock_sock(sk); 871 lock_sock(sk);
870 TCP_CHECK_TIMER(sk); 872 TCP_CHECK_TIMER(sk);
@@ -873,6 +875,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
873 release_sock(sk); 875 release_sock(sk);
874 return res; 876 return res;
875} 877}
878EXPORT_SYMBOL(tcp_sendpage);
876 879
877#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 880#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
878#define TCP_OFF(sk) (sk->sk_sndmsg_off) 881#define TCP_OFF(sk) (sk->sk_sndmsg_off)
@@ -897,10 +900,9 @@ static inline int select_size(struct sock *sk, int sg)
897 return tmp; 900 return tmp;
898} 901}
899 902
900int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 903int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
901 size_t size) 904 size_t size)
902{ 905{
903 struct sock *sk = sock->sk;
904 struct iovec *iov; 906 struct iovec *iov;
905 struct tcp_sock *tp = tcp_sk(sk); 907 struct tcp_sock *tp = tcp_sk(sk);
906 struct sk_buff *skb; 908 struct sk_buff *skb;
@@ -1061,7 +1063,7 @@ new_segment:
1061 } 1063 }
1062 1064
1063 if (!copied) 1065 if (!copied)
1064 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 1066 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1065 1067
1066 tp->write_seq += copy; 1068 tp->write_seq += copy;
1067 TCP_SKB_CB(skb)->end_seq += copy; 1069 TCP_SKB_CB(skb)->end_seq += copy;
@@ -1121,6 +1123,7 @@ out_err:
1121 release_sock(sk); 1123 release_sock(sk);
1122 return err; 1124 return err;
1123} 1125}
1126EXPORT_SYMBOL(tcp_sendmsg);
1124 1127
1125/* 1128/*
1126 * Handle reading urgent data. BSD has very simple semantics for 1129 * Handle reading urgent data. BSD has very simple semantics for
@@ -1380,6 +1383,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1380 tcp_cleanup_rbuf(sk, copied); 1383 tcp_cleanup_rbuf(sk, copied);
1381 return copied; 1384 return copied;
1382} 1385}
1386EXPORT_SYMBOL(tcp_read_sock);
1383 1387
1384/* 1388/*
1385 * This routine copies from a sock struct into the user buffer. 1389 * This routine copies from a sock struct into the user buffer.
@@ -1774,6 +1778,7 @@ recv_urg:
1774 err = tcp_recv_urg(sk, msg, len, flags); 1778 err = tcp_recv_urg(sk, msg, len, flags);
1775 goto out; 1779 goto out;
1776} 1780}
1781EXPORT_SYMBOL(tcp_recvmsg);
1777 1782
1778void tcp_set_state(struct sock *sk, int state) 1783void tcp_set_state(struct sock *sk, int state)
1779{ 1784{
@@ -1866,6 +1871,7 @@ void tcp_shutdown(struct sock *sk, int how)
1866 tcp_send_fin(sk); 1871 tcp_send_fin(sk);
1867 } 1872 }
1868} 1873}
1874EXPORT_SYMBOL(tcp_shutdown);
1869 1875
1870void tcp_close(struct sock *sk, long timeout) 1876void tcp_close(struct sock *sk, long timeout)
1871{ 1877{
@@ -1898,6 +1904,10 @@ void tcp_close(struct sock *sk, long timeout)
1898 1904
1899 sk_mem_reclaim(sk); 1905 sk_mem_reclaim(sk);
1900 1906
1907 /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
1908 if (sk->sk_state == TCP_CLOSE)
1909 goto adjudge_to_death;
1910
1901 /* As outlined in RFC 2525, section 2.17, we send a RST here because 1911 /* As outlined in RFC 2525, section 2.17, we send a RST here because
1902 * data was lost. To witness the awful effects of the old behavior of 1912 * data was lost. To witness the awful effects of the old behavior of
1903 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 1913 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -2025,6 +2035,7 @@ out:
2025 local_bh_enable(); 2035 local_bh_enable();
2026 sock_put(sk); 2036 sock_put(sk);
2027} 2037}
2038EXPORT_SYMBOL(tcp_close);
2028 2039
2029/* These states need RST on ABORT according to RFC793 */ 2040/* These states need RST on ABORT according to RFC793 */
2030 2041
@@ -2098,6 +2109,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2098 sk->sk_error_report(sk); 2109 sk->sk_error_report(sk);
2099 return err; 2110 return err;
2100} 2111}
2112EXPORT_SYMBOL(tcp_disconnect);
2101 2113
2102/* 2114/*
2103 * Socket option code for TCP. 2115 * Socket option code for TCP.
@@ -2396,6 +2408,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2396 optval, optlen); 2408 optval, optlen);
2397 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2409 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2398} 2410}
2411EXPORT_SYMBOL(tcp_setsockopt);
2399 2412
2400#ifdef CONFIG_COMPAT 2413#ifdef CONFIG_COMPAT
2401int compat_tcp_setsockopt(struct sock *sk, int level, int optname, 2414int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2406,7 +2419,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2406 optval, optlen); 2419 optval, optlen);
2407 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2420 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2408} 2421}
2409
2410EXPORT_SYMBOL(compat_tcp_setsockopt); 2422EXPORT_SYMBOL(compat_tcp_setsockopt);
2411#endif 2423#endif
2412 2424
@@ -2472,7 +2484,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2472 2484
2473 info->tcpi_total_retrans = tp->total_retrans; 2485 info->tcpi_total_retrans = tp->total_retrans;
2474} 2486}
2475
2476EXPORT_SYMBOL_GPL(tcp_get_info); 2487EXPORT_SYMBOL_GPL(tcp_get_info);
2477 2488
2478static int do_tcp_getsockopt(struct sock *sk, int level, 2489static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2611,6 +2622,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2611 optval, optlen); 2622 optval, optlen);
2612 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2623 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2613} 2624}
2625EXPORT_SYMBOL(tcp_getsockopt);
2614 2626
2615#ifdef CONFIG_COMPAT 2627#ifdef CONFIG_COMPAT
2616int compat_tcp_getsockopt(struct sock *sk, int level, int optname, 2628int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2621,7 +2633,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2621 optval, optlen); 2633 optval, optlen);
2622 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2634 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2623} 2635}
2624
2625EXPORT_SYMBOL(compat_tcp_getsockopt); 2636EXPORT_SYMBOL(compat_tcp_getsockopt);
2626#endif 2637#endif
2627 2638
@@ -2858,7 +2869,6 @@ void tcp_free_md5sig_pool(void)
2858 if (pool) 2869 if (pool)
2859 __tcp_free_md5sig_pool(pool); 2870 __tcp_free_md5sig_pool(pool);
2860} 2871}
2861
2862EXPORT_SYMBOL(tcp_free_md5sig_pool); 2872EXPORT_SYMBOL(tcp_free_md5sig_pool);
2863 2873
2864static struct tcp_md5sig_pool * __percpu * 2874static struct tcp_md5sig_pool * __percpu *
@@ -2934,7 +2944,6 @@ retry:
2934 } 2944 }
2935 return pool; 2945 return pool;
2936} 2946}
2937
2938EXPORT_SYMBOL(tcp_alloc_md5sig_pool); 2947EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2939 2948
2940 2949
@@ -2958,7 +2967,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
2958 spin_unlock(&tcp_md5sig_pool_lock); 2967 spin_unlock(&tcp_md5sig_pool_lock);
2959 2968
2960 if (p) 2969 if (p)
2961 return *per_cpu_ptr(p, smp_processor_id()); 2970 return *this_cpu_ptr(p);
2962 2971
2963 local_bh_enable(); 2972 local_bh_enable();
2964 return NULL; 2973 return NULL;
@@ -2986,7 +2995,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2986 th->check = old_checksum; 2995 th->check = old_checksum;
2987 return err; 2996 return err;
2988} 2997}
2989
2990EXPORT_SYMBOL(tcp_md5_hash_header); 2998EXPORT_SYMBOL(tcp_md5_hash_header);
2991 2999
2992int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3000int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -2999,6 +3007,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2999 const unsigned head_data_len = skb_headlen(skb) > header_len ? 3007 const unsigned head_data_len = skb_headlen(skb) > header_len ?
3000 skb_headlen(skb) - header_len : 0; 3008 skb_headlen(skb) - header_len : 0;
3001 const struct skb_shared_info *shi = skb_shinfo(skb); 3009 const struct skb_shared_info *shi = skb_shinfo(skb);
3010 struct sk_buff *frag_iter;
3002 3011
3003 sg_init_table(&sg, 1); 3012 sg_init_table(&sg, 1);
3004 3013
@@ -3013,9 +3022,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3013 return 1; 3022 return 1;
3014 } 3023 }
3015 3024
3025 skb_walk_frags(skb, frag_iter)
3026 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3027 return 1;
3028
3016 return 0; 3029 return 0;
3017} 3030}
3018
3019EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3031EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3020 3032
3021int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) 3033int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
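
The skb_walk_frags() hunk above closes a hole in TCP-MD5 hashing: only the linear head and the page frags used to be hashed, so data sitting in a chained skb (the frag_list, e.g. after GRO) was silently skipped. The recursion pattern, reduced to a plain linked list with a toy hash:

    #include <stdio.h>

    struct skb_stub {                  /* illustrative, not the real sk_buff */
        const char *data;              /* linear head */
        struct skb_stub *next;         /* sibling within a frag list */
        struct skb_stub *frag_list;    /* head of chained fragments */
    };

    /* hash every segment, recursing like the added skb_walk_frags() loop
     * so a fragment's own frag_list is covered as well */
    static int hash_all(const struct skb_stub *skb, unsigned *acc)
    {
        for (const char *p = skb->data; *p; p++)
            *acc = *acc * 31 + (unsigned char)*p;   /* toy hash step */

        for (const struct skb_stub *f = skb->frag_list; f; f = f->next)
            if (hash_all(f, acc))
                return 1;
        return 0;
    }

    int main(void)
    {
        struct skb_stub tail = { "world", NULL, NULL };
        struct skb_stub head = { "hello ", NULL, &tail };
        unsigned acc = 0;

        hash_all(&head, &acc);
        printf("%u\n", acc);
        return 0;
    }
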
@@ -3025,7 +3037,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
3025 sg_init_one(&sg, key->key, key->keylen); 3037 sg_init_one(&sg, key->key, key->keylen);
3026 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); 3038 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
3027} 3039}
3028
3029EXPORT_SYMBOL(tcp_md5_hash_key); 3040EXPORT_SYMBOL(tcp_md5_hash_key);
3030 3041
3031#endif 3042#endif
@@ -3297,16 +3308,3 @@ void __init tcp_init(void)
3297 tcp_secret_retiring = &tcp_secret_two; 3308 tcp_secret_retiring = &tcp_secret_two;
3298 tcp_secret_secondary = &tcp_secret_two; 3309 tcp_secret_secondary = &tcp_secret_two;
3299} 3310}
3300
3301EXPORT_SYMBOL(tcp_close);
3302EXPORT_SYMBOL(tcp_disconnect);
3303EXPORT_SYMBOL(tcp_getsockopt);
3304EXPORT_SYMBOL(tcp_ioctl);
3305EXPORT_SYMBOL(tcp_poll);
3306EXPORT_SYMBOL(tcp_read_sock);
3307EXPORT_SYMBOL(tcp_recvmsg);
3308EXPORT_SYMBOL(tcp_sendmsg);
3309EXPORT_SYMBOL(tcp_splice_read);
3310EXPORT_SYMBOL(tcp_sendpage);
3311EXPORT_SYMBOL(tcp_setsockopt);
3312EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index c209e054a634..377bc9349371 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
126 * calculate 2^fract in a <<7 value. 126 * calculate 2^fract in a <<7 value.
127 */ 127 */
128 is_slowstart = 1; 128 is_slowstart = 1;
129 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) 129 increment = ((1 << min(ca->rho, 16U)) *
130 - 128; 130 hybla_fraction(rho_fractions)) - 128;
131 } else { 131 } else {
132 /* 132 /*
133 * congestion avoidance 133 * congestion avoidance
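
Hybla's slow-start increment scales 2^rho by a <<7 fractional factor; rho is derived from the RTT ratio and can exceed 31, at which point 1 << ca->rho on a 32-bit int is undefined. Clamping the shift to 16 keeps the product comfortably inside 32 bits while still permitting a very large increment. The fixed arithmetic in a standalone sketch (frac128 stands in for the <<7 hybla_fraction() result):

    #include <stdio.h>

    static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

    /* slow-start increment with the shift clamped to 16: an unclamped
     * 1 << rho would be undefined once rho exceeds 31 */
    static unsigned ss_increment(unsigned rho, unsigned frac128)
    {
        return ((1u << min_u(rho, 16)) * frac128) - 128;
    }

    int main(void)
    {
        printf("%u\n", ss_increment(3, 128));    /* 8 * 128 - 128 -> 896 */
        printf("%u\n", ss_increment(40, 128));   /* clamped -> 8388480 */
        return 0;
    }
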
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e6dafcb1071..3c426cb318e7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
78int sysctl_tcp_sack __read_mostly = 1; 78int sysctl_tcp_sack __read_mostly = 1;
79int sysctl_tcp_fack __read_mostly = 1; 79int sysctl_tcp_fack __read_mostly = 1;
80int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 80int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
81EXPORT_SYMBOL(sysctl_tcp_reordering);
81int sysctl_tcp_ecn __read_mostly = 2; 82int sysctl_tcp_ecn __read_mostly = 2;
83EXPORT_SYMBOL(sysctl_tcp_ecn);
82int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
83int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
84int sysctl_tcp_adv_win_scale __read_mostly = 2; 86int sysctl_tcp_adv_win_scale __read_mostly = 2;
87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
85 88
86int sysctl_tcp_stdurg __read_mostly; 89int sysctl_tcp_stdurg __read_mostly;
87int sysctl_tcp_rfc1337 __read_mostly; 90int sysctl_tcp_rfc1337 __read_mostly;
@@ -419,6 +422,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
419 422
420 inet_csk(sk)->icsk_ack.rcv_mss = hint; 423 inet_csk(sk)->icsk_ack.rcv_mss = hint;
421} 424}
425EXPORT_SYMBOL(tcp_initialize_rcv_mss);
422 426
423/* Receiver "autotuning" code. 427/* Receiver "autotuning" code.
424 * 428 *
@@ -2639,7 +2643,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2639 if (sk->sk_family == AF_INET) { 2643 if (sk->sk_family == AF_INET) {
2640 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2644 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2641 msg, 2645 msg,
2642 &inet->daddr, ntohs(inet->dport), 2646 &inet->inet_daddr, ntohs(inet->inet_dport),
2643 tp->snd_cwnd, tcp_left_out(tp), 2647 tp->snd_cwnd, tcp_left_out(tp),
2644 tp->snd_ssthresh, tp->prior_ssthresh, 2648 tp->snd_ssthresh, tp->prior_ssthresh,
2645 tp->packets_out); 2649 tp->packets_out);
@@ -2649,7 +2653,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2649 struct ipv6_pinfo *np = inet6_sk(sk); 2653 struct ipv6_pinfo *np = inet6_sk(sk);
2650 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2654 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2651 msg, 2655 msg,
2652 &np->daddr, ntohs(inet->dport), 2656 &np->daddr, ntohs(inet->inet_dport),
2653 tp->snd_cwnd, tcp_left_out(tp), 2657 tp->snd_cwnd, tcp_left_out(tp),
2654 tp->snd_ssthresh, tp->prior_ssthresh, 2658 tp->snd_ssthresh, tp->prior_ssthresh,
2655 tp->packets_out); 2659 tp->packets_out);
@@ -2938,6 +2942,7 @@ void tcp_simple_retransmit(struct sock *sk)
2938 } 2942 }
2939 tcp_xmit_retransmit_queue(sk); 2943 tcp_xmit_retransmit_queue(sk);
2940} 2944}
2945EXPORT_SYMBOL(tcp_simple_retransmit);
2941 2946
2942/* Process an event, which can update packets-in-flight not trivially. 2947/* Process an event, which can update packets-in-flight not trivially.
2943 * Main goal of this function is to calculate new estimate for left_out, 2948 * Main goal of this function is to calculate new estimate for left_out,
@@ -3286,7 +3291,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3286 * connection startup slow start one packet too 3291 * connection startup slow start one packet too
3287 * quickly. This is severely frowned upon behavior. 3292 * quickly. This is severely frowned upon behavior.
3288 */ 3293 */
3289 if (!(scb->flags & TCPCB_FLAG_SYN)) { 3294 if (!(scb->flags & TCPHDR_SYN)) {
3290 flag |= FLAG_DATA_ACKED; 3295 flag |= FLAG_DATA_ACKED;
3291 } else { 3296 } else {
3292 flag |= FLAG_SYN_ACKED; 3297 flag |= FLAG_SYN_ACKED;
@@ -3858,6 +3863,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3858 } 3863 }
3859 } 3864 }
3860} 3865}
3866EXPORT_SYMBOL(tcp_parse_options);
3861 3867
3862static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) 3868static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3863{ 3869{
@@ -3931,6 +3937,7 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
3931 } 3937 }
3932 return NULL; 3938 return NULL;
3933} 3939}
3940EXPORT_SYMBOL(tcp_parse_md5sig_option);
3934#endif 3941#endif
3935 3942
3936static inline void tcp_store_ts_recent(struct tcp_sock *tp) 3943static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -5432,6 +5439,7 @@ discard:
5432 __kfree_skb(skb); 5439 __kfree_skb(skb);
5433 return 0; 5440 return 0;
5434} 5441}
5442EXPORT_SYMBOL(tcp_rcv_established);
5435 5443
5436static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5444static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5437 struct tcphdr *th, unsigned len) 5445 struct tcphdr *th, unsigned len)
@@ -5931,14 +5939,4 @@ discard:
5931 } 5939 }
5932 return 0; 5940 return 0;
5933} 5941}
5934
5935EXPORT_SYMBOL(sysctl_tcp_ecn);
5936EXPORT_SYMBOL(sysctl_tcp_reordering);
5937EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
5938EXPORT_SYMBOL(tcp_parse_options);
5939#ifdef CONFIG_TCP_MD5SIG
5940EXPORT_SYMBOL(tcp_parse_md5sig_option);
5941#endif
5942EXPORT_SYMBOL(tcp_rcv_established);
5943EXPORT_SYMBOL(tcp_rcv_state_process); 5942EXPORT_SYMBOL(tcp_rcv_state_process);
5944EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 202cf09c4cd4..020766292bb0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,6 +84,7 @@
84 84
85int sysctl_tcp_tw_reuse __read_mostly; 85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly; 86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
87 88
88 89
89#ifdef CONFIG_TCP_MD5SIG 90#ifdef CONFIG_TCP_MD5SIG
@@ -100,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
100#endif 101#endif
101 102
102struct inet_hashinfo tcp_hashinfo; 103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
103 105
104static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
105{ 107{
@@ -139,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
139 141
140 return 0; 142 return 0;
141} 143}
142
143EXPORT_SYMBOL_GPL(tcp_twsk_unique); 144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
144 145
145/* This will initiate an outgoing connection. */ 146/* This will initiate an outgoing connection. */
@@ -204,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
204 * TIME-WAIT * and initialize rx_opt.ts_recent from it, 205 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
205 * when trying new connection. 206 * when trying new connection.
206 */ 207 */
207 if (peer != NULL && 208 if (peer) {
208 (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { 209 inet_peer_refcheck(peer);
209 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 210 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
210 tp->rx_opt.ts_recent = peer->tcp_ts; 211 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212 tp->rx_opt.ts_recent = peer->tcp_ts;
213 }
211 } 214 }
212 } 215 }
213 216
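The added inet_peer_refcheck() is a sanity check on the peer's refcount under the reworked inetpeer code; the surrounding logic is unchanged: cached per-peer timestamp state is adopted only while it is fresh enough to matter for PAWS. A standalone sketch of that test, with a simplified stand-in for struct inet_peer and TCP_PAWS_MSL mirroring the kernel's 60 seconds:

#include <stdint.h>
#include <time.h>

#define TCP_PAWS_MSL 60			/* seconds */

struct peer_cache {			/* simplified inet_peer stand-in */
	uint32_t tcp_ts_stamp;		/* seconds when tcp_ts was recorded */
	uint32_t tcp_ts;		/* last timestamp seen from this peer */
};

/* Returns 1 and fills ts_recent/ts_stamp only if the cache is fresh. */
static int adopt_peer_timestamps(const struct peer_cache *peer,
				 uint32_t *ts_recent, uint32_t *ts_stamp)
{
	uint32_t now = (uint32_t)time(NULL);

	if (!peer || now - peer->tcp_ts_stamp > TCP_PAWS_MSL)
		return 0;		/* too stale to matter for PAWS */
	*ts_stamp = peer->tcp_ts_stamp;
	*ts_recent = peer->tcp_ts;
	return 1;
}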
@@ -237,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
237 240
238 /* OK, now commit destination to socket. */ 241 /* OK, now commit destination to socket. */
239 sk->sk_gso_type = SKB_GSO_TCPV4; 242 sk->sk_gso_type = SKB_GSO_TCPV4;
240 sk_setup_caps(sk, &rt->u.dst); 243 sk_setup_caps(sk, &rt->dst);
241 244
242 if (!tp->write_seq) 245 if (!tp->write_seq)
243 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 246 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -265,6 +268,7 @@ failure:
265 inet->inet_dport = 0; 268 inet->inet_dport = 0;
266 return err; 269 return err;
267} 270}
271EXPORT_SYMBOL(tcp_v4_connect);
268 272
269/* 273/*
270 * This routine does path mtu discovery as defined in RFC1191. 274 * This routine does path mtu discovery as defined in RFC1191.
@@ -543,6 +547,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
543 547
544 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 548 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
545} 549}
550EXPORT_SYMBOL(tcp_v4_send_check);
546 551
547int tcp_v4_gso_send_check(struct sk_buff *skb) 552int tcp_v4_gso_send_check(struct sk_buff *skb)
548{ 553{
@@ -793,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
793 kfree(inet_rsk(req)->opt); 798 kfree(inet_rsk(req)->opt);
794} 799}
795 800
796#ifdef CONFIG_SYN_COOKIES 801static void syn_flood_warning(const struct sk_buff *skb)
797static void syn_flood_warning(struct sk_buff *skb)
798{ 802{
799 static unsigned long warntime; 803 const char *msg;
800 804
801 if (time_after(jiffies, (warntime + HZ * 60))) { 805#ifdef CONFIG_SYN_COOKIES
802 warntime = jiffies; 806 if (sysctl_tcp_syncookies)
803 printk(KERN_INFO 807 msg = "Sending cookies";
804 "possible SYN flooding on port %d. Sending cookies.\n", 808 else
805 ntohs(tcp_hdr(skb)->dest));
806 }
807}
808#endif 809#endif
810 msg = "Dropping request";
811
812 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
813 ntohs(tcp_hdr(skb)->dest), msg);
814}
809 815
810/* 816/*
811 * Save and compile IPv4 options into the request_sock if needed. 817 * Save and compile IPv4 options into the request_sock if needed.
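syn_flood_warning() no longer throttles itself and now reports whether cookies will be sent or the request dropped; rate limiting moves to the caller via net_ratelimit() (see the tcp_v4_conn_request() hunk below). The removed once-per-minute throttle amounted to this, in userspace terms:

#include <stdio.h>
#include <time.h>

static void throttled_warn(unsigned short port, const char *msg)
{
	static time_t warntime;		/* when we last printed */
	time_t now = time(NULL);

	if (warntime && now - warntime < 60)
		return;			/* at most one line per minute */
	warntime = now;
	fprintf(stderr, "possible SYN flooding on port %u. %s.\n", port, msg);
}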
@@ -857,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
857{ 863{
858 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); 864 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
859} 865}
860
861EXPORT_SYMBOL(tcp_v4_md5_lookup); 866EXPORT_SYMBOL(tcp_v4_md5_lookup);
862 867
863static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, 868static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -924,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
924 } 929 }
925 return 0; 930 return 0;
926} 931}
927
928EXPORT_SYMBOL(tcp_v4_md5_do_add); 932EXPORT_SYMBOL(tcp_v4_md5_do_add);
929 933
930static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, 934static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
@@ -962,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
962 } 966 }
963 return -ENOENT; 967 return -ENOENT;
964} 968}
965
966EXPORT_SYMBOL(tcp_v4_md5_do_del); 969EXPORT_SYMBOL(tcp_v4_md5_do_del);
967 970
968static void tcp_v4_clear_md5_list(struct sock *sk) 971static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -1135,7 +1138,6 @@ clear_hash_noput:
1135 memset(md5_hash, 0, 16); 1138 memset(md5_hash, 0, 16);
1136 return 1; 1139 return 1;
1137} 1140}
1138
1139EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1141EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1140 1142
1141static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) 1143static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1243,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1243 * evidently real one. 1245 * evidently real one.
1244 */ 1246 */
1245 if (inet_csk_reqsk_queue_is_full(sk) && !isn) { 1247 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1248 if (net_ratelimit())
1249 syn_flood_warning(skb);
1246#ifdef CONFIG_SYN_COOKIES 1250#ifdef CONFIG_SYN_COOKIES
1247 if (sysctl_tcp_syncookies) { 1251 if (sysctl_tcp_syncookies) {
1248 want_cookie = 1; 1252 want_cookie = 1;
@@ -1323,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1323 if (security_inet_conn_request(sk, skb, req)) 1327 if (security_inet_conn_request(sk, skb, req))
1324 goto drop_and_free; 1328 goto drop_and_free;
1325 1329
1326 if (!want_cookie) 1330 if (!want_cookie || tmp_opt.tstamp_ok)
1327 TCP_ECN_create_request(req, tcp_hdr(skb)); 1331 TCP_ECN_create_request(req, tcp_hdr(skb));
1328 1332
1329 if (want_cookie) { 1333 if (want_cookie) {
1330#ifdef CONFIG_SYN_COOKIES
1331 syn_flood_warning(skb);
1332 req->cookie_ts = tmp_opt.tstamp_ok;
1333#endif
1334 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1334 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1335 req->cookie_ts = tmp_opt.tstamp_ok;
1335 } else if (!isn) { 1336 } else if (!isn) {
1336 struct inet_peer *peer = NULL; 1337 struct inet_peer *peer = NULL;
1337 1338
@@ -1349,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1349 (dst = inet_csk_route_req(sk, req)) != NULL && 1350 (dst = inet_csk_route_req(sk, req)) != NULL &&
1350 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1351 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1351 peer->v4daddr == saddr) { 1352 peer->v4daddr == saddr) {
1353 inet_peer_refcheck(peer);
1352 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1354 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353 (s32)(peer->tcp_ts - req->ts_recent) > 1355 (s32)(peer->tcp_ts - req->ts_recent) >
1354 TCP_PAWS_WINDOW) { 1356 TCP_PAWS_WINDOW) {
@@ -1393,6 +1395,7 @@ drop_and_free:
1393drop: 1395drop:
1394 return 0; 1396 return 0;
1395} 1397}
1398EXPORT_SYMBOL(tcp_v4_conn_request);
1396 1399
1397 1400
1398/* 1401/*
@@ -1478,6 +1481,7 @@ exit:
1478 dst_release(dst); 1481 dst_release(dst);
1479 return NULL; 1482 return NULL;
1480} 1483}
1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1481 1485
1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) 1486static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483{ 1487{
@@ -1504,7 +1508,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1504 } 1508 }
1505 1509
1506#ifdef CONFIG_SYN_COOKIES 1510#ifdef CONFIG_SYN_COOKIES
1507 if (!th->rst && !th->syn && th->ack) 1511 if (!th->syn)
1508 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); 1512 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1509#endif 1513#endif
1510 return sk; 1514 return sk;
@@ -1555,6 +1559,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1555#endif 1559#endif
1556 1560
1557 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 sock_rps_save_rxhash(sk, skb->rxhash);
1558 TCP_CHECK_TIMER(sk); 1563 TCP_CHECK_TIMER(sk);
1559 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1564 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1560 rsk = sk; 1565 rsk = sk;
@@ -1579,7 +1584,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1579 } 1584 }
1580 return 0; 1585 return 0;
1581 } 1586 }
1582 } 1587 } else
1588 sock_rps_save_rxhash(sk, skb->rxhash);
1589
1583 1590
1584 TCP_CHECK_TIMER(sk); 1591 TCP_CHECK_TIMER(sk);
1585 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1592 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
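Together with the removal from tcp_v4_rcv() two hunks down, this moves the rxhash bookkeeping into tcp_v4_do_rcv(), so the hash is recorded on both the fast and slow paths, but only for segments a socket actually consumes. What sock_rps_save_rxhash() boils down to, as a toy with invented types:

#include <stdint.h>

struct toy_sock {
	uint32_t rxhash;		/* last good receive hash */
};

/* Cache the flow hash computed at receive time so RPS can steer later
 * packets of this flow without re-dissecting headers.
 */
static void save_rxhash(struct toy_sock *sk, uint32_t skb_rxhash)
{
	if (skb_rxhash && sk->rxhash != skb_rxhash)
		sk->rxhash = skb_rxhash;
}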
@@ -1604,6 +1611,7 @@ csum_err:
1604 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 1611 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1605 goto discard; 1612 goto discard;
1606} 1613}
1614EXPORT_SYMBOL(tcp_v4_do_rcv);
1607 1615
1608/* 1616/*
1609 * From tcp_input.c 1617 * From tcp_input.c
@@ -1672,8 +1680,6 @@ process:
1672 1680
1673 skb->dev = NULL; 1681 skb->dev = NULL;
1674 1682
1675 sock_rps_save_rxhash(sk, skb->rxhash);
1676
1677 bh_lock_sock_nested(sk); 1683 bh_lock_sock_nested(sk);
1678 ret = 0; 1684 ret = 0;
1679 if (!sock_owned_by_user(sk)) { 1685 if (!sock_owned_by_user(sk)) {
@@ -1792,6 +1798,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
1792 1798
1793 return 0; 1799 return 0;
1794} 1800}
1801EXPORT_SYMBOL(tcp_v4_remember_stamp);
1795 1802
1796int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1803int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1797{ 1804{
@@ -1831,6 +1838,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1831 .compat_getsockopt = compat_ip_getsockopt, 1838 .compat_getsockopt = compat_ip_getsockopt,
1832#endif 1839#endif
1833}; 1840};
1841EXPORT_SYMBOL(ipv4_specific);
1834 1842
1835#ifdef CONFIG_TCP_MD5SIG 1843#ifdef CONFIG_TCP_MD5SIG
1836static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1844static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -1959,7 +1967,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
1959 1967
1960 percpu_counter_dec(&tcp_sockets_allocated); 1968 percpu_counter_dec(&tcp_sockets_allocated);
1961} 1969}
1962
1963EXPORT_SYMBOL(tcp_v4_destroy_sock); 1970EXPORT_SYMBOL(tcp_v4_destroy_sock);
1964 1971
1965#ifdef CONFIG_PROC_FS 1972#ifdef CONFIG_PROC_FS
@@ -1977,6 +1984,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1977 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1984 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1978} 1985}
1979 1986
1987/*
1988 * Get the next listener socket following cur. If cur is NULL, get the first
1989 * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1990 * very first socket in the hash table is returned.
1991 */
1980static void *listening_get_next(struct seq_file *seq, void *cur) 1992static void *listening_get_next(struct seq_file *seq, void *cur)
1981{ 1993{
1982 struct inet_connection_sock *icsk; 1994 struct inet_connection_sock *icsk;
@@ -1987,14 +1999,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1987 struct net *net = seq_file_net(seq); 1999 struct net *net = seq_file_net(seq);
1988 2000
1989 if (!sk) { 2001 if (!sk) {
1990 st->bucket = 0; 2002 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1991 ilb = &tcp_hashinfo.listening_hash[0];
1992 spin_lock_bh(&ilb->lock); 2003 spin_lock_bh(&ilb->lock);
1993 sk = sk_nulls_head(&ilb->head); 2004 sk = sk_nulls_head(&ilb->head);
2005 st->offset = 0;
1994 goto get_sk; 2006 goto get_sk;
1995 } 2007 }
1996 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2008 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1997 ++st->num; 2009 ++st->num;
2010 ++st->offset;
1998 2011
1999 if (st->state == TCP_SEQ_STATE_OPENREQ) { 2012 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2000 struct request_sock *req = cur; 2013 struct request_sock *req = cur;
@@ -2009,6 +2022,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2009 } 2022 }
2010 req = req->dl_next; 2023 req = req->dl_next;
2011 } 2024 }
2025 st->offset = 0;
2012 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 2026 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2013 break; 2027 break;
2014get_req: 2028get_req:
@@ -2044,6 +2058,7 @@ start_req:
2044 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2058 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2045 } 2059 }
2046 spin_unlock_bh(&ilb->lock); 2060 spin_unlock_bh(&ilb->lock);
2061 st->offset = 0;
2047 if (++st->bucket < INET_LHTABLE_SIZE) { 2062 if (++st->bucket < INET_LHTABLE_SIZE) {
2048 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2063 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2049 spin_lock_bh(&ilb->lock); 2064 spin_lock_bh(&ilb->lock);
@@ -2057,7 +2072,12 @@ out:
2057 2072
2058static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2073static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059{ 2074{
2060 void *rc = listening_get_next(seq, NULL); 2075 struct tcp_iter_state *st = seq->private;
2076 void *rc;
2077
2078 st->bucket = 0;
2079 st->offset = 0;
2080 rc = listening_get_next(seq, NULL);
2061 2081
2062 while (rc && *pos) { 2082 while (rc && *pos) {
2063 rc = listening_get_next(seq, rc); 2083 rc = listening_get_next(seq, rc);
@@ -2072,13 +2092,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
2072 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2092 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2073} 2093}
2074 2094
2095/*
2096 * Get the first established socket, starting from the bucket given in st->bucket.
2097 * If st->bucket is zero, the very first socket in the hash is returned.
2098 */
2075static void *established_get_first(struct seq_file *seq) 2099static void *established_get_first(struct seq_file *seq)
2076{ 2100{
2077 struct tcp_iter_state *st = seq->private; 2101 struct tcp_iter_state *st = seq->private;
2078 struct net *net = seq_file_net(seq); 2102 struct net *net = seq_file_net(seq);
2079 void *rc = NULL; 2103 void *rc = NULL;
2080 2104
2081 for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { 2105 st->offset = 0;
2106 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2082 struct sock *sk; 2107 struct sock *sk;
2083 struct hlist_nulls_node *node; 2108 struct hlist_nulls_node *node;
2084 struct inet_timewait_sock *tw; 2109 struct inet_timewait_sock *tw;
@@ -2123,6 +2148,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
2123 struct net *net = seq_file_net(seq); 2148 struct net *net = seq_file_net(seq);
2124 2149
2125 ++st->num; 2150 ++st->num;
2151 ++st->offset;
2126 2152
2127 if (st->state == TCP_SEQ_STATE_TIME_WAIT) { 2153 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128 tw = cur; 2154 tw = cur;
@@ -2139,6 +2165,7 @@ get_tw:
2139 st->state = TCP_SEQ_STATE_ESTABLISHED; 2165 st->state = TCP_SEQ_STATE_ESTABLISHED;
2140 2166
2141 /* Look for next non empty bucket */ 2167 /* Look for next non empty bucket */
2168 st->offset = 0;
2142 while (++st->bucket <= tcp_hashinfo.ehash_mask && 2169 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2143 empty_bucket(st)) 2170 empty_bucket(st))
2144 ; 2171 ;
@@ -2166,7 +2193,11 @@ out:
2166 2193
2167static void *established_get_idx(struct seq_file *seq, loff_t pos) 2194static void *established_get_idx(struct seq_file *seq, loff_t pos)
2168{ 2195{
2169 void *rc = established_get_first(seq); 2196 struct tcp_iter_state *st = seq->private;
2197 void *rc;
2198
2199 st->bucket = 0;
2200 rc = established_get_first(seq);
2170 2201
2171 while (rc && pos) { 2202 while (rc && pos) {
2172 rc = established_get_next(seq, rc); 2203 rc = established_get_next(seq, rc);
@@ -2191,24 +2222,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2191 return rc; 2222 return rc;
2192} 2223}
2193 2224
2225static void *tcp_seek_last_pos(struct seq_file *seq)
2226{
2227 struct tcp_iter_state *st = seq->private;
2228 int offset = st->offset;
2229 int orig_num = st->num;
2230 void *rc = NULL;
2231
2232 switch (st->state) {
2233 case TCP_SEQ_STATE_OPENREQ:
2234 case TCP_SEQ_STATE_LISTENING:
2235 if (st->bucket >= INET_LHTABLE_SIZE)
2236 break;
2237 st->state = TCP_SEQ_STATE_LISTENING;
2238 rc = listening_get_next(seq, NULL);
2239 while (offset-- && rc)
2240 rc = listening_get_next(seq, rc);
2241 if (rc)
2242 break;
2243 st->bucket = 0;
2244 /* Fallthrough */
2245 case TCP_SEQ_STATE_ESTABLISHED:
2246 case TCP_SEQ_STATE_TIME_WAIT:
2247 st->state = TCP_SEQ_STATE_ESTABLISHED;
2248 if (st->bucket > tcp_hashinfo.ehash_mask)
2249 break;
2250 rc = established_get_first(seq);
2251 while (offset-- && rc)
2252 rc = established_get_next(seq, rc);
2253 }
2254
2255 st->num = orig_num;
2256
2257 return rc;
2258}
2259
2194static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2260static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2195{ 2261{
2196 struct tcp_iter_state *st = seq->private; 2262 struct tcp_iter_state *st = seq->private;
2263 void *rc;
2264
2265 if (*pos && *pos == st->last_pos) {
2266 rc = tcp_seek_last_pos(seq);
2267 if (rc)
2268 goto out;
2269 }
2270
2197 st->state = TCP_SEQ_STATE_LISTENING; 2271 st->state = TCP_SEQ_STATE_LISTENING;
2198 st->num = 0; 2272 st->num = 0;
2199 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2273 st->bucket = 0;
2274 st->offset = 0;
2275 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2276
2277out:
2278 st->last_pos = *pos;
2279 return rc;
2200} 2280}
2201 2281
2202static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2282static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2203{ 2283{
2284 struct tcp_iter_state *st = seq->private;
2204 void *rc = NULL; 2285 void *rc = NULL;
2205 struct tcp_iter_state *st;
2206 2286
2207 if (v == SEQ_START_TOKEN) { 2287 if (v == SEQ_START_TOKEN) {
2208 rc = tcp_get_idx(seq, 0); 2288 rc = tcp_get_idx(seq, 0);
2209 goto out; 2289 goto out;
2210 } 2290 }
2211 st = seq->private;
2212 2291
2213 switch (st->state) { 2292 switch (st->state) {
2214 case TCP_SEQ_STATE_OPENREQ: 2293 case TCP_SEQ_STATE_OPENREQ:
@@ -2216,6 +2295,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2216 rc = listening_get_next(seq, v); 2295 rc = listening_get_next(seq, v);
2217 if (!rc) { 2296 if (!rc) {
2218 st->state = TCP_SEQ_STATE_ESTABLISHED; 2297 st->state = TCP_SEQ_STATE_ESTABLISHED;
2298 st->bucket = 0;
2299 st->offset = 0;
2219 rc = established_get_first(seq); 2300 rc = established_get_first(seq);
2220 } 2301 }
2221 break; 2302 break;
@@ -2226,6 +2307,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2226 } 2307 }
2227out: 2308out:
2228 ++*pos; 2309 ++*pos;
2310 st->last_pos = *pos;
2229 return rc; 2311 return rc;
2230} 2312}
2231 2313
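Taken together, the iterator changes turn repeated reads of /proc/net/tcp from quadratic rescans into cheap resumes: st->bucket, st->offset and st->last_pos record where the previous read stopped, and tcp_seek_last_pos() fast-paths the common "continue where we left off" case while still falling back to a full skip-count walk. The idea in a self-contained toy (invented names, no locking):

#include <stdio.h>

#define NBUCKETS 4

static const char *buckets[NBUCKETS][3] = {
	{ "a", "b", NULL }, { NULL }, { "c", NULL }, { "d", "e", NULL },
};

struct iter {
	int bucket;	/* bucket of the last item returned */
	int offset;	/* index just past it within that bucket */
	int last_pos;	/* global position one past the last item */
};

static const char *next_item(struct iter *st, int pos)
{
	int b = 0, off = 0, skip = pos;

	if (pos && pos == st->last_pos) {	/* fast path: resume */
		b = st->bucket;
		off = st->offset;
		skip = 0;
	}
	for (; b < NBUCKETS; b++, off = 0)
		for (; buckets[b][off]; off++) {
			if (skip-- > 0)
				continue;	/* slow path: skip-count */
			st->bucket = b;
			st->offset = off + 1;
			st->last_pos = pos + 1;
			return buckets[b][off];
		}
	return NULL;
}

int main(void)
{
	struct iter st = { 0, 0, 0 };
	const char *s;
	int pos;

	for (pos = 0; (s = next_item(&st, pos)) != NULL; pos++)
		printf("%s\n", s);	/* prints a b c d e */
	return 0;
}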
@@ -2264,6 +2346,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
2264 2346
2265 s = ((struct seq_file *)file->private_data)->private; 2347 s = ((struct seq_file *)file->private_data)->private;
2266 s->family = afinfo->family; 2348 s->family = afinfo->family;
2349 s->last_pos = 0;
2267 return 0; 2350 return 0;
2268} 2351}
2269 2352
@@ -2287,11 +2370,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2287 rc = -ENOMEM; 2370 rc = -ENOMEM;
2288 return rc; 2371 return rc;
2289} 2372}
2373EXPORT_SYMBOL(tcp_proc_register);
2290 2374
2291void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) 2375void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2292{ 2376{
2293 proc_net_remove(net, afinfo->name); 2377 proc_net_remove(net, afinfo->name);
2294} 2378}
2379EXPORT_SYMBOL(tcp_proc_unregister);
2295 2380
2296static void get_openreq4(struct sock *sk, struct request_sock *req, 2381static void get_openreq4(struct sock *sk, struct request_sock *req,
2297 struct seq_file *f, int i, int uid, int *len) 2382 struct seq_file *f, int i, int uid, int *len)
@@ -2515,6 +2600,8 @@ struct proto tcp_prot = {
2515 .setsockopt = tcp_setsockopt, 2600 .setsockopt = tcp_setsockopt,
2516 .getsockopt = tcp_getsockopt, 2601 .getsockopt = tcp_getsockopt,
2517 .recvmsg = tcp_recvmsg, 2602 .recvmsg = tcp_recvmsg,
2603 .sendmsg = tcp_sendmsg,
2604 .sendpage = tcp_sendpage,
2518 .backlog_rcv = tcp_v4_do_rcv, 2605 .backlog_rcv = tcp_v4_do_rcv,
2519 .hash = inet_hash, 2606 .hash = inet_hash,
2520 .unhash = inet_unhash, 2607 .unhash = inet_unhash,
@@ -2533,11 +2620,13 @@ struct proto tcp_prot = {
2533 .twsk_prot = &tcp_timewait_sock_ops, 2620 .twsk_prot = &tcp_timewait_sock_ops,
2534 .rsk_prot = &tcp_request_sock_ops, 2621 .rsk_prot = &tcp_request_sock_ops,
2535 .h.hashinfo = &tcp_hashinfo, 2622 .h.hashinfo = &tcp_hashinfo,
2623 .no_autobind = true,
2536#ifdef CONFIG_COMPAT 2624#ifdef CONFIG_COMPAT
2537 .compat_setsockopt = compat_tcp_setsockopt, 2625 .compat_setsockopt = compat_tcp_setsockopt,
2538 .compat_getsockopt = compat_tcp_getsockopt, 2626 .compat_getsockopt = compat_tcp_getsockopt,
2539#endif 2627#endif
2540}; 2628};
2629EXPORT_SYMBOL(tcp_prot);
2541 2630
2542 2631
2543static int __net_init tcp_sk_init(struct net *net) 2632static int __net_init tcp_sk_init(struct net *net)
@@ -2568,20 +2657,3 @@ void __init tcp_v4_init(void)
2568 if (register_pernet_subsys(&tcp_sk_ops)) 2657 if (register_pernet_subsys(&tcp_sk_ops))
2569 panic("Failed to create the TCP control socket.\n"); 2658 panic("Failed to create the TCP control socket.\n");
2570} 2659}
2571
2572EXPORT_SYMBOL(ipv4_specific);
2573EXPORT_SYMBOL(tcp_hashinfo);
2574EXPORT_SYMBOL(tcp_prot);
2575EXPORT_SYMBOL(tcp_v4_conn_request);
2576EXPORT_SYMBOL(tcp_v4_connect);
2577EXPORT_SYMBOL(tcp_v4_do_rcv);
2578EXPORT_SYMBOL(tcp_v4_remember_stamp);
2579EXPORT_SYMBOL(tcp_v4_send_check);
2580EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2581
2582#ifdef CONFIG_PROC_FS
2583EXPORT_SYMBOL(tcp_proc_register);
2584EXPORT_SYMBOL(tcp_proc_unregister);
2585#endif
2586EXPORT_SYMBOL(sysctl_tcp_low_latency);
2587
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 794c2e122a41..f25b56cb85cb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -47,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = {
47 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, 47 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
48 (unsigned long)&tcp_death_row), 48 (unsigned long)&tcp_death_row),
49}; 49};
50
51EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
52 51
53static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
@@ -262,6 +261,7 @@ kill:
262 inet_twsk_put(tw); 261 inet_twsk_put(tw);
263 return TCP_TW_SUCCESS; 262 return TCP_TW_SUCCESS;
264} 263}
264EXPORT_SYMBOL(tcp_timewait_state_process);
265 265
266/* 266/*
267 * Move a socket to time-wait or dead fin-wait-2 state. 267 * Move a socket to time-wait or dead fin-wait-2 state.
@@ -362,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk)
362 tcp_free_md5sig_pool(); 362 tcp_free_md5sig_pool();
363#endif 363#endif
364} 364}
365
366EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 365EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
367 366
368static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 367static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -510,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
510 } 509 }
511 return newsk; 510 return newsk;
512} 511}
512EXPORT_SYMBOL(tcp_create_openreq_child);
513 513
514/* 514/*
515 * Process an incoming packet for SYN_RECV sockets represented 515 * Process an incoming packet for SYN_RECV sockets represented
@@ -706,6 +706,7 @@ embryonic_reset:
706 inet_csk_reqsk_queue_drop(sk, req, prev); 706 inet_csk_reqsk_queue_drop(sk, req, prev);
707 return NULL; 707 return NULL;
708} 708}
709EXPORT_SYMBOL(tcp_check_req);
709 710
710/* 711/*
711 * Queue segment on the new socket if the new socket is active, 712 * Queue segment on the new socket if the new socket is active,
@@ -737,8 +738,4 @@ int tcp_child_process(struct sock *parent, struct sock *child,
737 sock_put(child); 738 sock_put(child);
738 return ret; 739 return ret;
739} 740}
740
741EXPORT_SYMBOL(tcp_check_req);
742EXPORT_SYMBOL(tcp_child_process); 741EXPORT_SYMBOL(tcp_child_process);
743EXPORT_SYMBOL(tcp_create_openreq_child);
744EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4ed957f201a..de3bd8458588 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -247,6 +247,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
247 /* Set the clamp no higher than max representable value */ 247 /* Set the clamp no higher than max representable value */
248 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); 248 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
249} 249}
250EXPORT_SYMBOL(tcp_select_initial_window);
250 251
251/* Chose a new window to advertise, update state in tcp_sock for the 252/* Chose a new window to advertise, update state in tcp_sock for the
252 * socket, and return result with RFC1323 scaling applied. The return 253 * socket, and return result with RFC1323 scaling applied. The return
@@ -294,9 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
294/* Packet ECN state for a SYN-ACK */ 295/* Packet ECN state for a SYN-ACK */
295static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) 296static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
296{ 297{
297 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; 298 TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
298 if (!(tp->ecn_flags & TCP_ECN_OK)) 299 if (!(tp->ecn_flags & TCP_ECN_OK))
299 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; 300 TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
300} 301}
301 302
302/* Packet ECN state for a SYN. */ 303/* Packet ECN state for a SYN. */
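The TCPCB_FLAG_* to TCPHDR_* rename that runs through the rest of this file is mechanical. The new names, as unified in the patched tree, carry the literal TCP header flag bits, which is what lets tcb->flags be OR'd directly into the header word in tcp_transmit_skb() further down:

#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80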
@@ -306,7 +307,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
306 307
307 tp->ecn_flags = 0; 308 tp->ecn_flags = 0;
308 if (sysctl_tcp_ecn == 1) { 309 if (sysctl_tcp_ecn == 1) {
309 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; 310 TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
310 tp->ecn_flags = TCP_ECN_OK; 311 tp->ecn_flags = TCP_ECN_OK;
311 } 312 }
312} 313}
@@ -361,7 +362,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
361 skb_shinfo(skb)->gso_type = 0; 362 skb_shinfo(skb)->gso_type = 0;
362 363
363 TCP_SKB_CB(skb)->seq = seq; 364 TCP_SKB_CB(skb)->seq = seq;
364 if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) 365 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
365 seq++; 366 seq++;
366 TCP_SKB_CB(skb)->end_seq = seq; 367 TCP_SKB_CB(skb)->end_seq = seq;
367} 368}
@@ -820,7 +821,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
820 tcb = TCP_SKB_CB(skb); 821 tcb = TCP_SKB_CB(skb);
821 memset(&opts, 0, sizeof(opts)); 822 memset(&opts, 0, sizeof(opts));
822 823
823 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) 824 if (unlikely(tcb->flags & TCPHDR_SYN))
824 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); 825 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
825 else 826 else
826 tcp_options_size = tcp_established_options(sk, skb, &opts, 827 tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -843,7 +844,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
843 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 844 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
844 tcb->flags); 845 tcb->flags);
845 846
846 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { 847 if (unlikely(tcb->flags & TCPHDR_SYN)) {
847 /* RFC1323: The window in SYN & SYN/ACK segments 848 /* RFC1323: The window in SYN & SYN/ACK segments
848 * is never scaled. 849 * is never scaled.
849 */ 850 */
@@ -866,7 +867,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
866 } 867 }
867 868
868 tcp_options_write((__be32 *)(th + 1), tp, &opts); 869 tcp_options_write((__be32 *)(th + 1), tp, &opts);
869 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) 870 if (likely((tcb->flags & TCPHDR_SYN) == 0))
870 TCP_ECN_send(sk, skb, tcp_header_size); 871 TCP_ECN_send(sk, skb, tcp_header_size);
871 872
872#ifdef CONFIG_TCP_MD5SIG 873#ifdef CONFIG_TCP_MD5SIG
@@ -880,7 +881,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
880 881
881 icsk->icsk_af_ops->send_check(sk, skb); 882 icsk->icsk_af_ops->send_check(sk, skb);
882 883
883 if (likely(tcb->flags & TCPCB_FLAG_ACK)) 884 if (likely(tcb->flags & TCPHDR_ACK))
884 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 885 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
885 886
886 if (skb->len != tcp_header_size) 887 if (skb->len != tcp_header_size)
@@ -1023,7 +1024,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1023 1024
1024 /* PSH and FIN should only be set in the second packet. */ 1025 /* PSH and FIN should only be set in the second packet. */
1025 flags = TCP_SKB_CB(skb)->flags; 1026 flags = TCP_SKB_CB(skb)->flags;
1026 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); 1027 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1027 TCP_SKB_CB(buff)->flags = flags; 1028 TCP_SKB_CB(buff)->flags = flags;
1028 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; 1029 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1029 1030
@@ -1189,6 +1190,7 @@ void tcp_mtup_init(struct sock *sk)
1189 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); 1190 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
1190 icsk->icsk_mtup.probe_size = 0; 1191 icsk->icsk_mtup.probe_size = 0;
1191} 1192}
1193EXPORT_SYMBOL(tcp_mtup_init);
1192 1194
1193/* This function synchronize snd mss to current pmtu/exthdr set. 1195/* This function synchronize snd mss to current pmtu/exthdr set.
1194 1196
@@ -1232,6 +1234,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1232 1234
1233 return mss_now; 1235 return mss_now;
1234} 1236}
1237EXPORT_SYMBOL(tcp_sync_mss);
1235 1238
1236/* Compute the current effective MSS, taking SACKs and IP options, 1239/* Compute the current effective MSS, taking SACKs and IP options,
1237 * and even PMTU discovery events into account. 1240 * and even PMTU discovery events into account.
@@ -1328,8 +1331,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1328 u32 in_flight, cwnd; 1331 u32 in_flight, cwnd;
1329 1332
1330 /* Don't be strict about the congestion window for the final FIN. */ 1333 /* Don't be strict about the congestion window for the final FIN. */
1331 if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && 1334 if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
1332 tcp_skb_pcount(skb) == 1)
1333 return 1; 1335 return 1;
1334 1336
1335 in_flight = tcp_packets_in_flight(tp); 1337 in_flight = tcp_packets_in_flight(tp);
@@ -1398,7 +1400,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
1398 * Nagle can be ignored during F-RTO too (see RFC4138). 1400 * Nagle can be ignored during F-RTO too (see RFC4138).
1399 */ 1401 */
1400 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || 1402 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1401 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) 1403 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
1402 return 1; 1404 return 1;
1403 1405
1404 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) 1406 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1461,7 +1463,7 @@ int tcp_may_send_now(struct sock *sk)
1461 * packet has never been sent out before (and thus is not cloned). 1463 * packet has never been sent out before (and thus is not cloned).
1462 */ 1464 */
1463static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, 1465static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1464 unsigned int mss_now) 1466 unsigned int mss_now, gfp_t gfp)
1465{ 1467{
1466 struct sk_buff *buff; 1468 struct sk_buff *buff;
1467 int nlen = skb->len - len; 1469 int nlen = skb->len - len;
@@ -1471,7 +1473,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1471 if (skb->len != skb->data_len) 1473 if (skb->len != skb->data_len)
1472 return tcp_fragment(sk, skb, len, mss_now); 1474 return tcp_fragment(sk, skb, len, mss_now);
1473 1475
1474 buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); 1476 buff = sk_stream_alloc_skb(sk, 0, gfp);
1475 if (unlikely(buff == NULL)) 1477 if (unlikely(buff == NULL))
1476 return -ENOMEM; 1478 return -ENOMEM;
1477 1479
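tso_fragment() now takes the caller's gfp_t instead of hardcoding GFP_ATOMIC, so a process-context caller can permit a less constrained allocation; tcp_write_xmit() threads its gfp value through (see the hunk further down). The shape of the change, sketched with an invented alloc_mode standing in for gfp_t:

#include <stdlib.h>

typedef enum {
	ALLOC_ATOMIC,	/* must not sleep: softirq/timer context */
	ALLOC_KERNEL,	/* may sleep and reclaim: process context */
} alloc_mode;

static void *alloc_segment(size_t len, alloc_mode mode)
{
	(void)mode;	/* a real allocator would branch on this */
	return malloc(len);
}

/* Before: alloc_segment(len, ALLOC_ATOMIC) was hardwired here.
 * After: the caller, which knows its context, picks the mode.
 */
static void *fragment_segment(size_t len, alloc_mode mode)
{
	return alloc_segment(len, mode);
}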
@@ -1487,7 +1489,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1487 1489
1488 /* PSH and FIN should only be set in the second packet. */ 1490 /* PSH and FIN should only be set in the second packet. */
1489 flags = TCP_SKB_CB(skb)->flags; 1491 flags = TCP_SKB_CB(skb)->flags;
1490 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); 1492 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1491 TCP_SKB_CB(buff)->flags = flags; 1493 TCP_SKB_CB(buff)->flags = flags;
1492 1494
1493 /* This packet was never sent out yet, so no SACK bits. */ 1495 /* This packet was never sent out yet, so no SACK bits. */
@@ -1518,7 +1520,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1518 const struct inet_connection_sock *icsk = inet_csk(sk); 1520 const struct inet_connection_sock *icsk = inet_csk(sk);
1519 u32 send_win, cong_win, limit, in_flight; 1521 u32 send_win, cong_win, limit, in_flight;
1520 1522
1521 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) 1523 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
1522 goto send_now; 1524 goto send_now;
1523 1525
1524 if (icsk->icsk_ca_state != TCP_CA_Open) 1526 if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1644,7 +1646,7 @@ static int tcp_mtu_probe(struct sock *sk)
1644 1646
1645 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1647 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1646 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1648 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1647 TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; 1649 TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
1648 TCP_SKB_CB(nskb)->sacked = 0; 1650 TCP_SKB_CB(nskb)->sacked = 0;
1649 nskb->csum = 0; 1651 nskb->csum = 0;
1650 nskb->ip_summed = skb->ip_summed; 1652 nskb->ip_summed = skb->ip_summed;
@@ -1669,7 +1671,7 @@ static int tcp_mtu_probe(struct sock *sk)
1669 sk_wmem_free_skb(sk, skb); 1671 sk_wmem_free_skb(sk, skb);
1670 } else { 1672 } else {
1671 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & 1673 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1672 ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); 1674 ~(TCPHDR_FIN|TCPHDR_PSH);
1673 if (!skb_shinfo(skb)->nr_frags) { 1675 if (!skb_shinfo(skb)->nr_frags) {
1674 skb_pull(skb, copy); 1676 skb_pull(skb, copy);
1675 if (skb->ip_summed != CHECKSUM_PARTIAL) 1677 if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1769,7 +1771,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1769 cwnd_quota); 1771 cwnd_quota);
1770 1772
1771 if (skb->len > limit && 1773 if (skb->len > limit &&
1772 unlikely(tso_fragment(sk, skb, limit, mss_now))) 1774 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
1773 break; 1775 break;
1774 1776
1775 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1777 TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2020,7 +2022,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2020 2022
2021 if (!sysctl_tcp_retrans_collapse) 2023 if (!sysctl_tcp_retrans_collapse)
2022 return; 2024 return;
2023 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) 2025 if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
2024 return; 2026 return;
2025 2027
2026 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2028 tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2112,7 +2114,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2112 * since it is cheap to do so and saves bytes on the network. 2114 * since it is cheap to do so and saves bytes on the network.
2113 */ 2115 */
2114 if (skb->len > 0 && 2116 if (skb->len > 0 &&
2115 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && 2117 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
2116 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { 2118 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
2117 if (!pskb_trim(skb, 0)) { 2119 if (!pskb_trim(skb, 0)) {
2118 /* Reuse, even though it does some unnecessary work */ 2120 /* Reuse, even though it does some unnecessary work */
@@ -2208,6 +2210,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2208 int mib_idx; 2210 int mib_idx;
2209 int fwd_rexmitting = 0; 2211 int fwd_rexmitting = 0;
2210 2212
2213 if (!tp->packets_out)
2214 return;
2215
2211 if (!tp->lost_out) 2216 if (!tp->lost_out)
2212 tp->retransmit_high = tp->snd_una; 2217 tp->retransmit_high = tp->snd_una;
2213 2218
@@ -2301,7 +2306,7 @@ void tcp_send_fin(struct sock *sk)
2301 mss_now = tcp_current_mss(sk); 2306 mss_now = tcp_current_mss(sk);
2302 2307
2303 if (tcp_send_head(sk) != NULL) { 2308 if (tcp_send_head(sk) != NULL) {
2304 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; 2309 TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
2305 TCP_SKB_CB(skb)->end_seq++; 2310 TCP_SKB_CB(skb)->end_seq++;
2306 tp->write_seq++; 2311 tp->write_seq++;
2307 } else { 2312 } else {
@@ -2318,7 +2323,7 @@ void tcp_send_fin(struct sock *sk)
2318 skb_reserve(skb, MAX_TCP_HEADER); 2323 skb_reserve(skb, MAX_TCP_HEADER);
2319 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 2324 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2320 tcp_init_nondata_skb(skb, tp->write_seq, 2325 tcp_init_nondata_skb(skb, tp->write_seq,
2321 TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); 2326 TCPHDR_ACK | TCPHDR_FIN);
2322 tcp_queue_skb(sk, skb); 2327 tcp_queue_skb(sk, skb);
2323 } 2328 }
2324 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); 2329 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2343,7 +2348,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2343 /* Reserve space for headers and prepare control bits. */ 2348 /* Reserve space for headers and prepare control bits. */
2344 skb_reserve(skb, MAX_TCP_HEADER); 2349 skb_reserve(skb, MAX_TCP_HEADER);
2345 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), 2350 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2346 TCPCB_FLAG_ACK | TCPCB_FLAG_RST); 2351 TCPHDR_ACK | TCPHDR_RST);
2347 /* Send it off. */ 2352 /* Send it off. */
2348 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2353 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2349 if (tcp_transmit_skb(sk, skb, 0, priority)) 2354 if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2363,11 +2368,11 @@ int tcp_send_synack(struct sock *sk)
2363 struct sk_buff *skb; 2368 struct sk_buff *skb;
2364 2369
2365 skb = tcp_write_queue_head(sk); 2370 skb = tcp_write_queue_head(sk);
2366 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { 2371 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
2367 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); 2372 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2368 return -EFAULT; 2373 return -EFAULT;
2369 } 2374 }
2370 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { 2375 if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
2371 if (skb_cloned(skb)) { 2376 if (skb_cloned(skb)) {
2372 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2377 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2373 if (nskb == NULL) 2378 if (nskb == NULL)
@@ -2381,7 +2386,7 @@ int tcp_send_synack(struct sock *sk)
2381 skb = nskb; 2386 skb = nskb;
2382 } 2387 }
2383 2388
2384 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; 2389 TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
2385 TCP_ECN_send_synack(tcp_sk(sk), skb); 2390 TCP_ECN_send_synack(tcp_sk(sk), skb);
2386 } 2391 }
2387 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2392 TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2460,7 +2465,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2460 * not even correctly set) 2465 * not even correctly set)
2461 */ 2466 */
2462 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2467 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2463 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); 2468 TCPHDR_SYN | TCPHDR_ACK);
2464 2469
2465 if (OPTION_COOKIE_EXTENSION & opts.options) { 2470 if (OPTION_COOKIE_EXTENSION & opts.options) {
2466 if (s_data_desired) { 2471 if (s_data_desired) {
@@ -2515,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2515 2520
2516 return skb; 2521 return skb;
2517} 2522}
2523EXPORT_SYMBOL(tcp_make_synack);
2518 2524
2519/* Do all connect socket setups that can be done AF independent. */ 2525/* Do all connect socket setups that can be done AF independent. */
2520static void tcp_connect_init(struct sock *sk) 2526static void tcp_connect_init(struct sock *sk)
@@ -2592,7 +2598,7 @@ int tcp_connect(struct sock *sk)
2592 skb_reserve(buff, MAX_TCP_HEADER); 2598 skb_reserve(buff, MAX_TCP_HEADER);
2593 2599
2594 tp->snd_nxt = tp->write_seq; 2600 tp->snd_nxt = tp->write_seq;
2595 tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); 2601 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2596 TCP_ECN_send_syn(sk, buff); 2602 TCP_ECN_send_syn(sk, buff);
2597 2603
2598 /* Send it off. */ 2604 /* Send it off. */
@@ -2617,6 +2623,7 @@ int tcp_connect(struct sock *sk)
2617 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); 2623 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2618 return 0; 2624 return 0;
2619} 2625}
2626EXPORT_SYMBOL(tcp_connect);
2620 2627
2621/* Send out a delayed ack, the caller does the policy checking 2628/* Send out a delayed ack, the caller does the policy checking
2622 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() 2629 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2698,7 +2705,7 @@ void tcp_send_ack(struct sock *sk)
2698 2705
2699 /* Reserve space for headers and prepare control bits. */ 2706 /* Reserve space for headers and prepare control bits. */
2700 skb_reserve(buff, MAX_TCP_HEADER); 2707 skb_reserve(buff, MAX_TCP_HEADER);
2701 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); 2708 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
2702 2709
2703 /* Send it off, this clears delayed acks for us. */ 2710 /* Send it off, this clears delayed acks for us. */
2704 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2711 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2732,7 +2739,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2732 * end to send an ack. Don't queue or clone SKB, just 2739 * end to send an ack. Don't queue or clone SKB, just
2733 * send it. 2740 * send it.
2734 */ 2741 */
2735 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); 2742 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
2736 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2743 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2737 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2744 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2738} 2745}
@@ -2762,13 +2769,13 @@ int tcp_write_wakeup(struct sock *sk)
2762 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 2769 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2763 skb->len > mss) { 2770 skb->len > mss) {
2764 seg_size = min(seg_size, mss); 2771 seg_size = min(seg_size, mss);
2765 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2772 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2766 if (tcp_fragment(sk, skb, seg_size, mss)) 2773 if (tcp_fragment(sk, skb, seg_size, mss))
2767 return -1; 2774 return -1;
2768 } else if (!tcp_skb_pcount(skb)) 2775 } else if (!tcp_skb_pcount(skb))
2769 tcp_set_skb_tso_segs(sk, skb, mss); 2776 tcp_set_skb_tso_segs(sk, skb, mss);
2770 2777
2771 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2778 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2772 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2779 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2773 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2780 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2774 if (!err) 2781 if (!err)
@@ -2821,10 +2828,3 @@ void tcp_send_probe0(struct sock *sk)
2821 TCP_RTO_MAX); 2828 TCP_RTO_MAX);
2822 } 2829 }
2823} 2830}
2824
2825EXPORT_SYMBOL(tcp_select_initial_window);
2826EXPORT_SYMBOL(tcp_connect);
2827EXPORT_SYMBOL(tcp_make_synack);
2828EXPORT_SYMBOL(tcp_simple_retransmit);
2829EXPORT_SYMBOL(tcp_sync_mss);
2830EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 440a5c6004f6..808bb920c9f5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -41,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, 41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
42 &tcp_keepalive_timer); 42 &tcp_keepalive_timer);
43} 43}
44
45EXPORT_SYMBOL(tcp_init_xmit_timers); 44EXPORT_SYMBOL(tcp_init_xmit_timers);
46 45
47static void tcp_write_err(struct sock *sk) 46static void tcp_write_err(struct sock *sk)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3b3813cc80b9..59186ca7808a 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -48,7 +48,6 @@ err:
48 48
49 return ret; 49 return ret;
50} 50}
51
52EXPORT_SYMBOL(xfrm4_tunnel_register); 51EXPORT_SYMBOL(xfrm4_tunnel_register);
53 52
54int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
@@ -72,7 +71,6 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
72 71
73 return ret; 72 return ret;
74} 73}
75
76EXPORT_SYMBOL(xfrm4_tunnel_deregister); 74EXPORT_SYMBOL(xfrm4_tunnel_deregister);
77 75
78static int tunnel4_rcv(struct sk_buff *skb) 76static int tunnel4_rcv(struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9de6a698f91d..32e0bef60d0a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -633,9 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
633 if (!inet->recverr) { 633 if (!inet->recverr) {
634 if (!harderr || sk->sk_state != TCP_ESTABLISHED) 634 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
635 goto out; 635 goto out;
636 } else { 636 } else
637 ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); 637 ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
638 } 638
639 sk->sk_err = err; 639 sk->sk_err = err;
640 sk->sk_error_report(sk); 640 sk->sk_error_report(sk);
641out: 641out:
@@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
914 !sock_flag(sk, SOCK_BROADCAST)) 914 !sock_flag(sk, SOCK_BROADCAST))
915 goto out; 915 goto out;
916 if (connected) 916 if (connected)
917 sk_dst_set(sk, dst_clone(&rt->u.dst)); 917 sk_dst_set(sk, dst_clone(&rt->dst));
918 } 918 }
919 919
920 if (msg->msg_flags&MSG_CONFIRM) 920 if (msg->msg_flags&MSG_CONFIRM)
@@ -978,7 +978,7 @@ out:
978 return err; 978 return err;
979 979
980do_confirm: 980do_confirm:
981 dst_confirm(&rt->u.dst); 981 dst_confirm(&rt->dst);
982 if (!(msg->msg_flags&MSG_PROBE) || len) 982 if (!(msg->msg_flags&MSG_PROBE) || len)
983 goto back_from_confirm; 983 goto back_from_confirm;
984 err = 0; 984 err = 0;
@@ -1063,10 +1063,11 @@ static unsigned int first_packet_length(struct sock *sk)
1063 spin_unlock_bh(&rcvq->lock); 1063 spin_unlock_bh(&rcvq->lock);
1064 1064
1065 if (!skb_queue_empty(&list_kill)) { 1065 if (!skb_queue_empty(&list_kill)) {
1066 lock_sock_bh(sk); 1066 bool slow = lock_sock_fast(sk);
1067
1067 __skb_queue_purge(&list_kill); 1068 __skb_queue_purge(&list_kill);
1068 sk_mem_reclaim_partial(sk); 1069 sk_mem_reclaim_partial(sk);
1069 unlock_sock_bh(sk); 1070 unlock_sock_fast(sk, slow);
1070 } 1071 }
1071 return res; 1072 return res;
1072} 1073}
@@ -1123,6 +1124,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1123 int peeked; 1124 int peeked;
1124 int err; 1125 int err;
1125 int is_udplite = IS_UDPLITE(sk); 1126 int is_udplite = IS_UDPLITE(sk);
1127 bool slow;
1126 1128
1127 /* 1129 /*
1128 * Check any passed addresses 1130 * Check any passed addresses
@@ -1197,10 +1199,10 @@ out:
1197 return err; 1199 return err;
1198 1200
1199csum_copy_err: 1201csum_copy_err:
1200 lock_sock_bh(sk); 1202 slow = lock_sock_fast(sk);
1201 if (!skb_kill_datagram(sk, skb, flags)) 1203 if (!skb_kill_datagram(sk, skb, flags))
1202 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1204 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1203 unlock_sock_bh(sk); 1205 unlock_sock_fast(sk, slow);
1204 1206
1205 if (noblock) 1207 if (noblock)
1206 return -EAGAIN; 1208 return -EAGAIN;
@@ -1625,9 +1627,9 @@ int udp_rcv(struct sk_buff *skb)
1625 1627
1626void udp_destroy_sock(struct sock *sk) 1628void udp_destroy_sock(struct sock *sk)
1627{ 1629{
1628 lock_sock_bh(sk); 1630 bool slow = lock_sock_fast(sk);
1629 udp_flush_pending_frames(sk); 1631 udp_flush_pending_frames(sk);
1630 unlock_sock_bh(sk); 1632 unlock_sock_fast(sk, slow);
1631} 1633}
1632 1634
1633/* 1635/*
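The lock_sock_bh()/unlock_sock_bh() pairs in this file become lock_sock_fast()/unlock_sock_fast(): the fast variant keeps the cheap bottom-half lock when the socket is not owned by user context, falls back to the full lock_sock() when it is, and returns which case it took so the caller can release in the matching mode. Every converted call site follows the pattern of udp_destroy_sock() above:

	bool slow = lock_sock_fast(sk);	/* true: fell back to lock_sock() */

	/* ... short critical section that must not sleep ... */

	unlock_sock_fast(sk, slow);	/* undo exactly what was taken */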
@@ -1686,8 +1688,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1686 return -ENOPROTOOPT; 1688 return -ENOPROTOOPT;
1687 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ 1689 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
1688 val = 8; 1690 val = 8;
1689 else if (val > USHORT_MAX) 1691 else if (val > USHRT_MAX)
1690 val = USHORT_MAX; 1692 val = USHRT_MAX;
1691 up->pcslen = val; 1693 up->pcslen = val;
1692 up->pcflag |= UDPLITE_SEND_CC; 1694 up->pcflag |= UDPLITE_SEND_CC;
1693 break; 1695 break;
@@ -1700,8 +1702,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1700 return -ENOPROTOOPT; 1702 return -ENOPROTOOPT;
1701 if (val != 0 && val < 8) /* Avoid silly minimal values. */ 1703 if (val != 0 && val < 8) /* Avoid silly minimal values. */
1702 val = 8; 1704 val = 8;
1703 else if (val > USHORT_MAX) 1705 else if (val > USHRT_MAX)
1704 val = USHORT_MAX; 1706 val = USHRT_MAX;
1705 up->pcrlen = val; 1707 up->pcrlen = val;
1706 up->pcflag |= UDPLITE_RECV_CC; 1708 up->pcflag |= UDPLITE_RECV_CC;
1707 break; 1709 break;
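USHORT_MAX becomes USHRT_MAX, a tree-wide rename to the standard <limits.h> spelling; the clamp itself is unchanged. Standalone, the coverage sanitization applied to both pcslen and pcrlen reads:

#include <limits.h>

/* UDP-Lite checksum coverage travels in a 16-bit header field, hence
 * the USHRT_MAX ceiling; nonzero values below 8 are rounded up to the
 * 8-byte minimum that covers the UDP-Lite header itself.
 */
static int sanitize_coverage(int val)
{
	if (val != 0 && val < 8)
		return 8;
	if (val > USHRT_MAX)
		return USHRT_MAX;
	return val;
}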
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 6610bf76369f..ab76aa928fa9 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -58,6 +58,7 @@ struct proto udplite_prot = {
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60}; 60};
61EXPORT_SYMBOL(udplite_prot);
61 62
62static struct inet_protosw udplite4_protosw = { 63static struct inet_protosw udplite4_protosw = {
63 .type = SOCK_DGRAM, 64 .type = SOCK_DGRAM,
@@ -127,5 +128,3 @@ out_unregister_proto:
127out_register_err: 128out_register_err:
128 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); 129 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
129} 130}
130
131EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index ad8fbb871aa0..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -163,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
163{ 163{
164 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); 164 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
165} 165}
166
167EXPORT_SYMBOL(xfrm4_rcv); 166EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 1705476670ef..869078d4eeb9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
37 fl.fl4_src = saddr->a4; 37 fl.fl4_src = saddr->a4;
38 38
39 err = __ip_route_output_key(net, &rt, &fl); 39 err = __ip_route_output_key(net, &rt, &fl);
40 dst = &rt->u.dst; 40 dst = &rt->dst;
41 if (err) 41 if (err)
42 dst = ERR_PTR(err); 42 dst = ERR_PTR(err);
43 return dst; 43 return dst;
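The _decode_session4() hunk below copies skb->mark into the flow key before any per-protocol selectors are filled in, so xfrm policy lookups can now distinguish flows that differ only in netfilter mark. A toy flow key (invented types) showing the mark take part in matching:

#include <stdint.h>

struct toy_flow {		/* invented miniature of struct flowi */
	uint32_t saddr;
	uint32_t daddr;
	uint32_t mark;		/* now filled from skb->mark */
};

static int toy_flow_match(const struct toy_flow *a, const struct toy_flow *b)
{
	return a->saddr == b->saddr &&
	       a->daddr == b->daddr &&
	       a->mark == b->mark;	/* different marks, different policy */
}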
@@ -108,6 +108,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
108 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 108 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
109 109
110 memset(fl, 0, sizeof(struct flowi)); 110 memset(fl, 0, sizeof(struct flowi));
111 fl->mark = skb->mark;
112
111 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 113 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
112 switch (iph->protocol) { 114 switch (iph->protocol) {
113 case IPPROTO_UDP: 115 case IPPROTO_UDP: